In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
%matplotlib inline
# Seed the random number generators so that reruns give the same answers (e.g. for the quizzes).
# The simulations in this notebook draw from NumPy's global generator (np.random.*),
# which random.seed() does not affect, so NumPy has to be seeded as well.
random.seed(42)
np.random.seed(42)

In [None]:
# the number of rows equals the length of the frame's index
row_num = len(df.index)
print("Number of rows is: {}".format(row_num))

In [None]:
# Count distinct user ids directly on the 'user_id' column.
# Calling nunique() on the whole frame would compute distinct counts for every
# column only to keep a single one of them.
user_total = df['user_id'].nunique()
print("Number of unique users is : {}".format(user_total))

In [None]:
# 'converted' holds 1/0 flags, so its mean is the share of converted users;
# scale by 100 to report it as a percentage
converted_pct = df['converted'].mean() * 100
print("Converted users proportion is {}%".format(converted_pct))

In [None]:
# alternate method to find the proportion of converted users:
# total conversions (vectorized sum of the 1/0 column) divided by the number of rows
df['converted'].sum() / row_num

In [None]:
# boolean masks describing which group a row belongs to and which page it landed on
in_treatment = df['group'] == 'treatment'
in_control = df['group'] == 'control'
saw_old_page = df['landing_page'] == 'old_page'
saw_new_page = df['landing_page'] == 'new_page'

# treatment-group rows that were served the old page
mismatch_grp1 = df[in_treatment & saw_old_page]
n_mismatch_treatment = len(mismatch_grp1)
print("Times treatment group user lands incorrectly on old_page is {}".format(n_mismatch_treatment))

# control-group rows that were served the new page
mismatch_grp2 = df[in_control & saw_new_page]
n_mismatch_control = len(mismatch_grp2)
print("Times control group user incorrectly lands on new_page is {}".format(n_mismatch_control))

# total number of rows where group and landing page don't line up
print("Times new_page and treatment don't line up is {}".format(n_mismatch_treatment + n_mismatch_control))

In [None]:
# info() prints, for each column, its dtype and its count of non-null entries;
# a non-null count lower than the total number of entries means that column has missing values
df.info()

In [None]:
# Delete Rows
# collect both kinds of mismatch (treatment shown old_page, control shown new_page)
# in a single query, then remove them from df in one in-place drop
mismatched_rows = df.query(
    "(group == 'treatment' and landing_page == 'old_page') or "
    "(group == 'control' and landing_page == 'new_page')"
)
df.drop(mismatched_rows.index, inplace=True)

In [None]:
# save the cleaned dataset (rows with mismatched group/landing_page were dropped above)
# we will use this dataset in next sections
# index=False keeps the row index out of the written file
# NOTE(review): no de-duplication step is visible before this save, and a later cell still
# checks user_id for duplicates - confirm whether the saved file is really duplicate-free
df.to_csv('ab_edited.csv', index=False)

In [None]:
# load the cleaned dataset written in the previous step into a separate dataframe
df2 = pd.read_csv('ab_edited.csv')

# Double check: "group is treatment" and "landing page is new_page" must agree on every row;
# count the rows where they disagree - this should be 0
is_treatment = df2['group'] == 'treatment'
is_new_page = df2['landing_page'] == 'new_page'
len(df2[is_treatment != is_new_page])

In [None]:
# number of distinct user ids (dropna=False so a missing id, if any, is counted like unique() would)
df2['user_id'].nunique(dropna=False)

In [None]:
# count user_id values that repeat an earlier row
# (the number of rows exceeds the number of unique ids, so at least one id is repeated)
int(df2['user_id'].duplicated().sum())

In [None]:
# show the user_id of every row whose id occurs more than once
# (keep=False flags all occurrences, not only the later ones)
repeated_id = df2['user_id'].duplicated(keep=False)
df2.loc[repeated_id, 'user_id']

In [None]:
# since values are 1 and 0, the mean of 'converted' is the probability of an individual
# converting, regardless of which page they received
# NOTE(review): this reads df (modified in place by the earlier row drops), not df2 - confirm
# which frame is intended for this section
df['converted'].mean()

In [None]:
# group the rows by the 'group' column (control / treatment)
# then compute summary statistics per group with describe()
# as conversions are stored as 1/0 values, the 'mean' reported for 'converted'
# is the probability of conversion within each group

df_grp = df.groupby('group')
df_grp.describe()

In [None]:
# count the treatment-group rows; this is taken as the number of individuals who got the new page
# (presumably valid because mismatched group/page rows were dropped from df earlier - verify)
new_user = int((df['group'] == 'treatment').sum())

# total number of rows in the dataset
users = len(df)

# probability that an individual received the new page
new_user_p = new_user / users
print(new_user_p)
0.5000636646764286

In [None]:
Evidence that one page leads to more conversions?

Given that an individual was in the treatment group, the probability they converted is 0.118807
Given that an individual was in the control group, the probability they converted is 0.120386
We find that old page does better, but by a very tiny margin.
Change aversion, test duration, and other potentially influencing factors are not accounted for, so we cannot state with certainty that one page leads to more conversions. This caution is all the more important because both pages perform almost identically.

In [None]:
# conversion rate used for the new page in the simulation: the overall mean of 'converted'
# across all rows of df2, irrespective of group
# (presumably because the simulation assumes the null hypothesis of equal rates - confirm)
p_new = df2['converted'].mean()
print(p_new)
0.11959708724499628

In [None]:
# conversion rate used for the old page in the simulation: the overall mean of 'converted'
# across all rows of df2, irrespective of group (the same expression that defines p_new)
p_old = df2['converted'].mean()
print(p_old)
0.11959708724499628

In [None]:
# number of rows in the treatment group
n_new = int((df2['group'] == 'treatment').sum())
print(n_new)
145310

In [None]:
# number of rows in the control group
n_old = int((df2['group'] == 'control').sum())
print(n_old)
145274

In [None]:
# simulate n_new conversion outcomes: 1 with probability p_new, 0 otherwise
new_page_probs = [p_new, 1 - p_new]
new_page_converted = np.random.choice([1, 0], size=n_new, p=new_page_probs)

In [None]:
# simulate n_old conversion outcomes: 1 with probability p_old, 0 otherwise
old_page_probs = [p_old, 1 - p_old]
old_page_converted = np.random.choice([1, 0], size=n_old, p=old_page_probs)

In [None]:
# new_page_converted and old_page_converted have different sizes (n_new vs n_old),
# so they cannot be combined element by element as they are.
# The difference is only a few dozen values out of thousands, so the excess in
# new_page_converted is truncated; the cut-off is n_old itself rather than a
# hard-coded copy of its value, so it stays correct if the data changes.
new_page_converted = new_page_converted[:n_old]

In [None]:
# Difference between the simulated conversion rates of the new and old page.
# Each array holds 1/0 outcomes, so its mean is the simulated conversion rate;
# subtracting the two means gives a single number, independent of the array lengths.
# (Dividing the raw 1/0 arrays by n and subtracting them element by element would
# only yield an array of scaled outcomes, not a difference in proportions.)
p_diff = new_page_converted.mean() - old_page_converted.mean()

In [None]:
# Simulate 10,000 values of (new-page conversion rate - old-page conversion rate).
# The number of conversions among n independent 1/0 draws with success probability p
# follows Binomial(n, p), so each simulated rate is one binomial draw divided by n.
# Drawing all repetitions in one vectorized call avoids building 20,000 large
# outcome arrays in a Python loop.
# Results are kept in their own names so the new_page_converted / old_page_converted
# arrays from the earlier cells are not overwritten.
n_simulations = 10000
sim_new_rates = np.random.binomial(n_new, p_new, size=n_simulations) / n_new
sim_old_rates = np.random.binomial(n_old, p_old, size=n_simulations) / n_old

# p_diffs stays a plain list, one simulated difference per repetition
p_diffs = list(sim_new_rates - sim_old_rates)

In [None]:
# histogram of the simulated differences, drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.hist(p_diffs)
ax.set_xlabel('p_diffs')
ax.set_ylabel('Frequency')
ax.set_title('Plot of 10K simulated p_diffs');

In [None]:
# observed difference in conversion rate: treatment mean minus control mean, computed from df
# NOTE(review): df was modified in place earlier (mismatched rows dropped), so it no longer
# matches the raw ab_data.csv - confirm whether df or df2 is intended here
treatment_rate = df.loc[df['group'] == 'treatment', 'converted'].mean()
control_rate = df.loc[df['group'] == 'control', 'converted'].mean()
act_diff = treatment_rate - control_rate
act_diff
Out[35]:
-0.0015790565976871451

In [None]:
# convert the simulated differences to a NumPy array so that comparisons against a
# scalar (as in the next cell) are evaluated element-wise
p_diffs = np.array(p_diffs)
p_diffs

In [None]:
# share of simulated differences that exceed the observed difference act_diff:
# the comparison gives a True/False array and its mean is the proportion of True values
np.mean(p_diffs > act_diff)
Out[37]:
0.90290000000000004

k. In words, explain what you just computed in part j.. What is this value called in scientific studies? What does this value mean in terms of whether or not there is a difference between the new and old pages?

Answer:
We are computing p-values here.
As explained in the videos and quizzes, this is the probability of observing our statistic (or one more extreme in favor of the alternative) if the null hypothesis is true.
The more extreme in favor of the alternative portion of this statement determines the shading associated with your p-value.
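Here, we find no conversion advantage for the new page. With a p-value of about 0.90, far above the 0.05 significance level, we fail to reject the null hypothesis: the data give no evidence that the new page converts better, and the old and new pages perform almost identically. The old page, as the numbers show, performed slightly better.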

In [None]:
# 'converted' flags of each group, selected once and reused for both the
# conversion totals and the group sizes
control_flags = df2.loc[df2['group'] == 'control', 'converted']
treatment_flags = df2.loc[df2['group'] == 'treatment', 'converted']

# number of conversions per group
convert_old = int(control_flags.sum())
convert_new = int(treatment_flags.sum())

# number of rows per group
n_old = len(control_flags)
n_new = len(treatment_flags)

In [None]:
# two-sample z-test on proportions, with the old page listed first and the new page second;
# alternative='smaller' therefore tests whether the old-page rate is smaller than the new-page rate
# NOTE(review): `sm` is not imported in the visible cells - presumably statsmodels.api; confirm
conversion_counts = [convert_old, convert_new]
group_sizes = [n_old, n_new]
z_score, p_value = sm.stats.proportions_ztest(conversion_counts, group_sizes, alternative='smaller')
print(z_score, p_value)
1.31092419842 0.905058312759

In [None]:
# Significance level of the single-sided test (95% confidence level, as specified in PartII.1)
alpha = 0.05

# Standard-normal cumulative probability at the observed z-score:
# tells us how significant our z-score is
print(norm.cdf(z_score))

# Critical value of the single-sided test at the chosen confidence level:
# the z-score must exceed this value for the result to be significant
print(norm.ppf(1 - alpha))
0.905058312759
1.64485362695

In [None]:
Answer:

We find that the z-score of 1.31092419842 is less than the critical value of 1.64485362695. So, we fail to reject the null hypothesis.
As regards the conversion rates of the old and new pages, we find that old pages are only minutely better than new pages.
These values agree with the findings in parts j. and k.