In [129]:
import pandas as pd
# Load the raw A/B test records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='ab_data.csv')

In [130]:
# Preview the first five rows to see the columns and value formats.
df.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [131]:
# Total number of rows in the raw data.
df.shape[0]

294478

In [132]:
# Number of distinct users; fewer than the row count means some users appear more than once.
df.user_id.nunique()

290584

In [133]:
# Mean of the `converted` column over all raw rows (before any cleaning).
df['converted'].mean()


0.11965919355605512

In [134]:
# Count missing values per column.
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [136]:
# How many rows were served the new page (True) versus any other page (False).
df['landing_page'].eq('new_page').value_counts()

True     147239
False    147239
Name: landing_page, dtype: int64

In [137]:
# How many rows belong to the treatment group (True) versus any other group (False).
df['group'].eq('treatment').value_counts()

True     147276
False    147202
Name: group, dtype: int64

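The two counts above do not match (147,239 rows were shown the new page, but 147,276 rows are in the treatment group), so there are clearly rows where either the page (old page or new page) or the group (control vs. treatment) is mislabelled.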

In [138]:
# Count rows where "is treatment" and "saw new page" disagree,
# i.e. treatment rows without new_page or non-treatment rows with new_page.
len(df.loc[df['group'].eq('treatment') != df['landing_page'].eq('new_page')])

3893

In [139]:
# Index labels of the rows whose group and landing page disagree.
df.loc[df['group'].eq('treatment') != df['landing_page'].eq('new_page')].index

Int64Index([    22,    240,    308,    327,    357,    490,    685,    713,
               776,    846,
            ...
            293817, 293888, 293894, 293917, 293996, 294014, 294200, 294252,
            294253, 294331],
           dtype='int64', length=3893)

In [140]:
# Index labels of the group/page-mismatched rows, kept for removal in the next step.
to_drop = df.loc[df['group'].eq('treatment') != df['landing_page'].eq('new_page')].index

In [141]:
# Remove the mismatched rows by index label; `df` is rebound to the cleaned frame.
df = df.drop(index=to_drop)

In [142]:
# Sanity check: after the drop, no row should have a group/page mismatch,
# so this selection is expected to display an empty frame.
df.loc[df['group'].eq('treatment') != df['landing_page'].eq('new_page')]


Unnamed: 0,user_id,timestamp,group,landing_page,converted


In [143]:
# Preview the first five rows of the frame after the mismatched rows were removed.
df.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [164]:
# Tally repeated user_ids: True marks every occurrence after the first.
df['user_id'].duplicated(keep='first').value_counts()

False    290584
Name: user_id, dtype: int64

In [156]:
# Show every row whose user_id occurs more than once (keep=False flags all occurrences).
df.loc[df['user_id'].duplicated(keep=False)]


Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [163]:
# Keep only the first row for each user_id and discard later repeats.
# Rebinding `df` (instead of dropping in place) keeps the cell free of
# in-place mutation and safe to re-run.
df = df.drop_duplicates(subset='user_id', keep='first')

# Guard: every user must now appear exactly once.
assert df['user_id'].is_unique


In [162]:
# Overall conversion probability, ignoring which page was shown:
# converted rows divided by all rows.
df[df['converted'].eq(1)].shape[0] / df.shape[0]

0.11959708724499628

In [167]:
# Conversion probability within the control group:
# converted control rows divided by all control rows.
df[df['converted'].eq(1) & df['group'].eq('control')].shape[0] / df[df['group'].eq('control')].shape[0]

0.1203863045004612

In [168]:
# Conversion probability within the treatment group:
# converted treatment rows divided by all treatment rows.
df[df['converted'].eq(1) & df['group'].eq('treatment')].shape[0] / df[df['group'].eq('treatment')].shape[0]

0.11880806551510564

In [170]:
# Share of rows that were served the new landing page.
df[df['landing_page'].eq('new_page')].shape[0] / df.shape[0]

0.5000619442226688

In [171]:
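# From the above analysis, the new page appears to have a slightly lower conversion rate than the old page (about 11.88% vs. 12.04%); the A/B test below checks whether this difference is statistically significant.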

## A/B Test

In [172]:
# H0 = conversion of treatment group <= conversion of control group 
# H1 = conversion of treatment group > conversion of control group


In [174]:
# Split the cleaned data into one frame per experiment group.
control_df = df[df['group'].eq('control')]
treatment_df = df[df['group'].eq('treatment')]

In [175]:
# Observed conversion rate in the treatment group (mean of `converted`).
# NOTE(review): the earlier comment called this the rate "assuming null is true",
# but the value is the treatment sample mean, not a pooled rate; confirm intent.
P_new = treatment_df.converted.mean()
P_new

0.11880806551510564

In [176]:
# Observed conversion rate in the control group (mean of `converted`).
P_old = control_df.converted.mean()
P_old

0.1203863045004612

In [186]:
# Sample size of the treatment group.
n_new = treatment_df.shape[0]
n_new

145310

In [185]:
# Sample size of the control group.
n_old = control_df.shape[0]
n_old

145274

In [181]:
import statsmodels.api as sm


In [191]:
# Number of converted users in the control group.
# The mask is built from control_df itself rather than from the full `df`,
# so the selection does not depend on implicit index alignment between frames.
control_conversions = int(control_df['converted'].eq(1).sum())
control_conversions

17489

In [192]:
# Number of converted users in the treatment group.
# The mask is built from treatment_df itself rather than from the full `df`,
# so the selection does not depend on implicit index alignment between frames.
treatment_conversions = int(treatment_df['converted'].eq(1).sum())
treatment_conversions

17264

In [196]:
# Two-sample z-test for proportions, control listed first and treatment second.
# With that ordering, alternative='smaller' tests H1: control rate < treatment rate,
# i.e. the treatment (new page) converts better than the control.
# Uses the conversion counts computed above; the names `convert_old` and
# `convert_new` were never defined in this notebook and fail on a fresh kernel.
z_score, p_value = sm.stats.proportions_ztest(
    count=[control_conversions, treatment_conversions],
    nobs=[n_old, n_new],
    alternative='smaller',
)
z_score, p_value

(1.3109241984234394, 0.9050583127590245)