# AB Testing

## Import Library

In [110]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from statsmodels.stats.proportion import proportions_ztest


## Import Data

In [61]:
ab_df = pd.read_csv('https://raw.githubusercontent.com/AnshulSGarg/AB-Testing/main/ab_data.csv')
ab_df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,1/21/2017 22:11,control,old_page,0
1,804228,1/12/2017 8:01,control,old_page,0
2,661590,1/11/2017 16:55,treatment,new_page,0
3,853541,1/8/2017 18:28,treatment,new_page,0
4,864975,1/21/2017 1:52,control,old_page,1
...,...,...,...,...,...
294473,751197,1/3/2017 22:28,control,old_page,0
294474,945152,1/12/2017 0:51,control,old_page,0
294475,734608,1/22/2017 11:45,control,old_page,0
294476,697314,1/15/2017 1:20,control,old_page,0


## Data Validation

In [64]:
ab_df.isna().values.any()

False

There are no missing values in dataset

In [65]:
print('Check if there are duplicates')
count_df = ab_df.groupby('user_id').count().reset_index()
duplicate_df = count_df[count_df['group']>1][['user_id','group']]
duplicate_df

Check if there are duplicates


Unnamed: 0,user_id,group
43,630052,2
111,630126,2
122,630137,2
281,630320,2
420,630471,2
...,...,...
290244,945627,2
290259,945645,2
290315,945703,2
290399,945797,2


In [66]:
ab_df[ab_df['user_id'].isin(duplicate_df['user_id'])].sort_values(by='user_id')

Unnamed: 0,user_id,timestamp,group,landing_page,converted
230259,630052,1/17/2017 1:16,treatment,new_page,0
213114,630052,1/7/2017 12:25,treatment,old_page,1
22513,630126,1/14/2017 13:35,treatment,old_page,0
251762,630126,1/19/2017 17:16,treatment,new_page,0
183371,630137,1/20/2017 2:08,control,old_page,0
...,...,...,...,...,...
142354,945703,1/8/2017 19:40,control,new_page,0
186960,945797,1/13/2017 17:23,control,old_page,0
40370,945797,1/11/2017 3:04,control,new_page,1
165143,945971,1/16/2017 10:09,control,old_page,0


In [67]:
print(7788/294478)

0.026446797383845314


Seems like 3894 users have multiple records and they are exposed to both new and old landing page.
if we exclude these users from the dataset we will loose only 2.6% of total records. 
Loosing 2.6% of data records should be okay in this case. 



In [68]:
ab_df = ab_df[~ab_df['user_id'].isin(duplicate_df['user_id'])]
ab_df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,1/21/2017 22:11,control,old_page,0
1,804228,1/12/2017 8:01,control,old_page,0
2,661590,1/11/2017 16:55,treatment,new_page,0
3,853541,1/8/2017 18:28,treatment,new_page,0
4,864975,1/21/2017 1:52,control,old_page,1
...,...,...,...,...,...
294473,751197,1/3/2017 22:28,control,old_page,0
294474,945152,1/12/2017 0:51,control,old_page,0
294475,734608,1/22/2017 11:45,control,old_page,0
294476,697314,1/15/2017 1:20,control,old_page,0


In [69]:
len(ab_df)

286690

In [70]:
ab_df.groupby('user_id').count().reset_index().sort_values(by='group', ascending=False)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,630000,1,1,1,1
191124,840746,1,1,1,1
191130,840752,1,1,1,1
191129,840751,1,1,1,1
191128,840750,1,1,1,1
...,...,...,...,...,...
95569,735348,1,1,1,1
95570,735349,1,1,1,1
95571,735350,1,1,1,1
95572,735351,1,1,1,1


In [71]:
ab_df.groupby(['group','landing_page']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,converted
group,landing_page,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,old_page,143293,143293,143293
treatment,new_page,143397,143397,143397


After removing duplicates, updated dataset has control group that is exposed to only old_page and treatment group is only exposed to new_page.

In [72]:
ab_df.groupby(['group','converted']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,landing_page
group,converted,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,0,126073,126073,126073
control,1,17220,17220,17220
treatment,0,126372,126372,126372
treatment,1,17025,17025,17025


# Frequentist Test

In [74]:
control_df = ab_df[ab_df['group']=='control']
treatment_df = ab_df[ab_df['group']=='treatment']



In [88]:
control_group_count = len(control_df)
treatment_group_count = len(treatment_df)

print(f'control group count is {control_group_count}')
print(f'treatment group count is {treatment_group_count}')

control group count is 143293
treatment group count is 143397


In [87]:
control_group_conversion_count = len(control_df[control_df['converted'] == 1])
treatment_group_conversion_count = len(treatment_df[treatment_df['converted'] == 1])

print(f'count of control group conversion is {control_group_conversion_count}')
print(f'count of treatment group conversion is {treatment_group_conversion_count}')

count of control group conversion is 17220
count of treatment group conversion is 17025


In [86]:
control_group_conversion_rate = len(control_df[control_df['converted'] == 1])/len(control_df)
treatment_group_conversion_rate = len(treatment_df[treatment_df['converted'] == 1])/len(treatment_df)

print(f'mean of control group conversion is {control_group_conversion_rate}')
print(f'mean of treatment group conversion is {treatment_group_conversion_rate}')

mean of control group conversion is 0.12017335110577627
mean of treatment group conversion is 0.11872633318688676


H0 = There is no signaficant difference b/w mean of control group and treatement group conversion
H1 = mean of treatement group conversion > mean of control group conversion

In [103]:
def two_sample_test_for_proportions(c, t, nc, nt):
    combined_p = (c+t)/(nc+nt)
    # print(f'combined_p is {combined_p}')
    Z = (((t/nt)-(c/nc)) - 0) / (combined_p * (1-combined_p) * ((1/nt)+(1/nc)))**.5
    print(f'Z statistics is {Z}')
    return Z



In [108]:
z_statistic = two_sample_test_for_proportions(control_group_conversion_count,treatment_group_conversion_rate,control_group_count,treatment_group_count)


Z statistics is -135.40066154264042


In [111]:
# Calculate the two-tailed p-value
p_value = 2 * (1 - norm.cdf(abs(z_statistic)))

# Print the result
print(f"P-value is {p_value}")

# Check the significance level (commonly 0.05)
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

P-value is 0.0
Reject the null hypothesis


# p value using statsmodels.stats.proportion library

In [102]:
stat, p_value = proportions_ztest([control_group_conversion_count, treatment_group_conversion_rate], [control_group_count, treatment_group_count])

# Output the results
print(f"Z-statistic: {stat}")
print(f"P-value: {p_value}")

# Check the significance level (commonly 0.05)
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")


Z-statistic: 135.40066154264042
P-value: 0.0
Reject the null hypothesis


In [92]:
count_df = ab_df.groupby('user_id').count().reset_index().sort_values(by='group', ascending=False)
count_df


Unnamed: 0,user_id,timestamp,group,landing_page,converted
91655,729669,2,2,2,2
179531,825266,2,2,2,2
30345,663033,2,2,2,2
41348,674959,2,2,2,2
225511,875210,2,2,2,2
...,...,...,...,...,...
97535,736032,1,1,1,1
97536,736033,1,1,1,1
97537,736034,1,1,1,1
97538,736035,1,1,1,1


In [104]:
count_df[count_df['group']==1]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
191971,838798,1,1,1,1
194744,841817,1,1,1,1
191967,838794,1,1,1,1
194249,841281,1,1,1,1
194241,841273,1,1,1,1
...,...,...,...,...,...
97535,736032,1,1,1,1
97536,736033,1,1,1,1
97537,736034,1,1,1,1
97538,736035,1,1,1,1


In [93]:
len(count_df[count_df['group']==2])/len(count_df)

0.013400600170690747

In [94]:
len(count_df)

290584

In [117]:
ab_df = ab_df.drop_duplicates(subset=['user_id'],keep=False)
ab_df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,1/21/17 10:11 PM,control,old_page,0
1,804228,1/12/17 8:01 AM,control,old_page,0
2,661590,1/11/17 4:55 PM,treatment,new_page,0
3,853541,1/8/17 6:28 PM,treatment,new_page,0
4,864975,1/21/17 1:52 AM,control,old_page,1
...,...,...,...,...,...
294473,751197,1/3/17 10:28 PM,control,old_page,0
294474,945152,1/12/17 12:51 AM,control,old_page,0
294475,734608,1/22/17 11:45 AM,control,old_page,0
294476,697314,1/15/17 1:20 AM,control,old_page,0


In [119]:
ab_df = ab_df[~(ab_df['group']=='control') | ~(ab_df['landing_page']=='new_page')]
ab_df = ab_df[~(ab_df['group']=='treatment') | ~(ab_df['landing_page']=='old_page')]
ab_df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,1/21/17 10:11 PM,control,old_page,0
1,804228,1/12/17 8:01 AM,control,old_page,0
2,661590,1/11/17 4:55 PM,treatment,new_page,0
3,853541,1/8/17 6:28 PM,treatment,new_page,0
4,864975,1/21/17 1:52 AM,control,old_page,1
...,...,...,...,...,...
294473,751197,1/3/17 10:28 PM,control,old_page,0
294474,945152,1/12/17 12:51 AM,control,old_page,0
294475,734608,1/22/17 11:45 AM,control,old_page,0
294476,697314,1/15/17 1:20 AM,control,old_page,0


In [120]:
ab_df.groupby('group').count()

Unnamed: 0_level_0,user_id,timestamp,landing_page,converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,143293,143293,143293,143293
treatment,143397,143397,143397,143397


In [121]:
old_conversion = len(ab_df[(ab_df['group']=='control') & (ab_df['converted']==1)])/len(ab_df[(ab_df['group']=='control')])
old_conversion

0.12017335110577627

In [122]:
new_conversion = len(ab_df[(ab_df['group']=='treatment') & (ab_df['converted']==1)])/len(ab_df[(ab_df['group']=='treatment')])
new_conversion

0.11872633318688676

In [124]:
lift = (new_conversion-old_conversion)/old_conversion
lift

-0.001447017918889515

In [125]:
# ab_df['timestamp'] = pd.to_datetime(ab_df['timestamp'], format='%m/%d-%Y hh:mm AM')
ab_df['timestamp'] = pd.to_datetime(ab_df['timestamp'])
# Extract the week from the 'DateColumn' and create a new column 'Week'
ab_df['Week'] = ab_df['timestamp'].dt.isocalendar().week

# Display the DataFrame with the 'Week' column
ab_df

  ab_df['timestamp'] = pd.to_datetime(ab_df['timestamp'])


Unnamed: 0,user_id,timestamp,group,landing_page,converted,Week
0,851104,2017-01-21 22:11:00,control,old_page,0,3
1,804228,2017-01-12 08:01:00,control,old_page,0,2
2,661590,2017-01-11 16:55:00,treatment,new_page,0,2
3,853541,2017-01-08 18:28:00,treatment,new_page,0,1
4,864975,2017-01-21 01:52:00,control,old_page,1,3
...,...,...,...,...,...,...
294473,751197,2017-01-03 22:28:00,control,old_page,0,1
294474,945152,2017-01-12 00:51:00,control,old_page,0,2
294475,734608,2017-01-22 11:45:00,control,old_page,0,3
294476,697314,2017-01-15 01:20:00,control,old_page,0,2


In [126]:
ab_df.groupby('Week').count()

Unnamed: 0_level_0,user_id,timestamp,group,landing_page,converted
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,83745,83745,83745,83745,83745
2,91380,91380,91380,91380,91380
3,91056,91056,91056,91056,91056
4,20509,20509,20509,20509,20509


In [127]:
ab_df[(ab_df['group']=='control') & (ab_df['converted']==1)]

Unnamed: 0,user_id,timestamp,group,landing_page,converted,Week
4,864975,2017-01-21 01:52:00,control,old_page,1,3
15,644214,2017-01-22 02:05:00,control,old_page,1,3
28,913579,2017-01-24 09:11:00,control,old_page,1,4
36,831737,2017-01-11 21:18:00,control,old_page,1,2
43,862225,2017-01-08 14:49:00,control,old_page,1,1
...,...,...,...,...,...,...
294383,728029,2017-01-11 11:17:00,control,old_page,1,2
294385,850065,2017-01-17 11:57:00,control,old_page,1,3
294405,712217,2017-01-11 10:34:00,control,old_page,1,2
294420,795742,2017-01-09 01:06:00,control,old_page,1,2


In [137]:
control_converted = len(ab_df[(ab_df['group']=='control') & (ab_df['converted']==1)])
control_not_converted = len(ab_df[(ab_df['group']=='control') & ~(ab_df['converted']==1)])
treatement_converted = len(ab_df[(ab_df['group']=='treatment') & (ab_df['converted']==1)])
treatement_not_converted = len(ab_df[(ab_df['group']=='treatment') & ~(ab_df['converted']==1)])


In [138]:
np.array([[control_converted,control_not_converted],[treatement_converted,treatement_not_converted]])

array([[ 17220, 126073],
       [ 17025, 126372]])