# simulating click data for A/B Testing

In [1]:
# generate data
import numpy as np
import pandas as pd
from scipy.stats import norm

N_exp = 10000
N_con = 10000

# generating click data
click_exp = pd.Series(np.random.binomial(1,0.4,size = N_exp))
click_con = pd.Series(np.random.binomial(1,0.2,size = N_con))

# generate grop indentifier
exp_id = pd.Series(np.repeat("exp", N_exp))
con_id = pd.Series(np.repeat("con", N_con))

df_exp = pd.concat([click_exp,exp_id], axis = 1)
df_con = pd.concat([click_con,con_id], axis = 1)

df_exp.columns = ["click","group"]
df_con.columns = ["click","group"]

df_ab_test = pd.concat([df_exp,df_con], axis = 0).reset_index(drop=True)
df_ab_test.head()

Unnamed: 0,click,group
0,0,exp
1,1,exp
2,1,exp
3,1,exp
4,1,exp


In [2]:
# calculate p hat for exp and con groups
X_con = df_ab_test.groupby("group")["click"].sum().loc["con"]
X_exp = df_ab_test.groupby("group")["click"].sum().loc["exp"]
print("Number of clicks in control:", X_con)
print("Number of clicks in experimental:", X_exp)

p_con_hat = X_con/N_con
p_exp_hat = X_exp/N_exp
print("Click probability in control group:", p_con_hat)
print("Click probability in experimental group:", p_exp_hat)

Number of clicks in control: 1947
Number of clicks in experimental: 3957
Click probability in control group: 0.1947
Click probability in experimental group: 0.3957


In [3]:
# p pooled 
p_pooled_hat = (X_con+X_exp)/(N_con+N_exp)

# pooled variance
p_pooled_var = p_pooled_hat * (1-p_pooled_hat) * (1/N_con + 1/N_exp)

# standard error
SE = np.sqrt(p_pooled_var)

print("p^_pooled is:", p_pooled_hat)
print("pooled variance is:", p_pooled_var)
print("standard error is:", SE)

p^_pooled is: 0.2952
pooled variance is: 4.1611392000000004e-05
standard error is: 0.006450689265497138


In [4]:
# test statistics
test_stat = (p_con_hat - p_exp_hat)/SE
print("Test statistics for 2-sample Z-test is:", test_stat)

Test statistics for 2-sample Z-test is: -31.15946090832348


In [5]:
# check the significance
alpha = 0.05
print("Alpha: significance level is:", alpha)

Z_crit = norm.ppf(1-alpha/2)
print("Z-critical value from standard normal distribution is:", Z_crit)

# p-value
p_value = 2 * norm.sf(abs(test_stat))
print("P-valur of the 2-sample Z-test is:", round(p_value,2))

Alpha: significance level is: 0.05
Z-critical value from standard normal distribution is: 1.959963984540054
P-valur of the 2-sample Z-test is: 0.0


## since p-value < 0.05 we can conclude that null hypothesis can be rejected. that is there is a statistical significance between the experimental vesrion of the product and  the control version of the product