In [39]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [40]:
#Simulating Click Data for a/b Testing
N_exp = 10000
N_con = 10000

In [72]:
#Generate click data
click_exp = pd.Series(np.random.binomial(1, 0.5, size=N_exp))
click_con = pd.Series(np.random.binomial(1, 0.2, size=N_con))

In [73]:
#Generate Group Identifier
group_exp = pd.Series(np.repeat('exp', N_exp))
group_con = pd.Series(np.repeat('con', N_con))

df_exp = pd.concat([click_exp, group_exp], axis=1)
df_con = pd.concat([click_con, group_con], axis=1)

df_exp.columns = ['click', 'group']
df_con.columns = ['click', 'group']

print(df_exp)
print(df_con)

      click group
0         0   exp
1         1   exp
2         1   exp
3         0   exp
4         0   exp
...     ...   ...
9995      1   exp
9996      0   exp
9997      0   exp
9998      0   exp
9999      0   exp

[10000 rows x 2 columns]
      click group
0         1   con
1         0   con
2         0   con
3         0   con
4         0   con
...     ...   ...
9995      1   con
9996      0   con
9997      0   con
9998      0   con
9999      0   con

[10000 rows x 2 columns]


In [74]:
df_ab_test = pd.concat([df_exp, df_con], axis=0).reset_index(drop=True)
df_ab_test

Unnamed: 0,click,group
0,0,exp
1,1,exp
2,1,exp
3,0,exp
4,0,exp
...,...,...
19995,1,con
19996,0,con
19997,0,con
19998,0,con


In [75]:
X_con = df_ab_test.groupby("group")["click"].sum().loc["con"]
X_exp = df_ab_test.groupby("group")["click"].sum().loc["exp"]

In [76]:
print(df_ab_test.groupby("group")["click"].sum())

group
con    1971
exp    5048
Name: click, dtype: int64


In [77]:
print("Number of clicks in Control:", X_con)
print("Number of clicks in Experimental:", X_exp)

Number of clicks in Control: 1971
Number of clicks in Experimental: 5048


In [78]:
p_con_hat = X_con / N_con
p_exp_hat = X_exp / N_exp

print("Estimated click probability in Control:", p_con_hat)
print("Estimated click probability in Experimental:", p_exp_hat)

Estimated click probability in Control: 0.1971
Estimated click probability in Experimental: 0.5048


In [79]:
p_pooled_hat = (X_con + X_exp) / (N_con + N_exp)
pooled_variance = p_pooled_hat * (1 - p_pooled_hat) * (1 / N_con + 1 / N_exp)

print("Pooled p^:", p_pooled_hat)
print("Pooled Variance:", pooled_variance)

Pooled p^: 0.35095
Pooled Variance: 4.55568195e-05


In [80]:
se = np.sqrt(pooled_variance)
print("Standard Error:", se)

Standard Error: 0.006749579209106298


In [81]:
Test_stat = (p_con_hat - p_exp_hat) / se
print("Test Statistic for 2 sample Z-test:",Test_stat)

Test Statistic for 2 sample Z-test: -45.5880271150625


In [82]:
alpha = 0.05
print("Significance Level Alpha:", alpha)

Significance Level Alpha: 0.05


In [83]:
Z_crit = stats.norm.ppf(1 - alpha / 2)
print("Z-critical Value from standard normal distribution:", Z_crit)

Z-critical Value from standard normal distribution: 1.959963984540054


In [84]:
p_value = 2 * stats.norm.sf(abs(Test_stat))
print("P-value of 2 sample Z-test:", round(p_value, 3))

P-value of 2 sample Z-test: 0.0


In [89]:
ci = [round((p_exp_hat-p_con_hat) - se*Z_crit, 3), round((p_exp_hat-p_con_hat) + se*Z_crit, 3)]
print("Confidence Interval of 2 sample Z-test is:", ci)

Confidence Interval of 2 sample Z-test is: [0.294, 0.321]


In [90]:
delta = X_exp / N_exp - X_con / N_con
delta

0.30770000000000003