In [1]:
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime
from scipy.stats import ttest_ind

# A/B test analysis

## Assignments

In [2]:
data = pd.read_csv('assignments.csv')
data.head()

Unnamed: 0,userid,ts,groupid
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0


In [3]:
print(datetime.strptime(data.head(1)['ts'][0],'%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d'))

2021-11-02


In [4]:
data['dt'] = data['ts'].map(lambda x: datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d'))
data

Unnamed: 0,userid,ts,groupid,dt
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0,2021-11-02
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0,2021-11-13
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0,2021-11-20
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0,2021-11-20
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0,2021-11-04
...,...,...,...,...
59995,5ad7285b-0aa2-49e0-b8e6-c727ef48b9bc,2021-11-02T12:00:13Z,1,2021-11-02
59996,f032f15e-775b-4a78-8566-f513f3b84192,2021-11-09T19:13:58Z,1,2021-11-09
59997,1da8e056-a5ba-4242-8026-6cc9e6f033c0,2021-11-19T00:23:10Z,1,2021-11-19
59998,e4407b37-66b6-4683-af6b-1b54a74556b5,2021-11-16T16:28:41Z,1,2021-11-16


In [5]:
data.groupby('groupid').describe()

Unnamed: 0_level_0,userid,userid,userid,userid,ts,ts,ts,ts,dt,dt,dt,dt
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,29951,29951,c5d77c89-33a3-4fe3-9e31-179dec09d49c,1,29951,29722,2021-11-09T09:48:35Z,2,29951,20,2021-11-09,1541
1,30049,30049,2df36760-7452-47de-ad12-03fece7c9d1e,1,30049,29813,2021-11-07T15:50:11Z,3,30049,20,2021-11-20,1557


In [6]:
data_ass = data.groupby(['groupid','dt']).count().reset_index()
data_ass.head()

Unnamed: 0,groupid,dt,userid,ts
0,0,2021-11-01,1497,1497
1,0,2021-11-02,1467,1467
2,0,2021-11-03,1532,1532
3,0,2021-11-04,1509,1509
4,0,2021-11-05,1503,1503


In [7]:
alt.Chart(data_ass).mark_line(size=3).encode(
    alt.X('dt:T', axis=alt.Axis(title='date')),
    alt.Y('userid:Q', axis=alt.Axis(title='number of users')),
    tooltip=['userid'],
    color='groupid:N'
    
).properties(width=600, height=400)


## Pre-test metrics

### User activity

In [8]:
data_act = pd.read_csv('activity_all.csv')

In [9]:
data_act.head()

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0


In [10]:
act = data_act.query('activity_level>0').groupby(['groupid','dt']).count().reset_index()
act

Unnamed: 0,groupid,dt,userid,activity_level
0,0,2021-10-01,15337,15337
1,0,2021-10-02,15354,15354
2,0,2021-10-03,15423,15423
3,0,2021-10-04,15211,15211
4,0,2021-10-05,15126,15126
...,...,...,...,...
117,1,2021-11-26,29303,29303
118,1,2021-11-27,29350,29350
119,1,2021-11-28,29273,29273
120,1,2021-11-29,29289,29289


In [11]:
act.groupby('groupid').describe()

Unnamed: 0_level_0,userid,userid,userid,userid,userid,userid,userid,userid,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,61.0,15547.655738,353.023412,15126.0,15285.0,15356.0,15990.0,16147.0,61.0,15547.655738,353.023412,15126.0,15285.0,15356.0,15990.0,16147.0
1,61.0,22213.131148,7032.196012,15202.0,15343.0,15531.0,29300.0,29382.0,61.0,22213.131148,7032.196012,15202.0,15343.0,15531.0,29300.0,29382.0


In [12]:
alt.Chart(act).mark_line(size=3).encode(
    alt.X('dt:T', axis=alt.Axis(title='date')),
    alt.Y('userid:Q', axis=alt.Axis(title='number of users')),
    tooltip=['userid'],
    color='groupid:N'
    
).properties(width=600, height=400)

In [13]:
act.query('dt< "2021-11-01" ').groupby('groupid').describe()

Unnamed: 0_level_0,userid,userid,userid,userid,userid,userid,userid,userid,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,31.0,15320.870968,89.887241,15126.0,15277.0,15335.0,15364.0,15512.0,31.0,15320.870968,89.887241,15126.0,15277.0,15335.0,15364.0,15512.0
1,31.0,15352.516129,86.536648,15202.0,15294.0,15343.0,15421.0,15531.0,31.0,15352.516129,86.536648,15202.0,15294.0,15343.0,15421.0,15531.0


In [14]:
act.query('dt>= "2021-11-01" ').groupby('groupid').describe()

Unnamed: 0_level_0,userid,userid,userid,userid,userid,userid,userid,userid,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,30.0,15782.0,371.077276,15163.0,15335.0,15990.5,16045.0,16147.0,30.0,15782.0,371.077276,15163.0,15335.0,15990.5,16045.0,16147.0
1,30.0,29302.433333,30.417422,29255.0,29280.0,29300.0,29321.0,29382.0,30.0,29302.433333,30.417422,29255.0,29280.0,29300.0,29321.0,29382.0


### Comparing the activity between the groups

In [15]:
g0_bf = act.query('groupid == 0 and dt< "2021-11-01" ')
g0_bf.head()

Unnamed: 0,groupid,dt,userid,activity_level
0,0,2021-10-01,15337,15337
1,0,2021-10-02,15354,15354
2,0,2021-10-03,15423,15423
3,0,2021-10-04,15211,15211
4,0,2021-10-05,15126,15126


In [16]:
g0_af = act.query('groupid == 0 and dt>= "2021-11-01" ')
g0_af.head()

Unnamed: 0,groupid,dt,userid,activity_level
31,0,2021-11-01,15989,15989
32,0,2021-11-02,16024,16024
33,0,2021-11-03,16049,16049
34,0,2021-11-04,16040,16040
35,0,2021-11-05,16045,16045


In [17]:
g1_bf = act.query('groupid == 1 and dt< "2021-11-01" ')
g1_bf.head()

Unnamed: 0,groupid,dt,userid,activity_level
61,1,2021-10-01,15297,15297
62,1,2021-10-02,15421,15421
63,1,2021-10-03,15362,15362
64,1,2021-10-04,15388,15388
65,1,2021-10-05,15462,15462


In [18]:
g1_af = act.query('groupid == 1 and dt>= "2021-11-01" ')
g1_af.head()

Unnamed: 0,groupid,dt,userid,activity_level
92,1,2021-11-01,29318,29318
93,1,2021-11-02,29289,29289
94,1,2021-11-03,29306,29306
95,1,2021-11-04,29267,29267
96,1,2021-11-05,29336,29336


By the activity levels

In [19]:
res_af_act = ttest_ind(g0_af['activity_level'].to_numpy(),
                       g1_af['activity_level'].to_numpy()
).pvalue
res_af_act = "{:.10f}".format(res_af_act)
res_af_act

'0.0000000000'

Checking for the pretest bias on activity.

In [20]:
res_bf_act = ttest_ind(g0_bf['activity_level'].to_numpy(),
                       g1_bf['activity_level'].to_numpy()
).pvalue
bf_act = "{:.10f}".format(res_bf_act)
bf_act

'0.1630842354'

By the number of active users

In [21]:
res_af_usr = ttest_ind(g0_af['userid'].to_numpy(),
                       g1_af['userid'].to_numpy()
).pvalue
af_usr = "{:.10f}".format(res_af_usr)
af_usr

'0.0000000000'

Checking for the pretest bias on the numbe of users

In [22]:
res_bf_usr = ttest_ind(g0_bf['userid'].to_numpy(),
                       g1_bf['userid'].to_numpy()
).pvalue
bf_usr = "{:.10f}".format(res_bf_usr)
bf_usr

'0.1630842354'

### Click through rate (CTR)

In [23]:
data_ctr = pd.read_csv("ctr_all.csv")

In [24]:
data_ctr.head()

Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95


In [42]:
ctr = data_ctr.groupby(['groupid','dt']).mean().reset_index()

In [43]:
alt.Chart(data_ctr_avg).mark_line(size=5).encode(
    alt.X('dt'),
    alt.Y('ctr'),
    color='groupid:N',
    tooltip=['ctr']
).properties(
    width=600,
    height=400
)

In [44]:
c_g0_bf = ctr.query('groupid == 0 and dt< "2021-11-01" ')
c_g0_bf.head()

Unnamed: 0,groupid,dt,ctr
0,0,2021-10-01,32.980627
1,0,2021-10-02,33.004056
2,0,2021-10-03,33.002006
3,0,2021-10-04,32.990363
4,0,2021-10-05,33.014167


In [45]:
c_g0_af = ctr.query('groupid == 0 and dt>= "2021-11-01" ')
c_g0_af.head()

Unnamed: 0,groupid,dt,ctr
31,0,2021-11-01,32.982671
32,0,2021-11-02,33.014983
33,0,2021-11-03,33.008268
34,0,2021-11-04,32.986679
35,0,2021-11-05,33.004766


In [46]:
c_g1_bf = ctr.query('groupid == 1 and dt< "2021-11-01" ')
c_g1_bf.head()

Unnamed: 0,groupid,dt,ctr
61,1,2021-10-01,33.006299
62,1,2021-10-02,32.979326
63,1,2021-10-03,32.988139
64,1,2021-10-04,32.995596
65,1,2021-10-05,32.994796


In [47]:
c_g1_af = ctr.query('groupid == 1 and dt>= "2021-11-01" ')
c_g1_af.head()

Unnamed: 0,groupid,dt,ctr
92,1,2021-11-01,37.994619
93,1,2021-11-02,38.013656
94,1,2021-11-03,37.995562
95,1,2021-11-04,37.988512
96,1,2021-11-05,38.002816


In [51]:
c_res_af_act = ttest_ind(c_g0_af['ctr'].to_numpy(),
                       c_g1_af['ctr'].to_numpy()
).pvalue
c_res_af_act = "{:.10f}".format(c_res_af_act)
c_res_af_act

'0.0000000000'

Checking for the pretest bias on ctr.

In [50]:
c_res_bf_act = ttest_ind(c_g0_bf['ctr'].to_numpy(),
                       c_g1_bf['ctr'].to_numpy()
).pvalue
c_bf_act = "{:.10f}".format(c_res_bf_act)
c_bf_act

'0.7042806646'