In [None]:
import pandas as pd
import numpy as np
import datetime
from scipy.stats import chi2_contingency, beta
from IPython.display import Image

In [None]:
# Location of the raw A/B test export (path looks like a Colab Drive mount;
# adjust when running elsewhere).
AB_DATA_PATH = '/content/drive/MyDrive/Drive/ab_data.csv'
df = pd.read_csv(AB_DATA_PATH)

In [None]:
# Show two randomly chosen rows to get a feel for the columns.
df.sample(2)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
252187,672198,2017-01-11 09:44:41.480039,control,old_page,0
177919,881047,2017-01-12 15:03:37.708172,treatment,new_page,0


In [None]:
# Spot-check all rows recorded for a single user id.
df[df['user_id'] == 798939]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
266003,798939,2017-01-15 03:29:24.918332,control,old_page,0


In [None]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [None]:
# Timestamps are still strings here; their fixed-width layout means the
# lexicographic min/max are also the earliest/latest moments.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
first_seen = df['timestamp'].min()
last_seen = df['timestamp'].max()
start_time = datetime.datetime.strptime(first_seen, TIMESTAMP_FORMAT)
end_time = datetime.datetime.strptime(last_seen, TIMESTAMP_FORMAT)
data_duration = (end_time - start_time).days  # whole days only

unique_users = df['user_id'].nunique()
landing_pages = df['landing_page'].unique().tolist()
control_share = round(len(df[df['group'] == 'control']) * 100 / len(df))

print(f"Number of unique users in experiment: {unique_users}")
print(f"Data collected for {data_duration} days")
print(f"Landing pages to compare: {landing_pages}")
print(f"Percentage of users in control: {control_share}%")

Number of unique users in experiment: 290584
Data collected for 21 days
Landing pages to compare: ['old_page', 'new_page']
Percentage of users in control: 50%


In [None]:
# Two example users that appear more than once, to see what repeated rows look like.
example_user_ids = [746755, 722274]
sample = df[df['user_id'].isin(example_user_ids)]
sample

Unnamed: 0,user_id,timestamp,group,landing_page,converted
29073,746755,2017-01-11 01:28:57.083669,control,new_page,1
105487,722274,2017-01-19 01:46:53.093257,control,old_page,0
262554,722274,2017-01-09 21:21:23.638444,control,new_page,0
286566,746755,2017-01-05 03:40:08.457451,control,old_page,0


In [None]:
# Earliest timestamp per user among the example rows.
first_seen_per_user = sample.groupby('user_id')['timestamp'].min()
first_seen_per_user.reset_index()

Unnamed: 0,user_id,timestamp
0,722274,2017-01-09 21:21:23.638444
1,746755,2017-01-05 03:40:08.457451


In [None]:
# Rows per user, then how many users have more than one row.
counter = df['user_id'].value_counts()
has_multiple_rows = counter > 1
has_multiple_rows.value_counts()

False    286690
True       3894
Name: user_id, dtype: int64

In [None]:
# Boolean flag per user: True when the user has more than one row.
counter.gt(1)

805339     True
754884     True
722274     True
783176     True
898232     True
          ...  
642985    False
771499    False
923606    False
712675    False
715931    False
Name: user_id, Length: 290584, dtype: bool

In [None]:
# Keep only users that appear exactly once; the inner merge drops every
# row belonging to a user with repeated records.
single_row_ids = counter[counter == 1].index
valid_users = pd.DataFrame({'user_id': single_row_ids})
df = df.merge(valid_users, on='user_id')

In [None]:
# df['week'] = df['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').isocalendar()[1])
# df.sample()

In [None]:
def date_convert(date_to_convert):
    """Reduce a timestamp to its calendar date as a 'YYYY-MM-DD' string.

    Parameters
    ----------
    date_to_convert : str or datetime.datetime
        Either a string in '%Y-%m-%d %H:%M:%S.%f' format, or an already
        parsed datetime (pandas Timestamps qualify, as they subclass
        datetime.datetime).

    Returns
    -------
    str
        The date portion formatted as '%Y-%m-%d'.

    Raises
    ------
    ValueError
        If a string input does not match the expected format.
    """
    # Accepting parsed datetimes keeps the conversion cell re-runnable:
    # after the first run the column no longer holds strings.
    if isinstance(date_to_convert, datetime.datetime):
        parsed = date_to_convert
    else:
        parsed = datetime.datetime.strptime(date_to_convert, '%Y-%m-%d %H:%M:%S.%f')
    return parsed.strftime('%Y-%m-%d')

# Strip the time of day from every timestamp, then store the column as datetimes.
date_strings = df['timestamp'].apply(date_convert)
df['timestamp'] = pd.to_datetime(date_strings)

In [None]:
# Scratch copy so the index manipulation in the following cells does not touch df.
df1 = df.copy()

In [None]:
# Index the scratch copy by timestamp so datetime accessors can be called on the index.
df1.index = df['timestamp']

In [None]:
# ISO calendar week number for each row's timestamp.
df1.index.isocalendar().week

timestamp
2017-01-21    3
2017-01-12    2
2017-01-11    2
2017-01-08    1
2017-01-21    3
             ..
2017-01-03    1
2017-01-12    2
2017-01-22    3
2017-01-15    2
2017-01-16    3
Name: week, Length: 286690, dtype: UInt32

In [None]:
# ISO calendar week of each row, derived from df's own timestamp column.
# This avoids the positional hand-off from the scratch frame, which would
# silently misalign if df and df1 ever differed in row order or length.
df['week'] = df['timestamp'].dt.isocalendar().week

In [None]:
# Inspect the frame now that the week column has been added.
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted,week
0,851104,2017-01-21,control,old_page,0,3
1,804228,2017-01-12,control,old_page,0,2
2,661590,2017-01-11,treatment,new_page,0,2
3,853541,2017-01-08,treatment,new_page,0,1
4,864975,2017-01-21,control,old_page,1,3
...,...,...,...,...,...,...
286685,751197,2017-01-03,control,old_page,0,1
286686,945152,2017-01-12,control,old_page,0,2
286687,734608,2017-01-22,control,old_page,0,3
286688,697314,2017-01-15,control,old_page,0,2


In [None]:
# Number of rows falling in each ISO week.
df['week'].value_counts()

2    91380
3    91056
1    83745
4    20509
Name: week, dtype: Int64

In [None]:
NUM_WEEKS = 4  # Vary number to get experiment data at weekly points in time

# Restrict to the chosen window, then split by experiment arm.
experiment_data = df.loc[df['week'] <= NUM_WEEKS]
control = experiment_data.loc[experiment_data['group'] == 'control']
treatment = experiment_data.loc[experiment_data['group'] == 'treatment']

# Conversion rate per arm, expressed in percent.
control_outcomes = control['converted']
treatment_outcomes = treatment['converted']
control_conversion_perc = round(control_outcomes.sum() * 100 / control_outcomes.count(), 3)
treatment_conversion_perc = round(treatment_outcomes.sum() * 100 / treatment_outcomes.count(), 3)

# Absolute difference between the two rates (percentage points).
lift = round(treatment_conversion_perc - control_conversion_perc, 3)

print(f"Treatment Conversion Rate: {treatment_conversion_perc}%")
print(f"Control Conversion Rate: {control_conversion_perc}%")
print(f"Lift = {lift}%")

Treatment Conversion Rate: 11.873%
Control Conversion Rate: 12.017%
Lift = -0.144%


In [None]:
# Number of non-null 'converted' values in the control arm.
control['converted'].count()

143293

In [None]:
# The easiest way to apply a chi-squared test is to compute the contingency
# table of group vs. outcome. Build it from experiment_data (not the full df)
# so the test covers the same NUM_WEEKS window as the reported conversion
# rates. chi2_contingency is already imported at the top of the notebook.
contigency = pd.crosstab(experiment_data['group'], experiment_data['converted'])
contigency

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,126073,17220
treatment,126372,17025


In [None]:
# Chi-squared test of independence on the group x converted table;
# correction=False turns off Yates' continuity correction.
chi2_result = chi2_contingency(contigency, correction=False)
c, p, dof, expected = chi2_result
print("p_value: ", round(p, 3))

p_value:  0.232


**Bayesian Approach**

In [None]:
# Data used to build the prior: control-group rows from week 1 only.
in_first_week = df['week'] == 1
in_control = df['group'] == 'control'
prior = df[in_first_week & in_control]

In [None]:
# Bootstrap the prior: mean conversion rate of 10000 random subsamples of
# 1000 rows each. A seeded Generator makes the fitted prior reproducible
# across kernel restarts; the Generator's state advances on every call, so
# each subsample is still different.
PRIOR_SEED = 42
prior_rng = np.random.default_rng(PRIOR_SEED)
prior_converted = prior['converted']  # sample only the column that is needed
prior_means = [
    prior_converted.sample(1000, random_state=prior_rng).mean()
    for _ in range(10000)
]

In [None]:
# First ten bootstrapped conversion-rate means.
prior_means[:10]

[0.122, 0.115, 0.101, 0.119, 0.126, 0.116, 0.113, 0.116, 0.114, 0.118]

In [None]:
# Fit a Beta distribution to the bootstrapped means. floc=0 and fscale=1 hold
# location and scale fixed, so only the two shape parameters are estimated.
prior_alpha, prior_beta, _, _ = beta.fit(prior_means, floc=0, fscale=1)

In [None]:
NUM_WEEKS = 4 # Vary number to get experiment data at weekly points in time
# Week 1 is excluded here: its control rows were used to build the prior.
experiment_data = df[(df['week'] > 1) & (df['week'] <= NUM_WEEKS)]
control = experiment_data[experiment_data['group']=='control']
treatment = experiment_data[experiment_data['group']=='treatment']

# Keep unrounded rates so the lift is not computed from already-rounded values.
control_perc_exact = control['converted'].sum() * 100 / control['converted'].count()
treatment_perc_exact = treatment['converted'].sum() * 100 / treatment['converted'].count()
control_conversion_perc = round(control_perc_exact, 3)
treatment_conversion_perc = round(treatment_perc_exact, 3)

# Relative lift in percent. The value is printed with a '%' sign, so the
# fraction must be scaled by 100 (previously -1.2% was shown as -0.012%).
lift = round((treatment_perc_exact - control_perc_exact) / control_perc_exact * 100, 3)

print(f"Treatment Conversion Rate: {treatment_conversion_perc}%")
print(f"Control Conversion Rate: {control_conversion_perc}%")
print(f"Lift = {lift}%")

Treatment Conversion Rate: 11.909%
Control Conversion Rate: 12.058%
Lift = -0.012%


In [None]:
# Successes and failures per arm, used for the Beta posterior update.
control_total = control['converted'].count()
treatment_total = treatment['converted'].count()

control_converted = control['converted'].sum()
treatment_converted = treatment['converted'].sum()

control_non_converted = control_total - control_converted
treatment_non_converted = treatment_total - treatment_converted

In [None]:
# Update Prior parameters with experiment conversion counts
# (Beta prior + binomial data -> Beta posterior: add successes to alpha,
# failures to beta).
posterior_control = beta(prior_alpha + control_converted, prior_beta + control_non_converted)
posterior_treatment = beta(prior_alpha + treatment_converted, prior_beta + treatment_non_converted)

# Sample from Posteriors with a seeded Generator so the reported probability
# is reproducible across kernel restarts.
posterior_rng = np.random.default_rng(42)
control_samples = posterior_control.rvs(1000, random_state=posterior_rng)
treatment_samples = posterior_treatment.rvs(1000, random_state=posterior_rng)

# Share of paired draws in which treatment beats control.
probability = np.mean(treatment_samples > control_samples)
# One decimal matches the resolution of 1000 draws and avoids float
# artifacts such as 15.100000000000001.
print(f"Probability that treatment > control: {probability * 100:.1f}%")



Probability that treatment > control: 15.2%


In [None]:
# Inspect the draws taken from the control posterior.
control_samples