In [38]:
## using dataset from TotallyMoney
import pandas as pd
import numpy as np
from scipy.stats import norm
import statsmodels.stats.proportion as ssp
import scipy.stats as stats
from plotly import graph_objects as go

import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)

# Load the four raw CSV extracts.
# The directory is defined once so the notebook can be pointed at another
# location by editing a single line instead of four separate path strings.
from pathlib import Path

DATA_DIR = Path('/Users/emilee/Documents/TotallyMoney_Data_Scientist_Technical_Test/data')

visits = pd.read_csv(DATA_DIR / 'visits.csv')
variations = pd.read_csv(DATA_DIR / 'variations.csv')
conversions = pd.read_csv(DATA_DIR / 'test_conversions.csv')
email = pd.read_csv(DATA_DIR / 'email_engagement.csv')

In [39]:
## For A/B testing: outer-join visits, variations and test conversions on user_id
visits_with_variation = visits.merge(variations, on='user_id', how='outer')
ABTest_data = visits_with_variation.merge(conversions, on='user_id', how='outer')

## Data cleaning: anything that is not exactly 1 in `converted` (including the
## nulls left by the join for users who did not convert) becomes 0
ABTest_data['converted'] = ABTest_data['converted'].eq(1).astype('int64')

In [40]:
# Preview the first five rows of the merged frame
ABTest_data.head(n=5)

Unnamed: 0,user_id,visit_time,channel,age,income,gender,variation,converted
0,4681,2018-01-06 15:00:00,Facebook,45.0,44000.0,female,control,1
1,9052,2018-08-23 12:00:00,Facebook,,35000.0,female,treatment,0
2,9579,2018-07-28 12:00:00,Facebook,41.0,42000.0,male,treatment,0
3,2601,2018-12-17 00:00:00,PPC,35.0,34000.0,male,treatment,0
4,7136,2018-09-27 21:00:00,PPC,,32000.0,male,control,0


In [41]:
# Keep only the columns used for the A/B comparison
ab_columns = ['user_id', 'variation', 'converted']
ABdata = ABTest_data.loc[:, ab_columns]

In [42]:
# Preview the first five rows of the reduced A/B frame
ABdata.head(n=5)

Unnamed: 0,user_id,variation,converted
0,4681,control,1
1,9052,treatment,0
2,9579,treatment,0
3,2601,treatment,0
4,7136,control,0


In [43]:
# Write the reduced A/B frame to disk without the pandas row index
abdata_csv_path = '/Users/emilee/Documents/TotallyMoney_Data_Scientist_Technical_Test/data/ABdata.csv'
ABdata.to_csv(abdata_csv_path, index=False)

In [None]:
# Report how many visit rows there are and how many distinct users they cover.
# (A bare `visits.shape` ahead of the print was never displayed, because a
# notebook cell only renders its final expression, so it has been removed.)
n_visits = len(visits.index)
n_unique_users = visits['user_id'].nunique()
print("There is data for", n_visits, "visits", "with", n_unique_users, "unique id")

In [None]:
# Dataset has 10,000 observations in it 

Normally you would do a fair amount of exploratory and univariate data analysis before the A/B testing, but to keep things concise we are moving straight on to the A/B testing part.

In [None]:
## Create summary table with totals, conversions and proportions
pd.options.display.float_format = "{:.2f}".format

# One grouped pass per variation:
#   converted - sum of the 0/1 `converted` flag (number of conversions)
#   total     - number of rows in the group ('size' counts every row, like len)
#   perc      - conversions divided by rows
# String aggregator names are used instead of np.sum, which recent pandas
# versions warn about when it is passed as an aggregation callable.
summary = (
    ABTest_data
    .groupby('variation')
    .agg(converted=('converted', 'sum'), total=('converted', 'size'))
    .assign(perc=lambda tbl: tbl['converted'] / tbl['total'])
)
summary

We have 4,976 users in our control group and 5,024 in our treatment group.
Of those, 999 converted in the control group (20%) and 1,211 converted in the treatment group (24%).

**Testing to see if there is a significant difference** 

In [None]:
# Pull the per-variation conversion counts and group sizes out of the summary
# table as plain NumPy arrays (same row order as `summary`)
converted = summary['converted'].to_numpy(copy=True)
group_totals = summary['total'].to_numpy(copy=True)

In [None]:
## Calculate the difference between the two groups and look at its 95% CI
significance = 0.05
confidence = 1 - significance
# Two-sided critical value: the (1 - significance/2) quantile of the standard normal
z = stats.norm.ppf(1 - significance / 2)

# `converted` and `group_totals` follow the row order of `summary`.
# NOTE(review): this assumes position 0 is 'control' and position 1 is
# 'treatment' (alphabetical index order) - confirm against `summary`.
# Group a = position 0, group b = position 1. Each proportion is now paired
# with its OWN group's sample size; previously the successes/sizes were wired
# crosswise, so each variance term was divided by the other group's size.
success_a, success_b = converted[0], converted[1]
size_a, size_b = group_totals[0], group_totals[1]

prop_a = success_a / size_a
prop_b = success_b / size_b

# Unpooled variance of the difference between two independent proportions
var = prop_a * (1 - prop_a) / size_a + prop_b * (1 - prop_b) / size_b
se = np.sqrt(var)

# standard formula for the confidence interval: point-estimate +- z * standard-error
prop_diff = prop_b - prop_a
confint = prop_diff + np.array([-1, 1]) * z * se

print('estimate difference:', prop_diff)
print('confidence interval:', confint)

In [None]:
# Two-sample z-test on the conversion counts and group sizes to check whether
# the difference in proportions is statistically significant
ztest_result = ssp.proportions_ztest(count=converted, nobs=group_totals)
stat, pval = ztest_result
print('p value: ', pval)
print(f'formated p value: {pval:0.3f}')