In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chi2_contingency, beta, binom
from statsmodels.stats.proportion import proportions_ztest
from matplotlib.lines import Line2D
import plotly.graph_objs as go


In [6]:
# Read the raw A/B-test export into a DataFrame (path is relative to the notebook)
data_frame = pd.read_csv(filepath_or_buffer='../data/AdSmartABdata.csv')

Load the AdSmartABdata dataset from the `data` folder.

In [7]:
# Keep only the rows where either the `yes` or the `no` flag is set
data = data_frame.query('yes == 1 | no == 1')

# Split the remaining rows by experiment arm
control = data.query('experiment == "control"')
exposed = data.query('experiment == "exposed"')

# Report the number of rows in each arm
print('The exposed group = {} observations'.format(len(exposed)))
print('The control group = {} observations'.format(len(control)))

# Mean of the `yes` flag per arm, rendered as a percentage with two decimals
exposed_mean = '{0:.2f}%'.format(100 * exposed['yes'].mean())
control_mean = '{0:.2f}%'.format(100 * control['yes'].mean())
print('The conversion rate of exposed group = {}'.format(exposed_mean))
print('The conversion rate of control group = {}'.format(control_mean))

The exposed group = 657 observations
The control group = 586 observations
The conversion rate of exposed group = 46.88%
The conversion rate of control group = 45.05%


In [8]:
# Exposed-arm mean of `yes` minus control-arm mean of `yes`
difference = exposed['yes'].mean() - control['yes'].mean()
# Same value rendered as a percentage string with two decimals
difference_form = '{0:.2f}%'.format(100 * difference)
print('Their difference is {}'.format(difference_form))

Their difference is 1.83%


In [9]:
# Rows in each arm where the `yes` flag is set (the converted users)
exposed_converted = exposed.query('yes == 1')
control_converted = control.query('yes == 1')

# Total number of rows in each arm
print('The sample sizes of the exposed group is {}'.format(exposed.shape[0]))
print('The sample sizes of the controled group is {}'.format(control.shape[0]))

# The values printed here are row counts of converted users, not rates,
# so the message labels them as counts.
print('The number of conversions for the exposed group and control group is {} and {} respectively'.format(
    exposed_converted.shape[0], control_converted.shape[0]))

The sample sizes of the exposed group is 657
The sample sizes of the controled group is 586
The conversion rates for the exposed group and control group is 308 and 264 respectively


In [10]:
# NOTE(review): this cell appears to repeat the preceding cell verbatim
# (same inputs, same names, same prints) — consider removing one of them.

# Rows in each arm where the `yes` flag is set (the converted users)
exposed_converted = exposed.query('yes == 1')
control_converted = control.query('yes == 1')

# Total number of rows in each arm
print('The sample sizes of the exposed group is {}'.format(exposed.shape[0]))
print('The sample sizes of the controled group is {}'.format(control.shape[0]))

# The values printed here are row counts of converted users, not rates,
# so the message labels them as counts.
print('The number of conversions for the exposed group and control group is {} and {} respectively'.format(
    exposed_converted.shape[0], control_converted.shape[0]))

The sample sizes of the exposed group is 657
The sample sizes of the controled group is 586
The conversion rates for the exposed group and control group is 308 and 264 respectively


In [11]:
# Two-proportion z-test on the `yes` counts of the two arms.
# Converted-row counts per arm, exposed first and control second
count = np.array([len(exposed_converted), len(control_converted)])
# Total rows per arm, in the same order as `count`
sample = np.array([len(exposed), len(control)])
# One-sided test: alternative='larger' tests whether the first proportion
# (exposed) is larger than the second (control)
stat, p_val = proportions_ztest(count=count, nobs=sample, alternative='larger')
# Render the p-value as a percentage string with two decimals
p_value = '{0:.2f}%'.format(100 * p_val)
print('P-value = {}'.format(p_value))

P-value = 25.92%


In [12]:
# Use a dedicated, seeded generator so the simulated distributions (and every
# figure or number derived from them) are identical on each Restart & Run All.
SEED = 42
N_SIMULATIONS = 100000  # number of simulated experiments per arm
rng = np.random.default_rng(SEED)

# Simulate the conversion proportion of each arm: draw binomial success counts
# using the arm's size and observed mean of `yes`, then divide by the arm size.
exposed_simulation = rng.binomial(exposed.shape[0], exposed.yes.mean(), N_SIMULATIONS) / exposed.shape[0]
control_simulation = rng.binomial(control.shape[0], control.yes.mean(), N_SIMULATIONS) / control.shape[0]

# Simulated difference in proportions (exposed minus control)
p_diffs = exposed_simulation - control_simulation
diffs = np.array(p_diffs)

# Normal distribution centered at zero with the same spread as the simulated
# differences, and the same number of draws
null_vals = rng.normal(0, np.std(diffs), len(diffs))