In [16]:
# Packages imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
import scipy.stats
import math
booking = pd.read_csv("AB_test_data.csv")

### Group with Cindy, Linda, Fanbi, Yining

In [3]:
booking.columns

Index(['Variant', 'date', 'id', 'purchase_TF'], dtype='object')

In [4]:
session_counts = booking['id'].value_counts(ascending=False)
multi_users = session_counts[session_counts > 1].count()
print(f'There are {multi_users} users that appear multiple times in the dataset')

There are 0 users that appear multiple times in the dataset


### 1. Conduct an A/B test to determine whether Alternative B improved conversion rates (site users book the property) over alternative A.

In [5]:
# Split data into group A and group B
control = booking.loc[booking["Variant"] == "A"]
treat = booking.loc[booking["Variant"] == "B"]

# 
control_true = control["purchase_TF"] == True
treat_true = treat["purchase_TF"] == True

# number of control = true and treat = true
n_control_true = sum(control_true)
n_treat_true = sum(treat_true)

In [6]:
n_control = control_true.count()
n_treat = treat_true.count()
successes = [n_control_true, n_treat_true]
nobs = [n_control, n_treat]

In [7]:
z_stat, pval = proportions_ztest(successes, nobs=nobs,alternative="smaller")
print(f'p-value: {pval:.10f}') 

p-value: 0.0000000845


### The p-value is 0.0000000845. At alpha level = 0.05, we reject null hypothesis. We cconclude that alternative B improved conversion rate

### 2. Calculate the optimal sample size for a 95% confidence rate and test with 80% power. Conduct the test 10 times using samples of the optimal size. Report results.

In [8]:
from IPython.display import display, Math, Latex
display(Math( r'n = (t_{\alpha/2} * \sqrt{2p(p-1)} + t_{\beta} * \sqrt{p_{0}(1-p_{0}) + p_{1}(1-p_{1})})^2 * 1/{\delta}^2 '  ))
#display(Math(r'n=\frac{2 * p * (1 - p) * (Z_{\alpha / 2} + Z_{\beta})^2}{(p_B + p_a)^2}'))
             
                     

<IPython.core.display.Math object>

In [9]:
t_a_2 = scipy.stats.t.ppf(q=.05/2,df=(booking.shape[0]-1))
t_b = scipy.stats.t.ppf(q=0.2,df=(booking.shape[0]-1))
#1- β is the selected power (0.8)

p0 = n_control_true / control.shape[0]
p1 = n_treat_true / treat.shape[0]
p = (p0+p1)/2 # https://www.nber.org/system/files/working_papers/w15701/w15701.pdf

In [10]:
optimal_size = ( -t_a_2 * ( 2 * p * (1-p) )**0.5 + ( -t_b * ( p0 * (1-p0) + p1 * (1-p1) )** 0.5 ) )**2 * (1 / (p0-p1)**2)
# delta = p0-p1
print("The optimial size of each group is",optimal_size )

The optimial size of each group is 2941.7255370018324


In [11]:
# https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f
# minimum size
#optimal_size = 2*p*(1-p)*(t_a_2+t_b)**2/(p1-p0)**2
#print("The optimial size of each group is",optimal_size )

In [11]:
#sdt = (p*(1-p)/booking.shape[0])**0.5

In [12]:
# Randomly sample 2943 samples in each group
from random import sample
import numpy as np

for i in range(10,20):

    control_opt = control.sample( int(np.ceil(optimal_size)),random_state=i ) #np.ceilwill round up the number
    treat_opt = treat.sample( int(np.ceil(optimal_size)),random_state=i ) 
    
    control_opt_true = control_opt["purchase_TF"] == True
    treat_opt_true = treat_opt["purchase_TF"] == True
    
    n_control_opt_true = sum(control_opt_true)
    n_treat_opt_true = sum(treat_opt_true)
    
    successes = [n_control_opt_true, n_treat_opt_true]
    pval = proportions_ztest(successes, nobs = int(np.ceil(optimal_size))*2,alternative="smaller")[1]
    print(pval)

0.02947087793433457
0.012811800827857371
0.013434916142272994
0.01974712246575
0.017143336250841672
0.06013882864599316
0.00019577907377768831
0.01594092779335723
1.9248012988440275e-05
0.00013817198905973678


### 3. Conduct a sequential test for the 10 samples. For any of the samples, were you able to stop the test prior to using the full sample? What was the average number of iterations required to stop the test?

In [18]:

a_bound = math.log(1/0.05)
b_bound = math.log(0.2)

boundary_test = 0
iteration = 0
p0 = n_control_true / control.shape[0]
p1 = n_treat_true / treat.shape[0]
sum_iter = 0

for i in range(10,20):
    boundary_test = 0
    iteration = 0
    for j in range( int(np.ceil(optimal_size)) ):
        sample = treat.sample( int(np.ceil(optimal_size)),random_state=i ).reset_index()
        if sample.loc[j,"purchase_TF"] == True:
            boundary_test = boundary_test + math.log(p1/p0)
            iteration = iteration + 1
            #print(boundary_test)
        else:
            boundary_test = boundary_test + math.log((1-p1)/(1-p0))
            iteration = iteration + 1
            #print(boundary_test)
        if boundary_test >=  a_bound or boundary_test <= b_bound:
            break
    sum_iter = sum_iter + iteration
    print(iteration)
    print(boundary_test)

1028
3.096407695880565
673
3.0561548035561894
199
3.09000194212208
272
-1.6406831169556917
1441
3.0489488373609697
1495
3.0902206978748916
679
3.0607405658355105
286
3.057464905514156
500
3.088982973932549
398
3.0110250151840394


In [21]:
# Average iteration
print(sum_iter/10)

697.1


### Conclusion: On Average, it requires 698 (697.1) iterations to stop the test. In our 10 samples, the test stops prior to using the full sample. 9 out of 10 trials reached the upper boundary, and we would like to accept H1 and reject the Ho. In only 1 sample, we failed to reject Ho.