In [1]:
# imports
import os
import sys
import dvc.api
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
# make the project helper modules importable; this must happen before the
# local imports below, which are resolved through sys.path
sys.path.append('.')
sys.path.insert(1, '../scripts/')

import defaults as defs
import dataCleaner as dc
import dataVisualizer as dv
import abTestHelper as th

# build the helper objects; the same label string is passed to each constructor
notebook_label = 'classical hypothesis testing notebook'
cleaner = dc.dataCleaner(notebook_label)
visualizer = dv.dataVisualizer(notebook_label)
abTestHelper = th.abTestHelper(notebook_label)

logger <Logger dataCleaner (INFO)> created at path: ../logs/cleaner_root.log
Data cleaner in action
logger <Logger dataVisualizer (INFO)> created at path: ../logs/visualizer_root.log
Data visualizer in action
logger <Logger abTestHelper (INFO)> created at path: ../logs/hypothesis_test_root.log
Hypothesis test helper in action


In [3]:
# Load the experiment data from a pinned DVC revision so the analysis is reproducible.
version = 'v1'

# resolve the storage URL of the tracked file for that revision
data_url = dvc.api.get_url(path=defs.path, repo=defs.repo, rev=version)

# tokens that pandas should parse as missing values (each sentinel listed once;
# the original list repeated 'undefined')
missing_values = ["n/a", "na", "undefined", '?', 'NA']
df = pd.read_csv(data_url, na_values=missing_values)
df

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,Samsung Internet,0,0


# Classical AB testing

## Metric choice

### Invariant metrics - sanity checks

These metrics are not supposed to change dramatically whether or not users were shown different types of ads. Since no data is provided about the baseline performance of the SmartAd traffic, we could not evaluate many types of invariant metrics (sanity checks).

These metrics are going to be evaluated on the total experiment data set, i.e. on all 8077 records.

1. Number of page views for the exposed and control groups.

* This difference is not supposed to be large enough to affect the insights that are going to be evaluated. We will later check that the difference between the group sizes is not significant, i.e. that the split is random and even, as we expected.

2. Number of BIO participants for the exposed and control groups.

* This difference should not be significant enough to affect the insights that are going to be evaluated. We will later check that the difference in participation is not significant, i.e. that it is random and even, as we expect.

### Evaluation metrics - performance indicators

These metrics are supposed to change, given the hypothesis test, based on the different ads people were shown.

These metrics are going to be evaluated only on the BIO participants, because brand awareness was determined by the BIO question.

1. The main performance indicator we are going to use is the increase in brand awareness in the exposed group compared to the control group due to the altered ads.

* For the purpose of demonstration I am going to use a d_min = 0.01. At least a 1% increase in awareness in the exposed group needs to be observed in order to pass the practical significance threshold of the business.

* Stating that any increase in awareness that is under 1%, even if statistically significant, is not practical to the business.

## Analyzing data

In [4]:
# keep only the records where the BIO questionnaire was answered,
# i.e. drop every row in which both `yes` and `no` are 0
has_response = (df['yes'] != 0) | (df['no'] != 0)
answered_data = df[has_response]
answered_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1243 entries, 2 to 8071
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   auction_id   1243 non-null   object
 1   experiment   1243 non-null   object
 2   date         1243 non-null   object
 3   hour         1243 non-null   int64 
 4   device_make  1243 non-null   object
 5   platform_os  1243 non-null   int64 
 6   browser      1243 non-null   object
 7   yes          1243 non-null   int64 
 8   no           1243 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 97.1+ KB


* Drop all the BIO non participants

In [5]:
# Merge the yes/no answers into a single binary `aware` flag per auction:
# 1 when the participant answered yes, 0 otherwise.
# The flag is built with .assign() on a new frame instead of inserting into and
# writing through .loc on a column subset of `answered_data`; the latter can raise
# SettingWithCopyWarning (silenced here by the global warnings filter) and set
# the value 0 twice.
cleaned_data = (
    answered_data[['auction_id', 'experiment', 'yes']]
    .assign(aware=lambda responses: responses['yes'].eq(1).astype('int64'))
    .drop(columns='yes')
    .set_index('auction_id')
)
cleaned_data

Unnamed: 0_level_0,experiment,aware
auction_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,0
008aafdf-deef-4482-8fec-d98e3da054da,exposed,1
00a1384a-5118-4d1b-925b-6cdada50318d,exposed,0
00b6fadb-10bd-49e3-a778-290da82f7a8d,control,1
00ebf4a8-060f-4b99-93ac-c62724399483,control,0
...,...,...
ffa08ff9-a132-4051-aef5-01a9c79367bc,exposed,1
ffb176df-ecd2-45d3-b05f-05b173a093a7,exposed,1
ffb79718-6f25-4896-b6b3-e58b80a6e147,control,0
ffca1153-c182-4f32-9e90-2a6008417497,control,0


In [6]:
# Summarise awareness per experiment group in one pass over the data:
#   aware          - number of participants who answered yes
#   total          - number of BIO participants in the group
#   awareness rate - aware / total (mean of the 0/1 flag)
# Aggregating the `aware` column explicitly avoids assigning whole one-column
# pivot tables to single columns, which only works while `cleaned_data` has
# exactly one value column.
awareness_by_group = cleaned_data.groupby('experiment')['aware']
grouped_data = awareness_by_group.agg(**{
    'aware': 'sum',
    'total': 'count',
    'awareness rate': 'mean',
})
grouped_data['not aware'] = grouped_data['total'] - grouped_data['aware']
grouped_data = grouped_data[['aware', 'not aware', 'total', 'awareness rate']]
grouped_data

Unnamed: 0_level_0,aware,not aware,total,awareness rate
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,264,322,586,0.450512
exposed,308,349,657,0.468798


## Invariant metrics - sanity checks

### 1. Number of page views (on the total data set (8077 records))

In [7]:
# page views (rows) per experiment group over the complete data set
experiment_counts = df['experiment'].value_counts()
total_control_pv = experiment_counts['control']
total_exposed_pv = experiment_counts['exposed']
complete_total_pv = total_exposed_pv + total_control_pv

print(f"number of total page views in control: {total_control_pv}")
print(f"number of total Page views in experiment: {total_exposed_pv}")
print(f"number of total page views: {complete_total_pv}")

number of total page views in control: 4071
number of total Page views in experiment: 4006
number of total page views: 8077


In [8]:
# Sanity check: is the control group's share of page views consistent with
# the expected 50/50 split between control and exposed?
p = 0.5     # we are expecting the amount of people in the exposed and control groups probability to be 50%
alpha = 0.05

# unrounded quantities drive the decision, so rounding cannot flip a borderline result
control_share = total_control_pv / complete_total_pv
# get_std / get_z_score are project helpers; presumably the standard deviation of a
# proportion and the normal quantile - confirm in scripts/abTestHelper
sd = abTestHelper.get_std(p, complete_total_pv)
pv_margin = abTestHelper.get_z_score(1 - (alpha / 2)) * sd
share_within_interval = p - pv_margin < control_share < p + pv_margin

# rounded copies are for display only
p_hat = round(control_share, 4)
ME = round(pv_margin, 4)
print("The confidence interval is between", round(p - ME, 4), "and", round(p + ME, 4), "; Is", p_hat, "inside this range?", share_within_interval)

The confidence interval is between 0.4891 and 0.5109 ; Is 0.504 inside this range? True


### 2. Number of BIO participants (on the total data set (8077 records))

In [9]:
# Count BIO responses per experiment group; every participant has exactly one
# of `yes` / `no` set, so their sum is the number of participants.
# Only the two answer columns are aggregated: summing the whole frame also
# aggregates unrelated columns and, on pandas >= 2.0, the string columns as well.
bio_participants_df = df.groupby('experiment')[['yes', 'no']].sum()
bio_participants_df['total'] = bio_participants_df['yes'] + bio_participants_df['no']

# participants and page views per group, used for the participation rates
bio_participants_control = bio_participants_df['total']['control']
bio_participants_control_pv = total_control_pv

bio_participants_exposed = bio_participants_df['total']['exposed']
bio_participants_exposed_pv = total_exposed_pv
print(f"number of BIO participants: {bio_participants_df['total']}\nTotal participants: {bio_participants_df['total'].sum()}")

number of BIO participants: experiment
control    586
exposed    657
Name: total, dtype: int64
Total participants: 1243


In [10]:
# share of each group's page views that answered the BIO questionnaire,
# rounded to 4 decimals
rate_of_participation_control, rate_of_participation_exposed = (
    round(answered / viewed, 4)
    for answered, viewed in (
        (bio_participants_control, bio_participants_control_pv),
        (bio_participants_exposed, bio_participants_exposed_pv),
    )
)
print(
    f'rate of bio participation - control: {rate_of_participation_control}\n'
    f'rate of bio participation - exposed: {rate_of_participation_exposed}'
)


rate of bio participation - control: 0.1439
rate of bio participation - exposed: 0.164


* In this case, we want to make sure the proportion of BIO is about the same in both groups (since this was not expected to change due to the experiment).

* The changes we should notice are for the calculation of the standard error - which in this case is a pooled standard error.

In [11]:
# Sanity check: is the gap in BIO participation between the exposed and the
# control group compatible with zero?
# The gap is computed from the raw counts rather than from the already rounded
# rates, and the decision uses unrounded values, so rounding cannot change the result.
participation_gap = (bio_participants_exposed / bio_participants_exposed_pv
                     - bio_participants_control / bio_participants_control_pv)
BIO_participants_total = bio_participants_df['total'].sum()
p_pooled = BIO_participants_total / complete_total_pv
# get_pooled_std / get_z_score are project helpers; presumably the pooled standard
# error of the difference and the normal quantile - confirm in scripts/abTestHelper
sd_pooled = abTestHelper.get_pooled_std(p_pooled, bio_participants_control_pv, bio_participants_exposed_pv)
bio_margin = abTestHelper.get_z_score(1 - (alpha / 2)) * sd_pooled
gap_within_interval = -bio_margin < participation_gap < bio_margin

# rounded copies are for display only (d_hat is reused in the next cell)
d_hat = round(participation_gap, 4)
ME = round(bio_margin, 4)
print("The confidence interval is between", -ME, "and", ME, "; Is", d_hat, "within this range?", gap_within_interval)

The confidence interval is between -0.0157 and 0.0157 ; Is 0.0201 within this range? False


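* As we can see, the observed difference falls outside the confidence interval, so this sanity check did not pass: the difference in BIO participation between the groups is statistically significant, and there might be something else to investigate.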

In [12]:
# fixed two-decimal formatting: a bare `d_hat*100` can print binary floating-point noise
print(f"There seems to be around a {d_hat * 100:.2f}% increase in BIO engagement in the exposed groups than the control ones.")

There seems to be around a 2.01% increase in BIO engagement in the exposed groups than the control ones.


* We could say we, on average, hit around 15% BIO engagement mark for both the control and the exposed groups.
* We also could say there was around a 2% increase in the BIO engagement in the exposed group than the control one.
* Nevertheless, this difference lies outside the confidence interval, i.e. it is statistically significant, so the invariant (sanity) check is not passed.

## Evaluation metrics - performance indicators

### 1. Increase in awareness (on the BIO participants (1243 records))

In [13]:
# show the per-group awareness summary table again for reference
grouped_data

Unnamed: 0_level_0,aware,not aware,total,awareness rate
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,264,322,586,0.450512
exposed,308,349,657,0.468798


In [14]:
# headline numbers for the awareness comparison between the two groups
print(f"BIO participants in control: {bio_participants_control}")
print(f"BIO participants in experiment: {bio_participants_exposed}")
print(f"Total BIO participants: {bio_participants_control+bio_participants_exposed}\n")

print(f"Brand awareness rate in control: {round(grouped_data['awareness rate']['control'], 4)}")
print(f"Brand awareness rate in experiment: {round(grouped_data['awareness rate']['exposed'], 4)}\n")

# observed lift in awareness rate (exposed minus control), rounded to 4 decimals
awareness_diff = round(grouped_data['awareness rate']['exposed'] - grouped_data['awareness rate']['control'], 4)
# fixed two-decimal formatting: a bare `awareness_diff * 100` can print binary floating-point noise
print(f"There seems to be around a {awareness_diff * 100:.2f}% increase in brand awareness in the exposed groups than the control ones.")

BIO participants in control: 586
BIO participants in experiment: 657
Total BIO participants: 1243

Brand awareness rate in control: 0.4505
Brand awareness rate in experiment: 0.4688

There seems to be around a 1.83% increase in brand awareness in the exposed groups than the control ones.


* This will pass the practical significance threshold we set earlier (1%).

* Next we will check if this rate in brand awareness is statistically significant.

In [16]:
# Confidence interval for the observed lift in brand awareness.
d_min = 0.01  # practical significance threshold chosen in the "Metric choice" section

# pooled awareness rate over both groups
awareness_pooled = ((grouped_data['aware']['control'] + grouped_data['aware']['exposed'])
                    / (grouped_data['total']['control'] + grouped_data['total']['exposed']))

# get_pooled_std / get_z_score are project helpers; presumably the pooled standard
# error of the difference and the normal quantile - confirm in scripts/abTestHelper
awareness_sd_pooled = abTestHelper.get_pooled_std(awareness_pooled, grouped_data['total']['control'], grouped_data['total']['exposed'])
awareness_ME = round(abTestHelper.get_z_score(1-alpha/2) * awareness_sd_pooled,4)

# round the bounds for display so float arithmetic noise does not leak into the output
ci_lower = round(awareness_diff - awareness_ME, 4)
ci_upper = round(awareness_diff + awareness_ME, 4)

print("The change due to the experiment is", round(awareness_diff * 100, 2), "%")
print("Confidence Interval: [", ci_lower, ",", ci_upper, "]")
# the original passed `{0.01}`, a set literal, which printed as "{0.01}"
print("The change is statistically significant if the CI doesn't include 0.\nIn that case, it is practically significant if", d_min, "is not in the CI as well.")


The change due to the experiment is 1.83 %
Confidence Interval: [ -0.0372 , 0.0738 ]
The change is statistically significant if the CI doesn't include 0.
In that case, it is practically significant if {0.01} is not in the CI as well.


* This indicates that the observed increase in brand awareness is around 1.83 percentage points.

* This is a moderate increase, but because the confidence interval includes 0 it is not statistically significant, and therefore it cannot be considered practically significant either.