# AD CAMPAIGN PERFORMANCE

## Business Objective

The main objective of this project is to test whether the ads run by the advertising company resulted in a significant lift in brand awareness.


## Project workflow


*   Setting up an A/B testing framework
*   Validating the data
*   Performing A/B testing with classical, sequential and machine learning methods
*   Extracting statistically valid insights in relation to the business objective



In [None]:
#Importing libraries that we will be using
%matplotlib inline
!pip install -U pandas-profiling

import statsmodels.api as sm
import numpy as np # linear algebra
import scipy.stats as stats
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from tabulate import tabulate
import pandas_profiling
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings(action="ignore")

from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.ensemble import RandomForestRegressor

In [43]:
# Location of the raw A/B-test export. The default points at a Google Drive
# folder as mounted in Colab; change this single constant to run elsewhere.
# NOTE(review): no drive-mount step is visible in this part of the notebook -
# confirm the drive is mounted before this cell runs.
DATA_PATH = '/content/drive/My Drive/ABAdRecall.csv'

# Load the experiment records into a DataFrame
bio_data = pd.read_csv(DATA_PATH)

In [44]:
# Preview the first rows of the loaded data
bio_data.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


## BIO Data Exploration

In [45]:
bio_data.info()  # summary of column dtypes, non-null counts and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8077 entries, 0 to 8076
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   auction_id   8077 non-null   object
 1   experiment   8077 non-null   object
 2   date         8077 non-null   object
 3   hour         8077 non-null   int64 
 4   device_make  8077 non-null   object
 5   platform_os  8077 non-null   int64 
 6   browser      8077 non-null   object
 7   yes          8077 non-null   int64 
 8   no           8077 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 568.0+ KB


In [46]:
# List the column names of the loaded frame
bio_data.columns

Index(['auction_id', 'experiment', 'date', 'hour', 'device_make',
       'platform_os', 'browser', 'yes', 'no'],
      dtype='object')

In [47]:
# Number of distinct auction IDs; compare with the row count reported by info()
# to check whether any auction_id appears more than once
bio_data['auction_id'].nunique()

8077

In [48]:
# Build a pandas-profiling report of the dataset and export it to an HTML file
profile = pandas_profiling.ProfileReport(bio_data)
profile.to_file('profile_report.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=23.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [None]:
# Display the profiling report as the cell output
profile

In [50]:
# Number of records per date, as a flat frame with columns ['date', 'count']
date_data = bio_data.groupby('date').size().reset_index(name='count')
print(tabulate(date_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════════╤═════════╕
│    │ date       │   count │
╞════╪════════════╪═════════╡
│  0 │ 2020-07-03 │    2015 │
├────┼────────────┼─────────┤
│  1 │ 2020-07-04 │     903 │
├────┼────────────┼─────────┤
│  2 │ 2020-07-05 │     890 │
├────┼────────────┼─────────┤
│  3 │ 2020-07-06 │     490 │
├────┼────────────┼─────────┤
│  4 │ 2020-07-07 │     480 │
╘════╧════════════╧═════════╛


In [51]:
# Distribution of records over the experiment dates; titled so the figure
# can be read on its own when the notebook is skimmed
fig = px.histogram(bio_data, x="date", title="Number of records per date")
fig.show()

In [52]:
# Number of records per hour of the day, as columns ['hour', 'count']
hour_data = bio_data.groupby('hour').size().reset_index(name='count')
print(tabulate(hour_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════╤═════════╕
│    │   hour │   count │
╞════╪════════╪═════════╡
│  0 │      0 │     194 │
├────┼────────┼─────────┤
│  1 │      1 │     222 │
├────┼────────┼─────────┤
│  2 │      2 │     230 │
├────┼────────┼─────────┤
│  3 │      3 │     266 │
├────┼────────┼─────────┤
│  4 │      4 │     281 │
╘════╧════════╧═════════╛


In [53]:
# Helper that prints how often each distinct value of a column occurs
def unique_count(dataframe, variable):
    """Print a table with the number of rows for each distinct value of a column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column to summarise.
    variable : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    None
        The table is printed; nothing is returned.
    """
    # reset_index has to be *called*: the earlier code stored the bound method
    # itself, so tabulate received a method object instead of a table.
    unique_values = dataframe.groupby(variable).size().reset_index(name='count')
    print('Count of unique values : ' + variable)
    print('------------------------------------------------')
    print(tabulate(unique_values, headers='keys', tablefmt='fancy_grid'))

In [54]:
#unique_count(bio_data,'browser')

In [55]:
# Distribution of records over the hour of the day; titled so the figure
# can be read on its own
fig = px.histogram(bio_data, x="hour", title="Number of records per hour of day")
fig.show()

In [56]:
# Number of records per device model, as columns ['device_make', 'count']
device_data = bio_data.groupby('device_make').size().reset_index(name='count')
print(tabulate(device_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤═══════════════╤═════════╕
│    │ device_make   │   count │
╞════╪═══════════════╪═════════╡
│  0 │ 5008Y_EEA     │       1 │
├────┼───────────────┼─────────┤
│  1 │ 5099Y         │       1 │
├────┼───────────────┼─────────┤
│  2 │ 6039Y         │       1 │
├────┼───────────────┼─────────┤
│  3 │ A0001         │       2 │
├────┼───────────────┼─────────┤
│  4 │ ALE-L21       │       1 │
╘════╧═══════════════╧═════════╛


In [57]:
# Distribution of records over device models; titled so the figure
# can be read on its own
fig = px.histogram(bio_data, x="device_make", title="Number of records per device make")
fig.show()

In [58]:
# Size of each experiment arm, as columns ['experiment', 'count']
experiment_groups = bio_data.groupby('experiment').size().reset_index(name='count')
print(tabulate(experiment_groups, headers='keys', tablefmt='fancy_grid'))

╒════╤══════════════╤═════════╕
│    │ experiment   │   count │
╞════╪══════════════╪═════════╡
│  0 │ control      │    4071 │
├────┼──────────────┼─────────┤
│  1 │ exposed      │    4006 │
╘════╧══════════════╧═════════╛


In [59]:
# Size of the two experiment arms; titled so the figure can be read on its own
fig = px.histogram(bio_data, x="experiment", title="Number of records per experiment group")
fig.show()

In [60]:
# Number of records per browser, most frequent first.
# value_counts() returns a Series, so assigning `.columns` on it never renamed
# anything; name the index and the counts explicitly so the table is
# labelled ['browser', 'count'].
browser_types = (
    bio_data['browser']
    .value_counts()
    .rename_axis('browser')
    .reset_index(name='count')
)
print(tabulate(browser_types, headers='keys', tablefmt='fancy_grid'))

╒════╤════════════════════════════╤═══════════╕
│    │ index                      │   browser │
╞════╪════════════════════════════╪═══════════╡
│  0 │ Chrome Mobile              │      4554 │
├────┼────────────────────────────┼───────────┤
│  1 │ Chrome Mobile WebView      │      1489 │
├────┼────────────────────────────┼───────────┤
│  2 │ Samsung Internet           │       824 │
├────┼────────────────────────────┼───────────┤
│  3 │ Facebook                   │       764 │
├────┼────────────────────────────┼───────────┤
│  4 │ Mobile Safari              │       337 │
├────┼────────────────────────────┼───────────┤
│  5 │ Chrome Mobile iOS          │        51 │
├────┼────────────────────────────┼───────────┤
│  6 │ Mobile Safari UI/WKWebView │        44 │
├────┼────────────────────────────┼───────────┤
│  7 │ Pinterest                  │         3 │
├────┼────────────────────────────┼───────────┤
│  8 │ Chrome                     │         3 │
├────┼────────────────────────────┼─────

In [61]:
# Distribution of records over browsers; titled so the figure
# can be read on its own
fig = px.histogram(bio_data, x="browser", title="Number of records per browser")
fig.show()

In [62]:
# Count the missing values in every column
bio_data.isna().sum()

auction_id     0
experiment     0
date           0
hour           0
device_make    0
platform_os    0
browser        0
yes            0
no             0
dtype: int64

In [63]:
# Rows belonging to the 'exposed' experiment arm
exposed_data = bio_data.loc[bio_data['experiment'].eq('exposed')]

In [64]:
# Inspect the rows of the exposed group
exposed_data

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
8,004940f5-c642-417a-8fd2-c8e5d989f358,exposed,2020-07-04,0,Generic Smartphone,6,Chrome Mobile WebView,0,0
13,006b9c6e-5f5d-4385-a811-ff20a24b30ac,exposed,2020-07-06,8,Samsung SM-G973F,6,Chrome Mobile WebView,0,0
...,...,...,...,...,...,...,...,...,...
8065,ffbc02cb-628a-4de5-87fc-5d76b7d796e5,exposed,2020-07-09,17,Generic Smartphone,6,Chrome Mobile,0,0
8067,ffc594ef-756c-4d24-a310-0d8eb4e11eb7,exposed,2020-07-05,1,Samsung SM-G950F,6,Chrome Mobile WebView,0,0
8071,ffdfdc09-48c7-4bfb-80f8-ec1eb633602b,exposed,2020-07-03,4,Generic Smartphone,6,Chrome Mobile,0,1
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0


In [75]:
# Exposed group: number of records per date, as columns ['date', 'count']
exposed_date_data = exposed_data.groupby('date').size().reset_index(name='count')
print(tabulate(exposed_date_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════════╤═════════╕
│    │ date       │   count │
╞════╪════════════╪═════════╡
│  0 │ 2020-07-03 │     470 │
├────┼────────────┼─────────┤
│  1 │ 2020-07-04 │     477 │
├────┼────────────┼─────────┤
│  2 │ 2020-07-05 │     528 │
├────┼────────────┼─────────┤
│  3 │ 2020-07-06 │     294 │
├────┼────────────┼─────────┤
│  4 │ 2020-07-07 │     257 │
╘════╧════════════╧═════════╛


In [76]:
# Exposed group: number of records per hour, as columns ['hour', 'count']
exposed_hour_data = exposed_data.groupby('hour').size().reset_index(name='count')
print(tabulate(exposed_hour_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════╤═════════╕
│    │   hour │   count │
╞════╪════════╪═════════╡
│  0 │      0 │     104 │
├────┼────────┼─────────┤
│  1 │      1 │     138 │
├────┼────────┼─────────┤
│  2 │      2 │     139 │
├────┼────────┼─────────┤
│  3 │      3 │     161 │
├────┼────────┼─────────┤
│  4 │      4 │     173 │
╘════╧════════╧═════════╛


In [77]:
# Exposed group: number of records per device model, as columns ['device_make', 'count']
exposed_device_data = exposed_data.groupby('device_make').size().reset_index(name='count')
print(tabulate(exposed_device_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤═══════════════╤═════════╕
│    │ device_make   │   count │
╞════╪═══════════════╪═════════╡
│  0 │ 5099Y         │       1 │
├────┼───────────────┼─────────┤
│  1 │ 6039Y         │       1 │
├────┼───────────────┼─────────┤
│  2 │ A0001         │       1 │
├────┼───────────────┼─────────┤
│  3 │ ALE-L21       │       1 │
├────┼───────────────┼─────────┤
│  4 │ ANE-LX1       │       9 │
╘════╧═══════════════╧═════════╛


In [78]:
# Exposed group: number of records per browser, most frequent first.
# value_counts() returns a Series, so assigning `.columns` on it never renamed
# anything; name the index and the counts explicitly so the table is
# labelled ['browser', 'count'].
exposed_browser_types = (
    exposed_data['browser']
    .value_counts()
    .rename_axis('browser')
    .reset_index(name='count')
)
print(tabulate(exposed_browser_types, headers='keys', tablefmt='fancy_grid'))

╒════╤════════════════════════════╤═══════════╕
│    │ index                      │   browser │
╞════╪════════════════════════════╪═══════════╡
│  0 │ Chrome Mobile              │      2144 │
├────┼────────────────────────────┼───────────┤
│  1 │ Chrome Mobile WebView      │      1197 │
├────┼────────────────────────────┼───────────┤
│  2 │ Samsung Internet           │       332 │
├────┼────────────────────────────┼───────────┤
│  3 │ Facebook                   │       203 │
├────┼────────────────────────────┼───────────┤
│  4 │ Mobile Safari              │        91 │
├────┼────────────────────────────┼───────────┤
│  5 │ Chrome Mobile iOS          │        17 │
├────┼────────────────────────────┼───────────┤
│  6 │ Mobile Safari UI/WKWebView │        14 │
├────┼────────────────────────────┼───────────┤
│  7 │ Opera Mobile               │         3 │
├────┼────────────────────────────┼───────────┤
│  8 │ Chrome                     │         2 │
├────┼────────────────────────────┼─────

In [65]:
# Rows belonging to the 'control' experiment arm
control_data = bio_data.loc[bio_data['experiment'].eq('control')]

In [66]:
# Inspect the rows of the control group
control_data

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
5,0027ce48-d3c6-4935-bb12-dfb5d5627857,control,2020-07-03,15,Samsung SM-G960F,6,Facebook,0,0
6,002e308b-1a07-49d6-8560-0fbcdcd71e4b,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
7,00393fb9-ca32-40c0-bfcb-1bd83f319820,control,2020-07-09,5,Samsung SM-G973F,6,Facebook,0,0
...,...,...,...,...,...,...,...,...,...
8069,ffca1153-c182-4f32-9e90-2a6008417497,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0,1
8070,ffcea781-a6e7-4f98-9d90-f95377270476,control,2020-07-03,15,Samsung SM-N976B,6,Facebook,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0


In [79]:
# Control group: number of records per date, as columns ['date', 'count']
control_date_data = control_data.groupby('date').size().reset_index(name='count')
print(tabulate(control_date_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════════╤═════════╕
│    │ date       │   count │
╞════╪════════════╪═════════╡
│  0 │ 2020-07-03 │    1545 │
├────┼────────────┼─────────┤
│  1 │ 2020-07-04 │     426 │
├────┼────────────┼─────────┤
│  2 │ 2020-07-05 │     362 │
├────┼────────────┼─────────┤
│  3 │ 2020-07-06 │     196 │
├────┼────────────┼─────────┤
│  4 │ 2020-07-07 │     223 │
╘════╧════════════╧═════════╛


In [80]:
# Control group: number of records per hour, as columns ['hour', 'count']
control_hour_data = control_data.groupby('hour').size().reset_index(name='count')
print(tabulate(control_hour_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤════════╤═════════╕
│    │   hour │   count │
╞════╪════════╪═════════╡
│  0 │      0 │      90 │
├────┼────────┼─────────┤
│  1 │      1 │      84 │
├────┼────────┼─────────┤
│  2 │      2 │      91 │
├────┼────────┼─────────┤
│  3 │      3 │     105 │
├────┼────────┼─────────┤
│  4 │      4 │     108 │
╘════╧════════╧═════════╛


In [81]:
# Control group: number of records per device model, as columns ['device_make', 'count']
control_device_data = control_data.groupby('device_make').size().reset_index(name='count')
print(tabulate(control_device_data.head(), headers='keys', tablefmt='fancy_grid'))

╒════╤═══════════════╤═════════╕
│    │ device_make   │   count │
╞════╪═══════════════╪═════════╡
│  0 │ 5008Y_EEA     │       1 │
├────┼───────────────┼─────────┤
│  1 │ A0001         │       1 │
├────┼───────────────┼─────────┤
│  2 │ ANE-LX1       │       9 │
├────┼───────────────┼─────────┤
│  3 │ ATU-L11       │       1 │
├────┼───────────────┼─────────┤
│  4 │ BBF100-1      │       1 │
╘════╧═══════════════╧═════════╛


In [82]:
# Control group: number of records per browser, most frequent first.
# value_counts() returns a Series, so assigning `.columns` on it never renamed
# anything; name the index and the counts explicitly so the table is
# labelled ['browser', 'count'].
control_browser_types = (
    control_data['browser']
    .value_counts()
    .rename_axis('browser')
    .reset_index(name='count')
)
print(tabulate(control_browser_types, headers='keys', tablefmt='fancy_grid'))

╒════╤════════════════════════════╤═══════════╕
│    │ index                      │   browser │
╞════╪════════════════════════════╪═══════════╡
│  0 │ Chrome Mobile              │      2410 │
├────┼────────────────────────────┼───────────┤
│  1 │ Facebook                   │       561 │
├────┼────────────────────────────┼───────────┤
│  2 │ Samsung Internet           │       492 │
├────┼────────────────────────────┼───────────┤
│  3 │ Chrome Mobile WebView      │       292 │
├────┼────────────────────────────┼───────────┤
│  4 │ Mobile Safari              │       246 │
├────┼────────────────────────────┼───────────┤
│  5 │ Chrome Mobile iOS          │        34 │
├────┼────────────────────────────┼───────────┤
│  6 │ Mobile Safari UI/WKWebView │        30 │
├────┼────────────────────────────┼───────────┤
│  7 │ Pinterest                  │         2 │
├────┼────────────────────────────┼───────────┤
│  8 │ Puffin                     │         1 │
├────┼────────────────────────────┼─────

## Correlation Analysis

In [68]:
# Step 1 - Make a scatter plot with square markers, set column names as labels

def heatmap(x, y, size):
    """Draw a square-marker scatter plot that acts as a heatmap.

    Parameters
    ----------
    x, y : pandas.Series
        Labels of each point along the horizontal / vertical axis. The sorted
        distinct labels become the tick labels of the corresponding axis.
    size : pandas.Series
        Value controlling the marker area of each point (multiplied by 500).

    Returns
    -------
    None
        The figure is drawn on a newly created matplotlib axes.
    """
    figure, ax = plt.subplots()

    # Sorted distinct labels per axis; a label's position in the list is its tick coordinate
    x_labels = sorted(x.unique())
    y_labels = sorted(y.unique())
    x_to_num = {label: position for position, label in enumerate(x_labels)}
    y_to_num = {label: position for position, label in enumerate(y_labels)}

    size_scale = 500
    ax.scatter(
        x=x.map(x_to_num),       # label -> integer coordinate
        y=y.map(y_to_num),
        s=size * size_scale,     # marker area proportional to `size`
        marker='s',              # squares
    )

    # Put the original labels back on the integer tick positions
    ax.set_xticks(list(range(len(x_labels))))
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks(list(range(len(y_labels))))
    ax.set_yticklabels(y_labels)

In [69]:
# Numeric columns whose pairwise correlation is examined
columns = ['platform_os', 'hour']
# One-hot encoded copy of the full frame.
# NOTE(review): df1 is not referenced in the cells visible here - confirm it is used later.
df1 = pd.get_dummies(bio_data)
# Correlation matrix unpivoted to long form: one row per (x, y) pair with its coefficient
corr = bio_data[columns].corr().reset_index().melt(id_vars='index')
corr.columns = ['x', 'y', 'value']

In [70]:
# Long-format correlation table: one row per (x, y) pair
corr

Unnamed: 0,x,y,value
0,platform_os,platform_os,1.0
1,hour,platform_os,6.2e-05
2,platform_os,hour,6.2e-05
3,hour,hour,1.0


In [None]:
# Plot the correlation table; marker size encodes the absolute correlation of each (x, y) pair
heatmap(
    x=corr['x'],
    y=corr['y'],
    size=corr['value'].abs()
)

## Classical A/B Testing