In [124]:
# Import pandas and NumPy for data manipulation
import pandas as pd
import numpy as np
import sys
import warnings
# Image display
from IPython.display import Image
# To preprocess our data
from sklearn.preprocessing import LabelEncoder
# To fill missing values
from sklearn.impute import SimpleImputer
# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
# from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate the final results
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

# Suppress every warning for the rest of the session. Note that this also hides
# pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings("ignore")

In [125]:
# Load the raw A/B-test responses; the path is resolved relative to the current working directory.
data = pd.read_csv('../data/AdSmartABdata.csv')

In [126]:
# Work on a copy so the frame loaded into `data` stays untouched.
df=data.copy()

In [127]:
# Preview the first five rows to check that the file parsed as expected.
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [128]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

auction_id     object
experiment     object
date           object
hour            int64
device_make    object
platform_os     int64
browser        object
yes             int64
no              int64
dtype: object

In [129]:
# Isolate the object-dtype (string) columns, i.e. the ones that need encoding
# before they can be used as model inputs.
df_non_numerical = df.select_dtypes(include='object')
df_non_numerical

Unnamed: 0,auction_id,experiment,date,device_make,browser
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,Generic Smartphone,Chrome Mobile
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,Generic Smartphone,Chrome Mobile
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,E5823,Chrome Mobile WebView
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,Samsung SM-A705FN,Facebook
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,Generic Smartphone,Chrome Mobile
...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,Generic Smartphone,Chrome Mobile
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,Generic Smartphone,Chrome Mobile
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,Generic Smartphone,Chrome Mobile
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,Samsung SM-A515F,Samsung Internet


In [130]:
# List the column names as a plain Python list.
list(df.columns)

['auction_id',
 'experiment',
 'date',
 'hour',
 'device_make',
 'platform_os',
 'browser',
 'yes',
 'no']

- Split the data by browser and platform_os, and version each split as a new version of the data in DVC.

Split data by browser

In [131]:
# Build the browser split: keep the browser-related columns and only the rows
# where the respondent actually answered (drop rows with yes == 0 and no == 0).
# The selection and filter are done in one expression and finished with an
# explicit copy, so df_browser is an independent frame rather than a slice of
# df that was then mutated in place.
browser_columns = ['experiment','date','hour','device_make','browser','yes','no']
df_browser = (
    df.loc[:, browser_columns]
    .query('not (yes == 0 & no == 0)')
    .copy()
)


In [132]:
# Display the filtered browser split (rows keep their original labels from df).
df_browser

Unnamed: 0,experiment,date,hour,device_make,browser,yes,no
2,exposed,2020-07-05,2,E5823,Chrome Mobile WebView,0,1
16,exposed,2020-07-04,16,Generic Smartphone,Chrome Mobile,1,0
20,exposed,2020-07-06,8,Generic Smartphone,Chrome Mobile,0,1
23,control,2020-07-08,4,Samsung SM-A202F,Facebook,1,0
27,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,1
...,...,...,...,...,...,...,...
8059,exposed,2020-07-05,21,Generic Smartphone,Chrome Mobile,1,0
8063,exposed,2020-07-04,1,Generic Smartphone,Chrome Mobile,1,0
8064,control,2020-07-09,7,Generic Smartphone,Chrome Mobile,0,1
8069,control,2020-07-10,16,Generic Smartphone,Chrome Mobile,0,1


In [133]:
# Take the text columns of the browser split and add numeric counterparts:
#   date_1       - 'date' parsed into a datetime
#   experiment_1 - 'experiment' mapped to 1 (exposed) / 0 (control)
experiment_codes = {'exposed': 1, 'control': 0}
df_browser_non_num = df_browser.select_dtypes(include='object').assign(
    date_1=lambda frame: pd.to_datetime(frame['date']),
    experiment_1=lambda frame: frame['experiment'].map(experiment_codes),
)

# Show how many responses fall into each experiment group.
df_browser_non_num['experiment'].value_counts()

exposed    657
control    586
Name: experiment, dtype: int64

In [134]:
# The text versions are no longer needed now that date_1 / experiment_1 exist.
df_browser_non_num = df_browser_non_num.drop(columns=['experiment', 'date'])

In [None]:
# Collect the names of the columns that are still object-typed and therefore
# still need label encoding.
object_columns = df_browser_non_num.select_dtypes(include=['object']).columns
categorical_column = list(object_columns)
categorical_column

In [None]:
# Label-encode the two remaining text columns. A single encoder instance is
# reused; each fit_transform call refits it, so after this cell it has been
# fitted last on 'browser'.
label_encoder = LabelEncoder()
for encoded_column in ('device_make', 'browser'):
    df_browser_non_num[encoded_column] = label_encoder.fit_transform(
        df_browser_non_num[encoded_column]
    )

# Give the converted columns back their original names.
df_browser_non_num = df_browser_non_num.rename(
    columns={'date_1': 'date', 'experiment_1': 'experiment'}
)

# Display with a fresh 0..n-1 index. The result is not assigned, so the stored
# frame keeps its original row labels.
df_browser_non_num.reset_index(drop=True)

In [158]:
# Re-attach the numeric columns that the object-dtype selection left out.
# The assignment aligns on the row index, which both frames still share.
numeric_part = df_browser[['hour','yes','no']]
df_browser_non_num[['hour','yes','no']] = numeric_part

# Reorder (not rename) the columns into their final layout.
columns = ['date', 'hour', 'device_make', 'browser', 'experiment', 'yes', 'no']
df_browser_non_num = df_browser_non_num.loc[:, columns]

# Display with a fresh 0..n-1 index; the stored frame itself is not re-indexed.
df_browser_non_num.reset_index(drop=True)

Unnamed: 0,date,hour,device_make,browser,experiment,yes,no
0,2020-07-05,2,4,2,1,0,1
1,2020-07-04,16,13,1,1,1,0
2,2020-07-06,8,13,1,1,0,1
3,2020-07-08,4,43,4,0,1,0
4,2020-07-03,15,13,1,0,0,1
...,...,...,...,...,...,...,...
1238,2020-07-05,21,13,1,1,1,0
1239,2020-07-04,1,13,1,1,1,0
1240,2020-07-09,7,13,1,0,0,1
1241,2020-07-10,16,13,1,0,0,1


Save the dataset

In [159]:
# Persist the encoded browser split alongside the raw file ('../data/AdSmartABdata.csv').
# NOTE(review): to_csv writes the row index by default, and this frame still carries
# the original row labels (the reset_index results above were never assigned).
# Confirm that downstream readers expect that extra unnamed first column.
df_browser_non_num.to_csv('../data/AdSmartABdata_browser.csv')

Split data by platform_os

In [170]:
# Build the platform split: keep the platform-related columns and only the rows
# where the respondent actually answered (drop rows with yes == 0 and no == 0).
# The selection and filter are done in one expression and finished with an
# explicit copy, so df_platform is an independent frame rather than a slice of
# df that was then mutated in place.
platform_columns = ['experiment','date','hour','device_make','platform_os','yes','no']
df_platform = (
    df.loc[:, platform_columns]
    .query('not (yes == 0 & no == 0)')
    .copy()
)

In [171]:
# Display the filtered platform split (rows keep their original labels from df).
df_platform

Unnamed: 0,experiment,date,hour,device_make,platform_os,yes,no
2,exposed,2020-07-05,2,E5823,6,0,1
16,exposed,2020-07-04,16,Generic Smartphone,6,1,0
20,exposed,2020-07-06,8,Generic Smartphone,6,0,1
23,control,2020-07-08,4,Samsung SM-A202F,6,1,0
27,control,2020-07-03,15,Generic Smartphone,6,0,1
...,...,...,...,...,...,...,...
8059,exposed,2020-07-05,21,Generic Smartphone,6,1,0
8063,exposed,2020-07-04,1,Generic Smartphone,6,1,0
8064,control,2020-07-09,7,Generic Smartphone,6,0,1
8069,control,2020-07-10,16,Generic Smartphone,6,0,1


- label encoding

In [172]:
# Take the text columns of the platform split, add numeric counterparts
# (date_1: 'date' parsed into a datetime; experiment_1: exposed -> 1, control -> 0),
# then drop the original text versions. Row labels are left as they are.
experiment_codes = {'exposed': 1, 'control': 0}
df_platform_non_num = (
    df_platform.select_dtypes(include='object')
    .assign(
        date_1=lambda frame: pd.to_datetime(frame['date']),
        experiment_1=lambda frame: frame['experiment'].map(experiment_codes),
    )
    .drop(columns=['date', 'experiment'])
)

In [173]:
# Label-encode 'device_make' (refitting the label_encoder created in an earlier
# cell) and bring 'platform_os' across from the platform split. It is already
# an integer column, so it is copied as-is, aligned on the row index.
device_codes = label_encoder.fit_transform(df_platform_non_num['device_make'])
df_platform_non_num = df_platform_non_num.assign(
    device_make=device_codes,
    platform_os=df_platform['platform_os'],
)

In [174]:
# Inspect the encoded platform frame before the numeric columns are re-attached.
df_platform_non_num

Unnamed: 0,device_make,date_1,experiment_1,platform_os
2,4,2020-07-05,1,6
16,13,2020-07-04,1,6
20,13,2020-07-06,1,6
23,43,2020-07-08,0,6
27,13,2020-07-03,0,6
...,...,...,...,...
8059,13,2020-07-05,1,6
8063,13,2020-07-04,1,6
8064,13,2020-07-09,0,6
8069,13,2020-07-10,0,6


In [175]:
# Re-attach the numeric columns that the object-dtype selection left out.
# They are taken from df_platform, this split's own source frame, instead of
# df_browser, so the platform split no longer depends on the browser cells.
# The assignment aligns on the row index, which df_platform_non_num still
# shares with df_platform.
df_platform_non_num[['hour','yes','no']] = df_platform[['hour','yes','no']]

# Reorder the columns into their final layout, then restore the public names
# of the converted columns.
columns = ['date_1', 'hour', 'device_make', 'platform_os', 'experiment_1', 'yes', 'no']
df_platform_non_num = df_platform_non_num[columns].rename(
    columns={'date_1': 'date', 'experiment_1': 'experiment'}
)

# Display with a fresh 0..n-1 index. The result is not assigned, so the stored
# frame keeps its original row labels.
df_platform_non_num.reset_index(drop=True)

Unnamed: 0,date,hour,device_make,platform_os,experiment,yes,no
0,2020-07-05,2,4,6,1,0,1
1,2020-07-04,16,13,6,1,1,0
2,2020-07-06,8,13,6,1,0,1
3,2020-07-08,4,43,6,0,1,0
4,2020-07-03,15,13,6,0,0,1
...,...,...,...,...,...,...,...
1238,2020-07-05,21,13,6,1,1,0
1239,2020-07-04,1,13,6,1,1,0
1240,2020-07-09,7,13,6,0,0,1
1241,2020-07-10,16,13,6,0,0,1


In [176]:
# Display the stored frame; unlike the preview above it still shows the original row labels.
df_platform_non_num

Unnamed: 0,date,hour,device_make,platform_os,experiment,yes,no
2,2020-07-05,2,4,6,1,0,1
16,2020-07-04,16,13,6,1,1,0
20,2020-07-06,8,13,6,1,0,1
23,2020-07-08,4,43,6,0,1,0
27,2020-07-03,15,13,6,0,0,1
...,...,...,...,...,...,...,...
8059,2020-07-05,21,13,6,1,1,0
8063,2020-07-04,1,13,6,1,1,0
8064,2020-07-09,7,13,6,0,0,1
8069,2020-07-10,16,13,6,0,0,1


In [177]:
# Persist the encoded platform split alongside the raw file ('../data/AdSmartABdata.csv').
# NOTE(review): to_csv writes the row index by default, and this frame still carries
# the original row labels. Confirm that downstream readers expect that extra
# unnamed first column.
df_platform_non_num.to_csv('../data/AdSmartABdata_platform.csv')

<b>`For each version of the data do the following:`</b> 
- Split the data into 70% training, 20% validation, and 10% test sets. 
- Based on the reading material provided, apply machine learning to the training data. 
- Train a machine learning model with 5-fold cross-validation using each of the following algorithms:
- Logistic Regression 
- Decision Trees
- XGBoost
- RandomForest


<b>` Split the data into 70% training, 20% validation, and 10% test sets.`</b>   