In [3]:
# Importing Pandas an Numpy Libraries to use on manipulating our Data
import pandas as pd
import numpy as np
import sys
import warnings
# Image Disp
from IPython.display import Image
# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder
# To fill missing values
from sklearn.impute import SimpleImputer
# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore")

In [39]:
data = pd.read_csv('../data/AdSmartABdata.csv')

In [40]:
df=data.copy()

In [41]:
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [42]:
df.dtypes

auction_id     object
experiment     object
date           object
hour            int64
device_make    object
platform_os     int64
browser        object
yes             int64
no              int64
dtype: object

In [43]:
df_non_numerical=df.select_dtypes(include=['object'])
df_non_numerical

Unnamed: 0,auction_id,experiment,date,device_make,browser
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,Generic Smartphone,Chrome Mobile
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,Generic Smartphone,Chrome Mobile
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,E5823,Chrome Mobile WebView
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,Samsung SM-A705FN,Facebook
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,Generic Smartphone,Chrome Mobile
...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,Generic Smartphone,Chrome Mobile
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,Generic Smartphone,Chrome Mobile
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,Generic Smartphone,Chrome Mobile
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,Samsung SM-A515F,Samsung Internet


In [44]:
df.columns.tolist()

['auction_id',
 'experiment',
 'date',
 'hour',
 'device_make',
 'platform_os',
 'browser',
 'yes',
 'no']

- Split data by browser and platform_os, and version each split as a new version of the data in dvc.

Split data by browser

In [45]:
df.groupby('browser').count()

Unnamed: 0_level_0,auction_id,experiment,date,hour,device_make,platform_os,yes,no
browser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Android,1,1,1,1,1,1,1,1
Chrome,3,3,3,3,3,3,3,3
Chrome Mobile,4554,4554,4554,4554,4554,4554,4554,4554
Chrome Mobile WebView,1489,1489,1489,1489,1489,1489,1489,1489
Chrome Mobile iOS,51,51,51,51,51,51,51,51
Edge Mobile,1,1,1,1,1,1,1,1
Facebook,764,764,764,764,764,764,764,764
Firefox Mobile,1,1,1,1,1,1,1,1
Mobile Safari,337,337,337,337,337,337,337,337
Mobile Safari UI/WKWebView,44,44,44,44,44,44,44,44


In [46]:
#Group by browser and store them on in a dictionary
by_browser = df.groupby('browser')
browsers = by_browser['experiment'].count() > 100
browsers = browsers[browsers==True].index
browser_dfs = {}
for browser in browsers:
    b_df = df.query(f"browser=='{browser}'").reset_index(drop=True)
    browser_dfs[browser] = b_df

In [47]:
browser_dfs.keys()

dict_keys(['Chrome Mobile', 'Chrome Mobile WebView', 'Facebook', 'Mobile Safari', 'Samsung Internet'])

In [48]:
#create version 2 of the data for chrome mobile
b_df=browser_dfs.get('Chrome Mobile')
b_df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
3,002e308b-1a07-49d6-8560-0fbcdcd71e4b,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
4,004c4cc9-f2ca-4df7-adc9-3d0c3c4f0342,control,2020-07-05,14,Generic Smartphone,6,Chrome Mobile,0,0


In [49]:
# creating response column by removing yes and no
df1 = response[response['yes'] == 1].drop(['yes','no'], axis = 1)
df1['response'] = 1
df2 = response[response['no'] == 1].drop(['yes','no'], axis = 1)
df2['response'] = 0
responsee = df1.append(df2).reset_index(drop=True)
responsee.head(5)

NameError: name 'response' is not defined

In [17]:
# use the data where respondents are not neutral
df_browser= df[['experiment','date','hour','device_make','browser','yes','no']]
df_browser.query('not (yes == 0 & no == 0)',inplace=True)


In [18]:
df_browser

Unnamed: 0,experiment,date,hour,device_make,browser,yes,no
2,exposed,2020-07-05,2,E5823,Chrome Mobile WebView,0,1
16,exposed,2020-07-04,16,Generic Smartphone,Chrome Mobile,1,0
20,exposed,2020-07-06,8,Generic Smartphone,Chrome Mobile,0,1
23,control,2020-07-08,4,Samsung SM-A202F,Facebook,1,0
27,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,1
...,...,...,...,...,...,...,...
8059,exposed,2020-07-05,21,Generic Smartphone,Chrome Mobile,1,0
8063,exposed,2020-07-04,1,Generic Smartphone,Chrome Mobile,1,0
8064,control,2020-07-09,7,Generic Smartphone,Chrome Mobile,0,1
8069,control,2020-07-10,16,Generic Smartphone,Chrome Mobile,0,1


In [19]:
# convert the data and experiment to numerical 
df_browser_non_num=df_browser.select_dtypes(include=['object'])
df_browser_non_num['date_1']= pd.to_datetime(df_browser_non_num['date'])
df_browser_non_num['experiment_1'] = df_browser_non_num['experiment'].map({'exposed': 1, 
                                    'control': 0})
df_browser_non_num['experiment'].value_counts()

exposed    657
control    586
Name: experiment, dtype: int64

In [20]:
df_browser_non_num.drop(['experiment',	'date'], axis=1, inplace=True)

In [21]:
categorical_column= df_browser_non_num.select_dtypes(include='object').columns.tolist()
categorical_column

['device_make', 'browser']

In [22]:
# label_encoder  device_maker and browser
label_encoder=LabelEncoder()

# Encode labels in column 'device_make'.
df_browser_non_num['device_make']= label_encoder.fit_transform(df_browser_non_num['device_make'])
df_browser_non_num['browser']= label_encoder.fit_transform(df_browser_non_num['browser'])
# rename columns
df_browser_non_num=df_browser_non_num.rename({'date_1': 'date',
                         'experiment_1': 'experiment'}, axis='columns')
df_browser_non_num.reset_index(drop=True)

Unnamed: 0,device_make,browser,date,experiment
0,4,2,2020-07-05,1
1,13,1,2020-07-04,1
2,13,1,2020-07-06,1
3,43,4,2020-07-08,0
4,13,1,2020-07-03,0
...,...,...,...,...
1238,13,1,2020-07-05,1
1239,13,1,2020-07-04,1
1240,13,1,2020-07-09,0
1241,13,1,2020-07-10,0


In [23]:
df_browser_non_num[['hour','yes','no']]= df_browser[['hour','yes','no']]
df_browser_non_num
# rename the columns
columns=[ 'date', 'hour','device_make', 'browser','experiment','yes',	'no']
df_browser_non_num= df_browser_non_num[columns]
df_browser_non_num
df_browser_non_num.reset_index(drop=True)

Unnamed: 0,date,hour,device_make,browser,experiment,yes,no
0,2020-07-05,2,4,2,1,0,1
1,2020-07-04,16,13,1,1,1,0
2,2020-07-06,8,13,1,1,0,1
3,2020-07-08,4,43,4,0,1,0
4,2020-07-03,15,13,1,0,0,1
...,...,...,...,...,...,...,...
1238,2020-07-05,21,13,1,1,1,0
1239,2020-07-04,1,13,1,1,1,0
1240,2020-07-09,7,13,1,0,0,1
1241,2020-07-10,16,13,1,0,0,1


In [24]:
df_browser_non_num.isnull().sum()

date           0
hour           0
device_make    0
browser        0
experiment     0
yes            0
no             0
dtype: int64

Save the dataset

In [25]:
# '../data/AdSmartABdata.csv'
df_browser_non_num.to_csv('../data/AdSmartABdata_browser.csv')

Split data by platform_os

In [26]:
df_platform= df[['experiment','date','hour','device_make','platform_os','yes','no']]
df_platform.query('not (yes == 0 & no == 0)',inplace=True)

In [27]:
df_platform

Unnamed: 0,experiment,date,hour,device_make,platform_os,yes,no
2,exposed,2020-07-05,2,E5823,6,0,1
16,exposed,2020-07-04,16,Generic Smartphone,6,1,0
20,exposed,2020-07-06,8,Generic Smartphone,6,0,1
23,control,2020-07-08,4,Samsung SM-A202F,6,1,0
27,control,2020-07-03,15,Generic Smartphone,6,0,1
...,...,...,...,...,...,...,...
8059,exposed,2020-07-05,21,Generic Smartphone,6,1,0
8063,exposed,2020-07-04,1,Generic Smartphone,6,1,0
8064,control,2020-07-09,7,Generic Smartphone,6,0,1
8069,control,2020-07-10,16,Generic Smartphone,6,0,1


- label encoding

In [28]:
# convert the data and experiment to numerical 
df_platform_non_num=df_platform.select_dtypes(include=['object'])
df_platform_non_num['date_1']= pd.to_datetime(df_platform_non_num['date'])
df_platform_non_num['experiment_1'] = df_platform_non_num['experiment'].map({'exposed': 1, 
                                    'control': 0})
# df_platform_non_num['experiment'].value_counts()

# df_platform_non_num=df_platform_non_num.reset_index(drop=True)
df_platform_non_num.drop(['date','experiment' ],axis=1,inplace=True)

In [29]:
# Encode labels in column 'device_make'.
df_platform_non_num['device_make']= label_encoder.fit_transform(df_platform_non_num['device_make'])
df_platform_non_num['platform_os']=df_platform['platform_os']

In [30]:
df_platform_non_num

Unnamed: 0,device_make,date_1,experiment_1,platform_os
2,4,2020-07-05,1,6
16,13,2020-07-04,1,6
20,13,2020-07-06,1,6
23,43,2020-07-08,0,6
27,13,2020-07-03,0,6
...,...,...,...,...
8059,13,2020-07-05,1,6
8063,13,2020-07-04,1,6
8064,13,2020-07-09,0,6
8069,13,2020-07-10,0,6


In [31]:
df_platform_non_num[['hour','yes','no']]= df_browser[['hour','yes','no']]
df_platform_non_num
# rename the columns
columns=[ 'date_1', 'hour','device_make', 'platform_os','experiment_1','yes',	'no']
df_platform_non_num= df_platform_non_num[columns]
df_platform_non_num=df_platform_non_num.rename({'date_1': 'date',
                         'experiment_1': 'experiment'}, axis='columns')
df_platform_non_num.reset_index(drop=True)

Unnamed: 0,date,hour,device_make,platform_os,experiment,yes,no
0,2020-07-05,2,4,6,1,0,1
1,2020-07-04,16,13,6,1,1,0
2,2020-07-06,8,13,6,1,0,1
3,2020-07-08,4,43,6,0,1,0
4,2020-07-03,15,13,6,0,0,1
...,...,...,...,...,...,...,...
1238,2020-07-05,21,13,6,1,1,0
1239,2020-07-04,1,13,6,1,1,0
1240,2020-07-09,7,13,6,0,0,1
1241,2020-07-10,16,13,6,0,0,1


In [32]:
df_platform_non_num

Unnamed: 0,date,hour,device_make,platform_os,experiment,yes,no
2,2020-07-05,2,4,6,1,0,1
16,2020-07-04,16,13,6,1,1,0
20,2020-07-06,8,13,6,1,0,1
23,2020-07-08,4,43,6,0,1,0
27,2020-07-03,15,13,6,0,0,1
...,...,...,...,...,...,...,...
8059,2020-07-05,21,13,6,1,1,0
8063,2020-07-04,1,13,6,1,1,0
8064,2020-07-09,7,13,6,0,0,1
8069,2020-07-10,16,13,6,0,0,1


In [33]:
# '../data/AdSmartABdata.csv'
df_platform_non_num.to_csv('../data/AdSmartABdata_platform.csv')

<b>`For each version of the data do the following:`</b> 
- Split the data into 70% training, 20% validation, and 10% test sets. 
- Based on the reading material provided, apply machine learning to the training data. 
- Train a machine learning model using 5-fold cross validation using the following 3 different algorithms:
- Logistic Regression 
- Decision Trees
- XGBoost
- RandomForest


<b>` Split the data into 70% training, 20% validation, and 10% test sets.`</b>   

In [34]:
df_platform_non_num.columns

X= df_platform_non_num[['hour', 'device_make', 'platform_os', 'experiment']]
X

Unnamed: 0,hour,device_make,platform_os,experiment
2,2,4,6,1
16,16,13,6,1
20,8,13,6,1
23,4,43,6,0
27,15,13,6,0
...,...,...,...,...
8059,21,13,6,1
8063,1,13,6,1
8064,7,13,6,0
8069,16,13,6,0


In [35]:
# Define Y (This is the value we will predict)
y = df["yes"]
y

0       0
1       0
2       0
3       0
4       0
       ..
8072    0
8073    0
8074    0
8075    0
8076    0
Name: yes, Length: 8077, dtype: int64

<b>`Now is the time to split the data to train and test sets`<b>
- Split the data into 70% training, 20% validation, and 10% test sets.

In [36]:
# TO DOO
# Include the validation part

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

ValueError: Found input variables with inconsistent numbers of samples: [1243, 8077]

## Training the models

* Random Forest -> Supervised Learning – Classification/Regression
> Random forests or ‘random decision forests’ is an ensemble learning method, combining multiple algorithms to generate better results for classification, regression and other tasks. Each individual classifier is weak, but when combined with others, can produce excellent results. It can handle both continous and categorical variables.
* Desicion Tree -> Supervised Learning – Classification/Regression
> A decision tree is a flow-chart-like tree structure that uses a branching method to illustrate every possible outcome of a decision. Each node within the tree represents a test on a specific variable – and each branch is the outcome of that test.

* Logistic Regression Classifier -> Supervised learning – Classification
> Logistic regression focuses on estimating the probability of an event occurring based on the previous data provided. It is used to cover a binary dependent variable, that is where only two values, 0 and 1, represent outcomes. - Uses Sigmoid in the output layer in binary classification.

* Bernouilli Naive Bias
* Gaussian Naive Bias 
> The Naïve Bayes classifier is based on Bayes’ theorem and classifies every value as independent of any other value. It allows us to predict a class/category, based on a given set of features, using probability.


* KNN (K-Nearest Neighbors) -> Supervised Learning
> The K-Nearest-Neighbour algorithm calculates the likelihood that a data point belongs to a particular group or not. To decide which group a certain data point actually belongs to, it essentially examines the data points around that point.
* XGBoost (have acurate predictions)
>  It is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework.

Random Forest

In [37]:
# Define Random Forest Model
rf = RandomForestClassifier(n_estimators=100)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

NameError: name 'X_train' is not defined

Decision Tree

In [None]:
# Define Decision Tree Model
dt = DecisionTreeClassifier()
# We fit our model with our train data
dt.fit(X_train, y_train)
# Then predict results from X_test data
pred_dt = dt.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])

Logistic Regression

In [None]:
# Define Logistic Regression Model
log = LogisticRegression()
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Logistic Regression

In [None]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
# We fit our model with our train data
xgb.fit(
    X_train, y_train,
    # That means if model don't improve it self in 5 rounds, it will stop learning
    # So you can save your time and don't overtrain your model.
    early_stopping_rounds=5,
    # We provide Test data's to evaluate model performance
    eval_set=[(X_test, y_test)],
    verbose=False
 )
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])

XGBoost

In [None]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
# We fit our model with our train data
xgb.fit(
    X_train, y_train,
    # That means if model don't improve it self in 5 rounds, it will stop learning
    # So you can save your time and don't overtrain your model.
    early_stopping_rounds=5,
    # We provide Test data's to evaluate model performance
    eval_set=[(X_test, y_test)],
    verbose=False
 )
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])