In [1]:
# Importing Pandas an Numpy Libraries to use on manipulating our Data
import pandas as pd
import numpy as np

# Image Disp
from IPython.display import Image

# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score


# We are importing our Data with Pandas Library
# We use "Coronary_artery.csv" 

In [5]:
df = pd.read_csv(r"C:\Users\Genet Shanko\SmartAd_A-B_Testing_user_analysis\data\AdSmartABdata.csv")

In [6]:
df.head()


Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [7]:
print("Rows:", len(df))

Rows: 8077


In [8]:
# Prints Summary of Numerical Data
df.describe()

Unnamed: 0,hour,platform_os,yes,no
count,8077.0,8077.0,8077.0,8077.0
mean,11.61508,5.947134,0.070818,0.083075
std,5.734879,0.224333,0.256537,0.276013
min,0.0,5.0,0.0,0.0
25%,7.0,6.0,0.0,0.0
50%,13.0,6.0,0.0,0.0
75%,15.0,6.0,0.0,0.0
max,23.0,7.0,1.0,1.0


In [9]:
# Prints Summary of Categorical Data
df.describe(include=[np.object])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df.describe(include=[np.object])


Unnamed: 0,auction_id,experiment,date,device_make,browser
count,8077,8077,8077,8077,8077
unique,8077,2,8,270,15
top,0008ef63-77a7-448b-bd1e-075f42c55e39,control,2020-07-03,Generic Smartphone,Chrome Mobile
freq,1,4071,2015,4743,4554


In [10]:
numerical_column = df.select_dtypes(exclude="object").columns.tolist()
categorical_column = df.select_dtypes(include="object").columns.tolist()
print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

Numerical Columns: ['hour', 'platform_os', 'yes', 'no']
****************
Categorical Columns: ['auction_id', 'experiment', 'date', 'device_make', 'browser']


In [11]:
df.isnull().sum()

auction_id     0
experiment     0
date           0
hour           0
device_make    0
platform_os    0
browser        0
yes            0
no             0
dtype: int64

In [13]:
# Get column names have less than 10 more than 2 unique values
to_one_hot_encoding = [col for col in categorical_column if df[col].nunique() <= 10 and df[col].nunique() > 2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical_column if not col in to_one_hot_encoding]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)

To One Hot Encoding: ['date']
To Label Encoding: ['auction_id', 'experiment', 'device_make', 'browser']


In [14]:
one_hot_encoded_columns = pd.get_dummies(df[to_one_hot_encoding])
one_hot_encoded_columns

Unnamed: 0,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10
0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
8072,0,0,1,0,0,0,0,0
8073,1,0,0,0,0,0,0,0
8074,0,1,0,0,0,0,0,0
8075,0,0,1,0,0,0,0,0


In [15]:
# Label Encoding

label_encoded_columns = []
# For loop for each columns
for col in to_label_encoding:
    # We define new label encoder to each new column
    le = LabelEncoder()
    # Encode our data and create new Dataframe of it, 
    # notice that we gave column name in "columns" arguments
    column_dataframe = pd.DataFrame(le.fit_transform(df[col]), columns=[col] )
    # and add new DataFrame to "label_encoded_columns" list
    label_encoded_columns.append(column_dataframe)

# Merge all data frames
label_encoded_columns = pd.concat(label_encoded_columns, axis=1)
label_encoded_columns

Unnamed: 0,auction_id,experiment,device_make,browser
0,0,1,46,2
1,1,1,46,2
2,2,1,29,3
3,3,0,137,6
4,4,0,46,2
...,...,...,...,...
8072,8072,1,46,2
8073,8073,0,46,2
8074,8074,0,46,2
8075,8075,1,130,14


In [16]:
X = df.copy()

# Droping Categorical Columns,
# "inplace" means replace our data with new one
# Don't forget to "axis=1"
X.drop(categorical_column, axis=1, inplace=True)

# Merge DataFrames
X = pd.concat([X, one_hot_encoded_columns, label_encoded_columns], axis=1)
print("All columns:", X.columns.tolist())
X

All columns: ['hour', 'platform_os', 'yes', 'no', 'date_2020-07-03', 'date_2020-07-04', 'date_2020-07-05', 'date_2020-07-06', 'date_2020-07-07', 'date_2020-07-08', 'date_2020-07-09', 'date_2020-07-10', 'auction_id', 'experiment', 'device_make', 'browser']


Unnamed: 0,hour,platform_os,yes,no,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10,auction_id,experiment,device_make,browser
0,8,6,0,0,0,0,0,0,0,0,0,1,0,1,46,2
1,10,6,0,0,0,0,0,0,1,0,0,0,1,1,46,2
2,2,6,0,1,0,0,1,0,0,0,0,0,2,1,29,3
3,15,6,0,0,1,0,0,0,0,0,0,0,3,0,137,6
4,15,6,0,0,1,0,0,0,0,0,0,0,4,0,46,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8072,7,6,0,0,0,0,1,0,0,0,0,0,8072,1,46,2
8073,15,6,0,0,1,0,0,0,0,0,0,0,8073,0,46,2
8074,9,6,0,0,0,1,0,0,0,0,0,0,8074,0,46,2
8075,15,6,0,0,0,0,1,0,0,0,0,0,8075,1,130,14


In [33]:
# Define Y (This is the value we will predict)
y = df["browser"]

# Droping "class" from X
# Define Decision Tree Model
# Droping "class" from X
X.drop(["browser"], axis=1, inplace=True)
X


Unnamed: 0,hour,platform_os,yes,no,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10,auction_id,device_make
0,8,6,0,0,0,0,0,0,0,0,0,1,0,46
1,10,6,0,0,0,0,0,0,1,0,0,0,1,46
2,2,6,0,1,0,0,1,0,0,0,0,0,2,29
3,15,6,0,0,1,0,0,0,0,0,0,0,3,137
4,15,6,0,0,1,0,0,0,0,0,0,0,4,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8072,7,6,0,0,0,0,1,0,0,0,0,0,8072,46
8073,15,6,0,0,1,0,0,0,0,0,0,0,8073,46
8074,9,6,0,0,0,1,0,0,0,0,0,0,8074,46
8075,15,6,0,0,0,0,1,0,0,0,0,0,8075,130


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [35]:
# Define Random Forest Model
rf = RandomForestClassifier(n_estimators=100)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

Predicted: ['Chrome Mobile' 'Chrome Mobile WebView' 'Chrome Mobile WebView'
 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile'
 'Chrome Mobile WebView' 'Samsung Internet' 'Chrome Mobile']
Actual: 6261            Chrome Mobile
5908                 Facebook
2819    Chrome Mobile WebView
3530            Chrome Mobile
3591            Chrome Mobile
3866            Chrome Mobile
5100            Chrome Mobile
7496    Chrome Mobile WebView
4311    Chrome Mobile WebView
1648            Chrome Mobile
Name: browser, dtype: object


In [36]:
# Define Decision Tree Model
dt = DecisionTreeClassifier()
# We fit our model with our train data
dt.fit(X_train, y_train)
# Then predict results from X_test data
pred_dt = dt.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])

Predicted: ['Chrome Mobile' 'Chrome Mobile WebView' 'Facebook' 'Chrome Mobile'
 'Chrome Mobile' 'Facebook' 'Chrome Mobile' 'Chrome Mobile WebView'
 'Samsung Internet' 'Chrome Mobile']
Actual: 6261            Chrome Mobile
5908                 Facebook
2819    Chrome Mobile WebView
3530            Chrome Mobile
3591            Chrome Mobile
3866            Chrome Mobile
5100            Chrome Mobile
7496    Chrome Mobile WebView
4311    Chrome Mobile WebView
1648            Chrome Mobile
Name: browser, dtype: object


In [37]:
# Define Logistic Regression Model
log = LogisticRegression()
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted: ['Chrome Mobile' 'Chrome Mobile WebView' 'Samsung Internet'
 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile'
 'Chrome Mobile' 'Chrome Mobile WebView' 'Chrome Mobile']
Actual: 6261            Chrome Mobile
5908                 Facebook
2819    Chrome Mobile WebView
3530            Chrome Mobile
3591            Chrome Mobile
3866            Chrome Mobile
5100            Chrome Mobile
7496    Chrome Mobile WebView
4311    Chrome Mobile WebView
1648            Chrome Mobile
Name: browser, dtype: object


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# Define Bernouilli Naive Bias Model
bnb = BernoulliNB()
# We fit our model with our train data
bnb.fit(X_train, y_train)
# Then predict results from X_test data
pred_bnb = bnb.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_bnb[0:10])
print("Actual:", y_test[0:10])

Predicted: ['Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile'
 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile' 'Chrome Mobile'
 'Chrome Mobile' 'Chrome Mobile']
Actual: 6261            Chrome Mobile
5908                 Facebook
2819    Chrome Mobile WebView
3530            Chrome Mobile
3591            Chrome Mobile
3866            Chrome Mobile
5100            Chrome Mobile
7496    Chrome Mobile WebView
4311    Chrome Mobile WebView
1648            Chrome Mobile
Name: browser, dtype: object


In [39]:
# Define Gaussian Naive Bias Model
gnb = GaussianNB()
# We fit our model with our train data
gnb.fit(X_train, y_train)
# Then predict results from X_test data
pred_gnb = gnb.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_gnb[0:10])
print("Actual:", y_test[0:10])

Predicted: ['Chrome Mobile' 'Facebook' 'Chrome Mobile WebView' 'Chrome Mobile'
 'Chrome Mobile' 'Pinterest' 'Chrome Mobile' 'Chrome Mobile' 'Pinterest'
 'Chrome Mobile']
Actual: 6261            Chrome Mobile
5908                 Facebook
2819    Chrome Mobile WebView
3530            Chrome Mobile
3591            Chrome Mobile
3866            Chrome Mobile
5100            Chrome Mobile
7496    Chrome Mobile WebView
4311    Chrome Mobile WebView
1648            Chrome Mobile
Name: browser, dtype: object
