In [1]:
import dvc.api
import numpy as np
import pandas as pd

In [2]:
# Image Disp
from IPython.display import Image

# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

In [24]:
df = pd.read_csv("../data1/browser.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,experiment,hour,date,device_make,browser,awareness
0,0,0,1,2,2,4,2,0
1,1,1,1,16,1,13,1,1
2,2,2,1,8,3,13,1,0
3,3,3,0,4,5,43,4,1
4,4,4,0,15,0,13,1,0
...,...,...,...,...,...,...,...,...
1238,1238,1238,1,21,2,13,1,1
1239,1239,1239,1,1,1,13,1,1
1240,1240,1240,0,7,6,13,1,0
1241,1241,1241,0,16,7,13,1,0


In [26]:
# Define Y (This is the value we will predict)
y = df["awareness"]

# Droping "class" from X
X = df.drop(columns = ['awareness']).copy()
X

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,experiment,hour,date,device_make,browser
0,0,0,1,2,2,4,2
1,1,1,1,16,1,13,1
2,2,2,1,8,3,13,1
3,3,3,0,4,5,43,4
4,4,4,0,15,0,13,1
...,...,...,...,...,...,...,...
1238,1238,1238,1,21,2,13,1
1239,1239,1239,1,1,1,13,1
1240,1240,1240,0,7,6,13,1
1241,1241,1241,0,16,7,13,1


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

Logistic Regression

In [30]:
# Define Logistic Regression Model
log = LogisticRegression()
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 0 0 0 0 1 0 0 1 1]
Actual: 808     0
980     0
383     1
1115    0
624     0
34      1
741     0
667     0
373     0
458     0
Name: awareness, dtype: int64


XGBoost

In [31]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
# We fit our model with our train data
xgb.fit(
    X_train, y_train,
    # That means if model don't improve it self in 5 rounds, it will stop learning
    # So you can save your time and don't overtrain your model.
    early_stopping_rounds=5,
    # We provide Test data's to evaluate model performance
    eval_set=[(X_test, y_test)],
    verbose=False
 )
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])



Predicted: [0 0 0 0 0 1 1 0 0 0]
Actual: 808     0
980     0
383     1
1115    0
624     0
34      1
741     0
667     0
373     0
458     0
Name: awareness, dtype: int64


In [32]:
# Define Random Forest Model
rf = RandomForestClassifier(n_estimators=100)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 0 0 0 0 1 1 0 1 0]
Actual: 808     0
980     0
383     1
1115    0
624     0
34      1
741     0
667     0
373     0
458     0
Name: awareness, dtype: int64


compare results from the three modeling algorithms

In [34]:
# Logistic Regression
cm_log = confusion_matrix(y_test, pred_log)
# XGBoost 
cm_xgb = confusion_matrix(y_test, pred_xgb)
# Random Forest 
cm_rf = confusion_matrix(y_test, pred_rf)


print("Confusion Matrixes")
print("Logistic Regression:\n", cm_log)
print("XGBoost:\n", cm_xgb)
print("Random Forest:\n", cm_rf)

Confusion Matrixes
Logistic Regression:
 [[237 245]
 [217 172]]
XGBoost:
 [[303 179]
 [236 153]]
Random Forest:
 [[259 223]
 [189 200]]


Accuracy score

In [35]:
# Logistic Regression
acc_log = accuracy_score(y_test, pred_log)
# XGBoost
acc_xgb = accuracy_score(y_test, pred_xgb)
# Random Forest 
acc_rf = accuracy_score(y_test, pred_rf)

print("Accuracy Scores")
print("Logistic Regression:", acc_log)
print("XGBoost:", acc_xgb)
print("Random Forest:", acc_rf)

Accuracy Scores
Logistic Regression: 0.4695752009184845
XGBoost: 0.5235361653272101
Random Forest: 0.5269804822043628


In [39]:
import pickle

In [40]:
pickle.dump(acc_log, open("../data1/log.pkl", 'wb'))

In [41]:
pickle.dump(acc_xgb, open("../data1/xgb.pkl", 'wb'))

In [42]:
pickle.dump(acc_rf, open("../data1/random_forest.pkl", 'wb'))