In [1]:
# Importing Pandas an Numpy Libraries to use on manipulating our Data
import pandas as pd
import numpy as np

# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have 
from sklearn.metrics import accuracy_score, confusion_matrix,mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

In [2]:
sys.path.append('../Scripts')
from eda_Read_data import ReadData
loader = ReadData()
data= loader.read_csv('../Data/AdSmartABdata.csv')
data.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


### Data Preprocessing for modelling

In [9]:
from eda_cleaning_data import Cleaning_excel_data
cleaner = Cleaning_excel_data()
clean_data= cleaner.drop_rows(data, 'yes', 'no')
clean_data.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1


In [10]:
## generating a day of the week column from the date
clean_data['date'] = pd.to_datetime(df['date'], errors='coerce')
clean_data['day_of_week'] = clean_data['date'].dt.day_name()


#drop the no column
clean_data.drop(['no'], axis=1,inplace=True)
clean_data.rename(columns={'yes': 'result'},inplace=True)

#Combining the control and exposed group 
exposed_group = clean_data[clean_data['experiment'] == 'exposed']
control_group = clean_data[clean_data['experiment'] == 'control']

#dropping nulls and indexing the data
clean_data['row_id'] = clean_data.index
clean_data.dropna(inplace=True)
clean_data = shuffle(clean_data)

#encoding the data in categorical data into 0s and 1s
lb = LabelEncoder()
clean_data['experiment'] = lb.fit_transform(clean_data['experiment'])
clean_data['browser'] = lb.fit_transform(clean_data['browser'])
clean_data['device_make'] = lb.fit_transform(clean_data['device_make'])
clean_data['day_of_week'] = lb.fit_transform(clean_data['day_of_week'])

#eliminating nwanted colmns
clean_data = clean_data[['row_id', 'device_make', 'platform_os', 'browser' ,'day_of_week', 'experiment','result']]


clean_data.head()

Unnamed: 0,row_id,device_make,platform_os,browser,day_of_week,experiment,result
5221,5221,41,6,7,0,0,1
6053,6053,43,6,2,4,0,1
5531,5531,47,6,7,0,0,1
6171,6171,13,6,1,0,0,0
4719,4719,47,6,2,6,1,0


In [12]:
browser_groups= clean_data.groupby('browser')
#exposed_dropped_group= experiment_dropped_groups.get_group('exposed')
browser_groups.head()

Unnamed: 0,row_id,device_make,platform_os,browser,day_of_week,experiment,result
5221,5221,41,6,7,0,0,1
6053,6053,43,6,2,4,0,1
5531,5531,47,6,7,0,0,1
6171,6171,13,6,1,0,0,0
4719,4719,47,6,2,6,1,0
5624,5624,13,6,1,2,1,1
3854,3854,13,6,1,0,1,0
2347,2347,13,6,1,0,0,0
5014,5014,69,6,2,2,1,1
6297,6297,64,6,7,5,0,1


In [17]:
clean_data.shape

(1243, 7)

In [14]:
# set aside 20% of train and test data for evaluation
X_train, X_test, y_train, y_test = train_test_split(clean_data.loc[:, clean_data.columns != 'result'], clean_data['result'],
    test_size=0.1)

# Use the same function above for the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
    test_size=0.2) # 0.25 x 0.8 = 0.2

In [15]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(894, 6)
(125, 6)
(224, 6)
(894,)
(125,)
(224,)


## Logistic Regression

In [18]:
# Define Logistic Regression Model
log = LogisticRegression()
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 0 0 0 0 0 1 0 0 0]
Actual: 6250    1
3372    0
3722    1
5579    0
3473    1
5831    0
6836    1
4901    0
2791    0
7375    1
Name: result, dtype: int64


### Decision Tree

In [19]:
# Define Decision Tree Model
dt = DecisionTreeClassifier()
# We fit our model with our train data
dt.fit(X_train, y_train)
# Then predict results from X_test data
pred_dt = dt.predict(X_test)

# See First 10 Predictions and Their Actual Values
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])

Predicted: [1 1 1 1 0 0 1 0 0 0]
Actual: 6250    1
3372    0
3722    1
5579    0
3473    1
5831    0
6836    1
4901    0
2791    0
7375    1
Name: result, dtype: int64


### XGBoost

In [20]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
# We fit our model with our train data
xgb.fit(
    X_train, y_train,
    # That means if model don't improve it self in 5 rounds, it will stop learning
    # So you can save your time and don't overtrain your model.
    early_stopping_rounds=5,
    # We provide Test data's to evaluate model performance
    eval_set=[(X_test, y_test)],
    verbose=False
 )
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predictions and They Actual Values
# print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])



Actual: 6250    1
3372    0
3722    1
5579    0
3473    1
5831    0
6836    1
4901    0
2791    0
7375    1
Name: result, dtype: int64


In [24]:
import sklearn.model_selection as model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(clean_data.loc[:, clean_data.columns != 'result'], clean_data['result'], train_size=0.7,test_size=0.1, random_state=101)

In [25]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(870, 6)
(125, 6)
(224, 6)
(870,)
(125,)
(224,)
