In [None]:
# mount drive
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sbn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import math # for logging in scaling
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# create path to training data and import data set
path = "/content/gdrive/My Drive/MLProject"
df = pd.read_csv(path + "/train.csv")


In [None]:
# look at the data. Descriptives of the raw data
df.dtypes
df.describe()

In [None]:
# change timestamp into datetime and add month, hour, time_of_day, season to the features

df['Timestamp'] = df['Timestamp'].apply(pd.to_datetime)
df['month'] = df['Timestamp'].dt.month
df['hour'] = df['Timestamp'].dt.hour

## Create the time_of_day by cutting
df['time_of_day'] = pd.cut(df['hour'], bins = [-1, 9, 16, 24],labels = ['morning', 'afternoon', 'evening'],  right = True)

# cut months into seasons
df['season']= pd.cut(x = df['month'],bins = [0, 3, 6], right  = True, labels = ['winter', 'spring'])

In [None]:
# im gonna make some exploratory plots.
# barplots
# heat maps (grid plots)
# histograms
# boxplots
# scatterplots



In [None]:
# barplots
# group by
times  = df.groupby(['time_of_day'], as_index = False).count()[['time_of_day', 'id']]
sbn.catplot(data = times, x = 'time_of_day', y = 'id', hue = 'time_of_day', kind = 'bar')

# group season and time
seas = df.groupby(['season', 'time_of_day'], as_index = False).count()[['season', 'time_of_day', 'id']]
seas = seas.pivot("time_of_day", "season", "id")
sbn.heatmap(data = seas)
#seas

In [None]:
# boxplot Time spent vs gender
sbn.boxplot(data = df, x = 'gender', y = 'Daily Time Spent on Site', color= "red")

In [None]:
# boxplot Time spent vs Clicked
sbn.boxplot(data = df, x = 'Clicked', y = 'Daily Time Spent on Site', color = 'blue')

In [None]:
# histogram
sbn.histplot(data = df, x = 'Area Income', hue = 'Clicked', kde = True)


In [None]:
sbn.histplot(data = df, x = 'Daily Time Spent on Site', kde = True, hue = 'Clicked')

In [None]:
country_counts = pd.DataFrame(df['Country'].value_counts())
country_counts  = country_counts.rename(columns = {'Country':'count'})

country_counts

# How to extract counts of values
# country_counts.loc['Ghana', 'count']

# 'Ghana' in country_counts.index

In [None]:
# Checking for missing values

index_ = df[df['Area Income'].isna()].index
index_

# countries included
country_inc = df[['Country', 'Area Income']].groupby('Country', as_index = False).mean()
[len(index_), len(country_inc)]

# Overall mean of Area Income for training data
train_mean_area_income = df['Area Income'].mean()
train_mean_area_income

55343.01455483878

In [None]:
# fixing missing values
for ind in index_:
  country = df.loc[ind]['Country']
  if int(country_counts.loc[country]) >= 2:
    mean_val = float(country_inc[country_inc['Country'] == country]['Area Income'])
    df.loc[ind, 'Area Income'] = mean_val
  elif int(country_counts.loc[country]) == 1 or country not in country_counts.index:
    df.loc[ind, 'Area Income'] = train_mean_area_income

In [None]:
df.isnull().sum()
df.shape

(1000, 14)

Im gonna fit a decision tree to the df

In [None]:
#Use only the complete case
df_clean = df[~df['Area Income'].isnull()]
df_clean['time_of_day']
df_clean.shape

(1000, 14)

In [None]:
# label encode 'time_of_day'

# create instance of labelencoder
labelencoder = LabelEncoder()
df_clean['tod'] = labelencoder.fit_transform(df_clean['time_of_day'])
df_clean.head(10)


In [None]:
# label encode season
labelencoder = LabelEncoder()
df_clean['seasons'] = labelencoder.fit_transform(df_clean['season'])
df_clean.head(10)

In [None]:
# Now I can onehotencode seasons and tod
enc = OneHotEncoder(handle_unknown = 'ignore')

enc_df = pd.DataFrame(enc.fit_transform(df_clean[['seasons']]).toarray(), columns = ['Spring', 'Winter'])
df_clean = df_clean.join(enc_df)
df_clean.head(10)

In [None]:
# Now OneHotEncode the tod
enc = OneHotEncoder(handle_unknown = 'ignore')
enc_df = pd.DataFrame(enc.fit_transform(df_clean[['tod']]).toarray(), columns = ['Afternoon', 'Evening', 'Morning'])
df_clean = df_clean.join(enc_df)
df_clean.head(10)

In [None]:
# onehotencode gender
enc = OneHotEncoder(handle_unknown = 'ignore')
enc_df = pd.DataFrame(enc.fit_transform(df_clean[['gender']]).toarray(), columns = ['female', 'male'])
enc_df.head(10)

df_clean = df_clean.join(enc_df)
df_clean = df_clean.drop(['gender'], axis = 1)
df_clean.head(10)

In [None]:
# 
df_clean.dtypes

In [None]:
df_clean = df_clean.drop(['id', 'Timestamp', 'time_of_day', 'Country', 'season', 'seasons', 'tod', 'Ad Topic Line', 'month', 'hour'], 
                         axis = 1)
df_clean.head(10)

In [None]:
# Scaling 
df_clean.describe()



In [None]:
# no need to scale in tree-based algorithmns. Only needed in distance based algorithmns

# so I split the data into output and input features
Y = df_clean['Clicked']
X = df_clean.drop('Clicked', axis = 1) 

# split the data into training and test(validation) sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)


In [None]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape


((700, 11), (300, 11), (700,), (300,))

In [None]:
# Train the model now.
# a. Model Training using gini index

classify_gini = DecisionTreeClassifier(criterion = 'gini', random_state = 100, max_depth = 3, min_samples_leaf = 5) # create classifier object
classify_gini.fit(X_train, Y_train) #Train the model
classify_gini


# b. Train model with entropy criterion
classify_entropy = DecisionTreeClassifier(criterion = 'entropy', random_state = 100, max_depth = 3, min_samples_leaf = 5)
classify_entropy.fit(X_train, Y_train)
classify_entropy

In [None]:
# now let us make predictions

y_pred_gini = classify_gini.predict(X_test)
y_pred_entropy = classify_entropy.predict(X_test)

In [None]:
# accuracy of models
# conf matrix
confusion_matrix(Y_test, y_pred_gini) 

array([[107,  51],
       [ 11, 131]])

In [None]:
accuracy_score(Y_test, y_pred_gini)*100

79.33333333333333

In [None]:
accuracy_score(Y_test, y_pred_entropy)*100

79.66666666666666

Refit the model from the preprocessed cleaned data

In [None]:
#######################################################################################

In [None]:
# Load libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder 

In [None]:
# labelencode and # onehotencode
#labelencode time_of_day and season

labelencoder = LabelEncoder()
df['time_of_day'] = labelencoder.fit_transform(df['time_of_day'])
df['season'] = labelencoder.fit_transform(df['season'])


#onehotencode time of day, season, and gender

henc = OneHotEncoder(handle_unknown = 'ignore')
enc_tod_df = pd.DataFrame(henc.fit_transform(df[['time_of_day']]).toarray(), columns = ['afternoon', 'evening', 'morning'])
enc_season_df = pd.DataFrame(henc.fit_transform(df[['season']]).toarray(), columns = ['spring', 'winter'])
enc_gender_df = pd.DataFrame(henc.fit_transform(df[['gender']]).toarray(), columns = ['female', 'male'])




In [None]:
# add the onehotencoded subdataframes to the original dataframe

df = df.join(enc_tod_df.join(enc_season_df.join(enc_gender_df)))

In [None]:
# Drop unwanted features (or columns)

df = df.drop(['id', 'Timestamp', 'time_of_day', 'season', 'Ad Topic Line' , 'gender', 'Country', 'month', 'hour'], axis = 1)
df.head(10)

In [None]:
df.dtypes

In [None]:
# fit the model
# load the libraries
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [None]:
# split the data into traiining and validation sets

X = df.drop(['Clicked'], axis = 1)
Y = df['Clicked']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 50)


In [None]:
# set instances
clf_gini = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, min_samples_leaf = 5, random_state = 50)
clf_entropy = DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, min_samples_leaf = 5, random_state = 50)

# train models
clf_gini.fit(X_train, Y_train)
clf_entropy.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=50, splitter='best')

In [None]:
# make predictions on test set and compute the confusion matrix
y_pred_gini = clf_gini.predict(X_test)
y_pred_entropy = clf_entropy.predict(X_test)

In [None]:
confusion_matrix(Y_test, y_pred_gini)

array([[131,  21],
       [ 28, 120]])

In [None]:
confusion_matrix(Y_test, y_pred_entropy)

array([[131,  21],
       [ 28, 120]])

In [None]:
accuracy_score(Y_test, y_pred_gini)*100

83.66666666666667

In [None]:
accuracy_score(Y_test, y_pred_entropy)*100

83.66666666666667

In [None]:
print(classification_report(Y_test, y_pred_gini))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       152
           1       0.85      0.81      0.83       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300



In [None]:
###################################################################################################################################


In [None]:
# Fit a random forest
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [None]:
# split the data into training and test (validation) sets.
X = df.drop(['Clicked'], axis = 1)
y = df['Clicked']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 55)

In [None]:
# fit the randomforest model
clf = RandomForestClassifier(n_estimators = 20)
clf.fit(X_train, y_train)

# Prediction
y_pred = clf.predict(X_test)
y_pred

print(metrics.confusion_matrix(y_pred, y_test))
metrics.accuracy_score(y_test, y_pred)


[[125  20]
 [ 18 137]]


0.8733333333333333

In [None]:
# calculating feature importance
feat= pd.DataFrame(clf.feature_importances_, columns = ['Importance'])
col = pd.DataFrame(df.columns, columns = ['Features'])
feat_col = col.join(feat)
feat_col.sort_values(by = ['Importance'], ascending = False)

In [None]:
###########################################################################################################

In [None]:
############# Bagging by decision trees ############
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
clf = BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 5, criterion = 'gini'), 
                        n_estimators = 1000, random_state = 0).fit(X_train, y_train)

In [None]:
y_test = clf.predict(X_test)
confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)*100

92.33333333333333

In [None]:
#######################################################################
# fit a bagging classifier with decision tree

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [None]:
X = df.drop(['Clicked'], axis = 1)
y = df['Clicked']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 50)

In [None]:
clf = BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 5), n_estimators = 100).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred, y_test)*100)



[[129  17]
 [ 23 131]]
86.66666666666667


In [None]:
# extract feature
for i in [0,2,3]:
    if i%2 == 0:
        print("even")
    elif i == 0:
        print("zero")
    else:
        print("odd")

even
even
odd
