In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_validate
import seaborn as sns
import matplotlib.pyplot as plt

myData=pd.read_csv('imputedData.csv')
myData.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [2]:
#One hot encoding of categorical variables

#Create list with features to be dummified cols.
nonum_feats_names = ['Month','OperatingSystems','Browser','Region','VisitorType','Weekend']


#Boolean to dummify logic
#Weekend_map={False:0,True:1}
#pd.get_dummies(myData['Weekend'].map(Weekend_map).astype('category')


dataModel = pd.concat([myData[['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']],
                       pd.get_dummies(myData[nonum_feats_names].astype('category')),myData['Revenue']],axis=1)
                      
dataModel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 57 columns):
Administrative                   12330 non-null int64
Administrative_Duration          12330 non-null float64
Informational                    12330 non-null int64
Informational_Duration           12330 non-null float64
ProductRelated                   12330 non-null int64
ProductRelated_Duration          12330 non-null float64
BounceRates                      12330 non-null float64
ExitRates                        12330 non-null float64
PageValues                       12330 non-null float64
TrafficType                      12330 non-null int64
SpecialDay                       12330 non-null float64
Month_Aug                        12330 non-null uint8
Month_Dec                        12330 non-null uint8
Month_Feb                        12330 non-null uint8
Month_Jul                        12330 non-null uint8
Month_June                       12330 non-null uint8
Month_Mar    

In [3]:
#Label Encoding of revenue

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
myData['Revenue'] = le.fit_transform(myData['Revenue'])
myData['Revenue'].value_counts()

myData['Revenue'].head()

0    0
1    0
2    0
3    0
4    0
Name: Revenue, dtype: int64

In [4]:
# getting dependent and independent variables

x = dataModel
# removing the target column revenue from x
x = x.drop(['Revenue'], axis = 1)

y = myData['Revenue']

# checking the shapes
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

Shape of x: (12330, 56)
Shape of y: (12330,)


In [5]:
# splitting the data

from sklearn.model_selection import train_test_split

x_baseTrain, x_baseTest, y_baseTrain, y_baseTest = train_test_split(x, y, test_size = 0.3, random_state = 42)

# checking the shapes

print("Shape of x_train :", x_baseTrain.shape)
print("Shape of y_train :", y_baseTrain.shape)
print("Shape of x_test :", x_baseTest.shape)
print("Shape of y_test :", y_baseTest.shape)

Shape of x_train : (8631, 56)
Shape of y_train : (8631,)
Shape of x_test : (3699, 56)
Shape of y_test : (3699,)


In [6]:
#SCALING WITH STANDARD Z SCORE SCALER
#Scaling the data first fitting it and transforming the training set
#to later apply the fit to transform the test set.
from sklearn.preprocessing import StandardScaler

num_cols_names = ['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']

#Instantiate Satandard Scaler
scaler = StandardScaler()
#Fit transform the numerical features in the training dataset to a new dataframe
scaled_numfeats_train = pd.DataFrame(scaler.fit_transform(x_baseTrain[num_cols_names]), 
                                     columns=num_cols_names, index= x_baseTrain.index)
#Integrate scaled values to the training set
for col in num_cols_names:
    x_baseTrain[col] = scaled_numfeats_train[col]
    
    
#Transform the numerical features inthe training dataset to a new dataframe
scaled_numfeats_test = pd.DataFrame(scaler.transform(x_baseTest[num_cols_names]),
                                    columns=num_cols_names, index= x_baseTest.index)
#Integrate scaled values to the test set
for col in num_cols_names:
    x_baseTest[col] = scaled_numfeats_test[col]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
pd.set_option('display.max_columns', 60)
x_baseTest.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,TrafficType,SpecialDay,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_1,OperatingSystems_2,OperatingSystems_3,OperatingSystems_4,OperatingSystems_5,OperatingSystems_6,OperatingSystems_7,OperatingSystems_8,Browser_1,Browser_2,Browser_3,Browser_4,Browser_5,Browser_6,Browser_7,Browser_8,Browser_9,Browser_10,Browser_11,Browser_12,Browser_13,Region_1,Region_2,Region_3,Region_4,Region_5,Region_6,Region_7,Region_8,Region_9,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_False,Weekend_True
8916,0.211602,0.359573,-0.397052,-0.252949,0.366979,-0.079307,-0.361858,-0.615653,-0.314838,1.721701,-0.3047,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
772,1.123404,2.069214,1.16456,1.483566,1.155859,0.744399,-0.407457,-0.786715,-0.204095,-0.515315,-0.3047,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
12250,-0.396266,-0.228151,-0.397052,-0.252949,2.125054,1.769261,-0.439481,-0.620297,-0.131644,-0.515315,-0.3047,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
7793,-0.092332,0.350877,-0.397052,-0.252949,-0.489519,-0.332151,-0.277332,-0.334763,1.631839,-0.0182,-0.3047,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
6601,4.770613,3.059134,4.287783,5.15675,3.07171,2.131509,-0.31342,-0.605466,0.223989,-0.763872,-0.3047,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1


In [8]:
# MODELLING ____ LOGISTIC REGRESSION

model = LogisticRegression()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)
#plt.rcParams['figure.figsize'] = (6, 6)
#sns.heatmap(cm ,annot = True)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.8851813231375275
Testing Accuracy : 0.8821303054879697
ROC AUC Score : 0.6669879752825253
[[3059   65]
 [ 371  204]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3124
           1       0.76      0.35      0.48       575

    accuracy                           0.88      3699
   macro avg       0.83      0.67      0.71      3699
weighted avg       0.87      0.88      0.86      3699





{'fit_time': array([0.0508647 , 0.04986811, 0.05086279, 0.04687428, 0.0498693 ,
       0.04886866, 0.04687405, 0.04587746, 0.04986715, 0.04787302]), 'score_time': array([0.00598288, 0.00797749, 0.00698161, 0.0069809 , 0.00598192,
       0.00698209, 0.00598431, 0.00797844, 0.00598431, 0.00598335]), 'test_accuracy': array([0.87962963, 0.86805556, 0.88310185, 0.88876014, 0.89455388,
       0.88991889, 0.87369641, 0.89339513, 0.88515081, 0.88283063]), 'test_precision': array([0.734375  , 0.67857143, 0.76190476, 0.76056338, 0.78378378,
       0.75675676, 0.66216216, 0.75308642, 0.77419355, 0.72222222]), 'test_recall': array([0.35074627, 0.28358209, 0.35820896, 0.40601504, 0.43609023,
       0.42105263, 0.36842105, 0.45864662, 0.36090226, 0.39097744]), 'test_f1_score': array([0.47474747, 0.4       , 0.48730964, 0.52941176, 0.56038647,
       0.5410628 , 0.47342995, 0.57009346, 0.49230769, 0.50731707])}
Mean Accuracy : 0.8839092925146556
Mean Standard Deviation : 0.008001567066819103
Mean pre

In [9]:
# MODELLING ____RANDOM FOREST

model = RandomForestClassifier()  #can try estimators=200,mex depth=30
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))



Training Accuracy : 0.9932800370756575
Testing Accuracy : 0.8972695323060287
ROC AUC Score : 0.7305834214774815
[[3038   86]
 [ 294  281]]
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      3124
           1       0.77      0.49      0.60       575

    accuracy                           0.90      3699
   macro avg       0.84      0.73      0.77      3699
weighted avg       0.89      0.90      0.89      3699

{'fit_time': array([0.08781338, 0.07878113, 0.07480049, 0.07580185, 0.08375573,
       0.09374881, 0.08981419, 0.07773852, 0.07978129, 0.07180548]), 'score_time': array([0.01595712, 0.01396489, 0.01396179, 0.01396132, 0.0189395 ,
       0.0189147 , 0.0159564 , 0.01600409, 0.01396394, 0.01396394]), 'test_accuracy': array([0.89351852, 0.87615741, 0.90277778, 0.89918888, 0.88760139,
       0.89918888, 0.88876014, 0.89571263, 0.89327146, 0.89443155]), 'test_precision': array([0.73863636, 0.65517241, 0.79761905, 0.76136364, 0.6956521

In [10]:
# MODELLING ____DECISION TREE

model = DecisionTreeClassifier()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 1.0
Testing Accuracy : 0.8637469586374696
ROC AUC Score : 0.7348605466792852
[[2880  244]
 [ 260  315]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      3124
           1       0.56      0.55      0.56       575

    accuracy                           0.86      3699
   macro avg       0.74      0.73      0.74      3699
weighted avg       0.86      0.86      0.86      3699

{'fit_time': array([0.0548532 , 0.05983949, 0.05886984, 0.05684805, 0.05587554,
       0.05884266, 0.05684829, 0.05983973, 0.05984044, 0.05984116]), 'score_time': array([0.00598478, 0.00698137, 0.00698066, 0.00598383, 0.00595856,
       0.00598383, 0.00598359, 0.00598383, 0.00598335, 0.00598836]), 'test_accuracy': array([0.86921296, 0.85069444, 0.86458333, 0.87369641, 0.86442642,
       0.86558517, 0.85515643, 0.82502897, 0.86426914, 0.85266821]), 'test_precision': array([0.57777778, 0.51824818, 0.56589147, 0.59090909, 0.55797101,
       0.562

In [11]:
# MODELLING ____NAIVE BAYES

model = GaussianNB()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.35650561927934193
Testing Accuracy : 0.3646931603135983
ROC AUC Score : 0.6040132494572177
[[ 802 2322]
 [  28  547]]
              precision    recall  f1-score   support

           0       0.97      0.26      0.41      3124
           1       0.19      0.95      0.32       575

    accuracy                           0.36      3699
   macro avg       0.58      0.60      0.36      3699
weighted avg       0.85      0.36      0.39      3699

{'fit_time': array([0.01495981, 0.01595759, 0.01595759, 0.01495934, 0.01595759,
       0.01595712, 0.01496005, 0.01695395, 0.01495981, 0.01595712]), 'score_time': array([0.00797868, 0.00797868, 0.00698233, 0.00797963, 0.00698137,
       0.00698113, 0.00797939, 0.00698185, 0.00797844, 0.00797892]), 'test_accuracy': array([0.36342593, 0.30787037, 0.37847222, 0.4032445 , 0.35341831,
       0.35225956, 0.34762457, 0.33835458, 0.28422274, 0.36542923]), 'test_precision': array([0.19047619, 0.17597765, 0.19236641, 0.19391026, 0.189781

In [12]:
# MODELLING ____KNN

model = KNeighborsClassifier()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))


# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.9027922604564941
Testing Accuracy : 0.8694241686942417
ROC AUC Score : 0.6644321661192452
[[3005  119]
 [ 364  211]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.93      3124
           1       0.64      0.37      0.47       575

    accuracy                           0.87      3699
   macro avg       0.77      0.66      0.70      3699
weighted avg       0.85      0.87      0.85      3699

{'fit_time': array([0.10471988, 0.0679183 , 0.06983972, 0.09081531, 0.07878876,
       0.09275293, 0.07083797, 0.10173941, 0.08579707, 0.08580589]), 'score_time': array([2.2500124 , 2.17783475, 2.05049086, 2.10078144, 1.85304427,
       2.28884196, 1.75929356, 2.21606183, 2.22007656, 2.06843305]), 'test_accuracy': array([0.87152778, 0.86689815, 0.875     , 0.88760139, 0.8783314 ,
       0.89339513, 0.85979143, 0.8829664 , 0.87935035, 0.8712297 ]), 'test_precision': array([0.66666667, 0.63768116, 0.68055556, 0.73684211, 0.6707317

In [13]:
# MODELLING ____Linear SVC

model = LinearSVC()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))



Training Accuracy : 0.8820530645348164
Testing Accuracy : 0.8788861854555285
ROC AUC Score : 0.6473295106608027
[[3072   52]
 [ 396  179]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3124
           1       0.77      0.31      0.44       575

    accuracy                           0.88      3699
   macro avg       0.83      0.65      0.69      3699
weighted avg       0.87      0.88      0.86      3699





{'fit_time': array([0.66422391, 0.63928986, 0.63755202, 0.65570617, 0.8178134 ,
       0.67322683, 0.65823889, 0.65524769, 0.64628434, 0.65526295]), 'score_time': array([0.00698161, 0.00698161, 0.        , 0.0069809 , 0.00698018,
       0.00695467, 0.00797844, 0.00698137, 0.00597191, 0.00696588]), 'test_accuracy': array([0.87847222, 0.86689815, 0.875     , 0.88412514, 0.89107764,
       0.88412514, 0.87137891, 0.89223638, 0.88399072, 0.887471  ]), 'test_precision': array([0.76363636, 0.71111111, 0.74074074, 0.81132075, 0.81967213,
       0.7704918 , 0.66666667, 0.77777778, 0.81132075, 0.8       ]), 'test_recall': array([0.31343284, 0.23880597, 0.29850746, 0.32330827, 0.37593985,
       0.35338346, 0.33082707, 0.42105263, 0.32330827, 0.36090226]), 'test_f1_score': array([0.44444444, 0.3575419 , 0.42553191, 0.46236559, 0.51546392,
       0.48453608, 0.44221106, 0.54634146, 0.46236559, 0.49740933])}
Mean Accuracy : 0.8814775308628698
Mean Standard Deviation : 0.007935384374056097
Mean pre



In [14]:
# MODELLING ____Ada Boost Classifier

model = AdaBoostClassifier()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.8983895261267524
Testing Accuracy : 0.8888888888888888
ROC AUC Score : 0.750454823804487
[[2972  152]
 [ 259  316]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      3124
           1       0.68      0.55      0.61       575

    accuracy                           0.89      3699
   macro avg       0.80      0.75      0.77      3699
weighted avg       0.88      0.89      0.88      3699

{'fit_time': array([0.52459669, 0.49662209, 0.50365257, 0.4886694 , 0.49165893,
       0.48473763, 0.49667144, 0.48766875, 0.49171066, 0.48966432]), 'score_time': array([0.05789495, 0.05186176, 0.05388212, 0.0538795 , 0.05285907,
       0.05384898, 0.05385542, 0.05186176, 0.05385685, 0.05289793]), 'test_accuracy': array([0.90046296, 0.87384259, 0.88657407, 0.90382387, 0.91772885,
       0.89455388, 0.8783314 , 0.88412514, 0.90023202, 0.88863109]), 'test_precision': array([0.73076923, 0.62626263, 0.66363636, 0.73148148, 0.76271186

In [15]:
# MODELLING ____XGBOOST classifier

model = XGBClassifier()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.9210983663538408
Testing Accuracy : 0.8991619356582861
ROC AUC Score : 0.7785317040583422
[[2979  145]
 [ 228  347]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3124
           1       0.71      0.60      0.65       575

    accuracy                           0.90      3699
   macro avg       0.82      0.78      0.80      3699
weighted avg       0.89      0.90      0.90      3699

{'fit_time': array([0.82878375, 0.8188417 , 0.80584502, 0.82681513, 0.82878351,
       0.82080436, 0.81182957, 0.81684232, 0.81285262, 0.81781292]), 'score_time': array([0.01396298, 0.01395726, 0.01296496, 0.01396251, 0.01396298,
       0.01396298, 0.01396251, 0.01496005, 0.01596117, 0.01396298]), 'test_accuracy': array([0.91203704, 0.8900463 , 0.90972222, 0.92468134, 0.9212051 ,
       0.91425261, 0.89918888, 0.91193511, 0.90603248, 0.90487239]), 'test_precision': array([0.75892857, 0.68224299, 0.75      , 0.80357143, 0.7876106

In [16]:
# MODELLING ____gradient boosting Classifier

model = GradientBoostingClassifier()
model.fit(x_baseTrain, y_baseTrain)

y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, y_basePred)
print(cr)


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = model, X = x_baseTrain, y = y_baseTrain, cv = 10,scoring=scoring)
print(cvs)
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Mean Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Mean Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Mean Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Mean Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

Training Accuracy : 0.9230680106592515
Testing Accuracy : 0.894566098945661
ROC AUC Score : 0.7694252073707064
[[2971  153]
 [ 237  338]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3124
           1       0.69      0.59      0.63       575

    accuracy                           0.89      3699
   macro avg       0.81      0.77      0.79      3699
weighted avg       0.89      0.89      0.89      3699

{'fit_time': array([1.16888905, 1.09604263, 1.135988  , 1.05318379, 1.23768973,
       1.10102868, 1.10113382, 1.22969556, 1.09910202, 1.08110523]), 'score_time': array([0.0109551 , 0.00997305, 0.01097035, 0.01097035, 0.01097083,
       0.01099777, 0.01196623, 0.0109601 , 0.01097012, 0.01097751]), 'test_accuracy': array([0.90856481, 0.88194444, 0.90625   , 0.9188876 , 0.91425261,
       0.90845886, 0.89571263, 0.90961761, 0.90603248, 0.89791183]), 'test_precision': array([0.74774775, 0.64545455, 0.74311927, 0.79439252, 0.76576577