In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings

In [48]:
pd.set_option('display.max_columns', 60)
myData=pd.read_csv('imputedData.csv')
myData.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [49]:
#One hot encoding of categorical variables

#Create list with features to be dummified cols.
nonum_feats_names = ['Month','OperatingSystems','Browser','Region','VisitorType','Weekend']


dataModel = pd.concat([myData[['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']],
                       pd.get_dummies(myData[nonum_feats_names].astype('category')),myData['Revenue']],axis=1)
                      
dataModel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 57 columns):
Administrative                   12330 non-null int64
Administrative_Duration          12330 non-null float64
Informational                    12330 non-null int64
Informational_Duration           12330 non-null float64
ProductRelated                   12330 non-null int64
ProductRelated_Duration          12330 non-null float64
BounceRates                      12330 non-null float64
ExitRates                        12330 non-null float64
PageValues                       12330 non-null float64
TrafficType                      12330 non-null int64
SpecialDay                       12330 non-null float64
Month_Aug                        12330 non-null uint8
Month_Dec                        12330 non-null uint8
Month_Feb                        12330 non-null uint8
Month_Jul                        12330 non-null uint8
Month_June                       12330 non-null uint8
Month_Mar    

In [50]:
#Label Encoding of revenue

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
myData['Revenue'] = le.fit_transform(myData['Revenue'])
myData['Revenue'].value_counts()

myData['Revenue'].head()

# getting dependent and independent variables

x = dataModel
# removing the target column revenue from x
x = x.drop(['Revenue'], axis = 1)

y = myData['Revenue']

# checking the shapes
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

Shape of x: (12330, 56)
Shape of y: (12330,)


In [51]:

df_majority=dataModel[dataModel.Revenue==0] ## all rows where Revenue==0
df_minority=dataModel[dataModel.Revenue==1] ## all rows where Revenue==1

df_minority_upsampled=resample(df_minority,replace=True,n_samples=10422)
df_upsampled=pd.concat([df_minority_upsampled,df_majority])

df_upsampled.info()
print(df_upsampled['Revenue'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20844 entries, 10385 to 12329
Data columns (total 57 columns):
Administrative                   20844 non-null int64
Administrative_Duration          20844 non-null float64
Informational                    20844 non-null int64
Informational_Duration           20844 non-null float64
ProductRelated                   20844 non-null int64
ProductRelated_Duration          20844 non-null float64
BounceRates                      20844 non-null float64
ExitRates                        20844 non-null float64
PageValues                       20844 non-null float64
TrafficType                      20844 non-null int64
SpecialDay                       20844 non-null float64
Month_Aug                        20844 non-null uint8
Month_Dec                        20844 non-null uint8
Month_Feb                        20844 non-null uint8
Month_Jul                        20844 non-null uint8
Month_June                       20844 non-null uint8
Month_Mar

In [52]:
X_upsampled = df_upsampled.drop('Revenue',1) ## This is the dependent variable

le1 = LabelEncoder()
df_upsampled['Revenue'] = le1.fit_transform(df_upsampled['Revenue'])
df_upsampled['Revenue'].value_counts()

y_upsampled = df_upsampled['Revenue']

# checking the shapes
print("Shape of x upsampled:", X_upsampled.shape)
print("Shape of y upsampled:", y_upsampled.shape)

Shape of x upsampled: (20844, 56)
Shape of y upsampled: (20844,)


In [54]:
# splitting the data

from sklearn.model_selection import train_test_split

x_baseTrain, x_baseTest, y_baseTrain, y_baseTest = train_test_split(x, y, test_size = 0.3, random_state = 42)
x_baseTrain_upsampled, x_baseTest_upsampled, y_baseTrain_upsampled, y_baseTest_upsampled = train_test_split(X_upsampled, y_upsampled, test_size = 0.3, random_state = 42) 

# checking the shapes

print("Shape of x_train :", x_baseTrain.shape)
print("Shape of y_train :", y_baseTrain.shape)
print("Shape of x_test :", x_baseTest.shape)
print("Shape of y_test :", y_baseTest.shape)

print("Shape of x_train upsampled :", x_baseTrain_upsampled.shape)
print("Shape of y_train upsampled :", y_baseTrain_upsampled.shape)
print("Shape of x_test upsampled :", x_baseTest_upsampled.shape)
print("Shape of y_test upsampled :", y_baseTest_upsampled.shape)

Shape of x_train : (8631, 56)
Shape of y_train : (8631,)
Shape of x_test : (3699, 56)
Shape of y_test : (3699,)
Shape of x_train upsampled : (14590, 56)
Shape of y_train upsampled : (14590,)
Shape of x_test upsampled : (6254, 56)
Shape of y_test upsampled : (6254,)


In [55]:
#SCALING WITH STANDARD Z SCORE SCALER
#Scaling the data first fitting it and transforming the training set
#to later apply the fit to transform the test set.
from sklearn.preprocessing import StandardScaler

num_cols_names = ['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']

#Instantiate Satandard Scaler
scaler = StandardScaler()
#Fit transform the numerical features in the training dataset to a new dataframe
scaled_numfeats_train = pd.DataFrame(scaler.fit_transform(x_baseTrain[num_cols_names]), 
                                     columns=num_cols_names, index= x_baseTrain.index)
#Integrate scaled values to the training set
for col in num_cols_names:
    x_baseTrain[col] = scaled_numfeats_train[col]
    
    
#Transform the numerical features inthe training dataset to a new dataframe
scaled_numfeats_test = pd.DataFrame(scaler.transform(x_baseTest[num_cols_names]),
                                    columns=num_cols_names, index= x_baseTest.index)
#Integrate scaled values to the test set
for col in num_cols_names:
    x_baseTest[col] = scaled_numfeats_test[col]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [56]:
#Scaling same as above for Upsampled data
from sklearn.preprocessing import StandardScaler

num_cols_names = ['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']

#Instantiate Satandard Scaler
scaler = StandardScaler()
#Fit transform the numerical features in the training dataset to a new dataframe
scaled_numfeats_train = pd.DataFrame(scaler.fit_transform(x_baseTrain_upsampled[num_cols_names]), 
                                     columns=num_cols_names, index= x_baseTrain_upsampled.index)
#Integrate scaled values to the training set
for col in num_cols_names:
    x_baseTrain_upsampled[col] = scaled_numfeats_train[col]
    
    
#Transform the numerical features inthe training dataset to a new dataframe
scaled_numfeats_test = pd.DataFrame(scaler.transform(x_baseTest_upsampled[num_cols_names]),
                                    columns=num_cols_names, index= x_baseTest_upsampled.index)
#Integrate scaled values to the test set
for col in num_cols_names:
    x_baseTest_upsampled[col] = scaled_numfeats_test[col]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [57]:
x_baseTrain.drop(labels=['Browser_1','BounceRates','ProductRelated','VisitorType_Returning_Visitor'], 
                 axis=1, inplace=True)
x_baseTest.drop(labels=['Browser_1','BounceRates','ProductRelated','VisitorType_Returning_Visitor'], 
                axis=1, inplace=True)

x_baseTrain_upsampled.drop(labels=['Browser_1','BounceRates','ProductRelated','VisitorType_Returning_Visitor'], 
                 axis=1, inplace=True)
x_baseTest_upsampled.drop(labels=['Browser_1','BounceRates','ProductRelated','VisitorType_Returning_Visitor'], 
                axis=1, inplace=True)

x_baseTrain_upsampled.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated_Duration,ExitRates,PageValues,TrafficType,SpecialDay,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_1,OperatingSystems_2,OperatingSystems_3,OperatingSystems_4,OperatingSystems_5,OperatingSystems_6,OperatingSystems_7,OperatingSystems_8,Browser_2,Browser_3,Browser_4,Browser_5,Browser_6,Browser_7,Browser_8,Browser_9,Browser_10,Browser_11,Browser_12,Browser_13,Region_1,Region_2,Region_3,Region_4,Region_5,Region_6,Region_7,Region_8,Region_9,VisitorType_New_Visitor,VisitorType_Other,Weekend_False,Weekend_True
5560,-0.780725,-0.512219,-0.443854,-0.283636,-0.278072,-0.744065,-0.21388,-0.513765,-0.263798,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
10039,-0.780725,-0.512219,0.984427,-0.047203,-0.033733,-0.53946,-0.50683,-0.513765,-0.263798,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
8562,-0.496687,-0.512219,-0.443854,-0.283636,-0.324719,1.886639,-0.50683,0.482557,-0.263798,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
10766,0.071389,0.691343,-0.443854,-0.283636,-0.704603,-0.006484,-0.50683,0.482557,-0.263798,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
6294,-0.780725,-0.512219,-0.443854,-0.283636,-0.509034,1.697327,-0.50683,-0.264684,-0.263798,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [58]:
X_train = x_baseTrain.drop(['Weekend_True','Region_2','Region_1','Month_Jul','Month_Feb','Informational','Browser_9','Browser_12',
                        'Browser_11','Region_8','Region_3','OperatingSystems_6','OperatingSystems_1','Month_June',
                        'Browser_8','Browser_7','Browser_3','Browser_13','Browser_10','VisitorType_Other','Region_8','Region_7',
                        'Region_5','OperatingSystems_5','Month_Aug'],axis=1)
X_test = x_baseTest.drop(['Weekend_True','Region_2','Region_1','Month_Jul','Month_Feb','Informational','Browser_9','Browser_12',
                        'Browser_11','Region_8','Region_3','OperatingSystems_6','OperatingSystems_1','Month_June',
                        'Browser_8','Browser_7','Browser_3','Browser_13','Browser_10','VisitorType_Other','Region_8','Region_7',
                        'Region_5','OperatingSystems_5','Month_Aug'],axis=1)

print(X_train.shape, X_test.shape)



X_train_upsampled = x_baseTrain_upsampled.drop(['Weekend_True','Region_2','Region_1','Month_Jul','Month_Feb','Informational','Browser_9','Browser_12',
                        'Browser_11','Region_8','Region_3','OperatingSystems_6','OperatingSystems_1','Month_June',
                        'Browser_8','Browser_7','Browser_3','Browser_13','Browser_10','VisitorType_Other','Region_8','Region_7',
                        'Region_5','OperatingSystems_5','Month_Aug'],axis=1)
X_test_upsampled = x_baseTest_upsampled.drop(['Weekend_True','Region_2','Region_1','Month_Jul','Month_Feb','Informational','Browser_9','Browser_12',
                        'Browser_11','Region_8','Region_3','OperatingSystems_6','OperatingSystems_1','Month_June',
                        'Browser_8','Browser_7','Browser_3','Browser_13','Browser_10','VisitorType_Other','Region_8','Region_7',
                        'Region_5','OperatingSystems_5','Month_Aug'],axis=1)

print(X_train_upsampled,X_test_upsampled)

(8631, 28) (3699, 28)
       Administrative  Administrative_Duration  Informational_Duration  \
5560        -0.780725                -0.512219               -0.283636   
10039       -0.780725                -0.512219               -0.047203   
8562        -0.496687                -0.512219               -0.283636   
10766        0.071389                 0.691343               -0.283636   
6294        -0.780725                -0.512219               -0.283636   
...               ...                      ...                     ...   
934          0.071389                 0.291943               -0.283636   
1688        -0.780725                -0.512219               -0.283636   
5929         0.355427                 0.213671               -0.283636   
2122         0.071389                -0.367470               -0.283636   
6034        -0.780725                -0.512219                0.434516   

       ProductRelated_Duration  ExitRates  PageValues  TrafficType  \
5560               

In [59]:
X_train.head()
X_train_upsampled.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational_Duration,ProductRelated_Duration,ExitRates,PageValues,TrafficType,SpecialDay,Month_Dec,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_2,OperatingSystems_3,OperatingSystems_4,OperatingSystems_7,OperatingSystems_8,Browser_2,Browser_4,Browser_5,Browser_6,Region_4,Region_6,Region_9,VisitorType_New_Visitor,Weekend_False
5560,-0.780725,-0.512219,-0.283636,-0.278072,-0.744065,-0.21388,-0.513765,-0.263798,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
10039,-0.780725,-0.512219,-0.047203,-0.033733,-0.53946,-0.50683,-0.513765,-0.263798,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8562,-0.496687,-0.512219,-0.283636,-0.324719,1.886639,-0.50683,0.482557,-0.263798,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
10766,0.071389,0.691343,-0.283636,-0.704603,-0.006484,-0.50683,0.482557,-0.263798,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0
6294,-0.780725,-0.512219,-0.283636,-0.509034,1.697327,-0.50683,-0.264684,-0.263798,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0


In [60]:
X_train = X_train.drop(['OperatingSystems_4','OperatingSystems_7','OperatingSystems_8','Browser_5','Region_9'],axis=1)
X_test = X_test.drop(['OperatingSystems_4','OperatingSystems_7','OperatingSystems_8','Browser_5','Region_9'],axis=1)

print(X_train.shape, X_test.shape)

X_train_upsampled = X_train_upsampled.drop(['OperatingSystems_4','OperatingSystems_7','OperatingSystems_8','Browser_5','Region_9'],axis=1)
X_test_upsampled = X_test_upsampled.drop(['OperatingSystems_4','OperatingSystems_7','OperatingSystems_8','Browser_5','Region_9'],axis=1)

print(X_train_upsampled,X_test_upsampled)


(8631, 23) (3699, 23)
       Administrative  Administrative_Duration  Informational_Duration  \
5560        -0.780725                -0.512219               -0.283636   
10039       -0.780725                -0.512219               -0.047203   
8562        -0.496687                -0.512219               -0.283636   
10766        0.071389                 0.691343               -0.283636   
6294        -0.780725                -0.512219               -0.283636   
...               ...                      ...                     ...   
934          0.071389                 0.291943               -0.283636   
1688        -0.780725                -0.512219               -0.283636   
5929         0.355427                 0.213671               -0.283636   
2122         0.071389                -0.367470               -0.283636   
6034        -0.780725                -0.512219                0.434516   

       ProductRelated_Duration  ExitRates  PageValues  TrafficType  \
5560               

In [61]:
X_train.head()
X_test.head()
X_train_upsampled.head()
X_test_upsampled.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational_Duration,ProductRelated_Duration,ExitRates,PageValues,TrafficType,SpecialDay,Month_Dec,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_2,OperatingSystems_3,Browser_2,Browser_4,Browser_6,Region_4,Region_6,VisitorType_New_Visitor,Weekend_False
358,0.071389,-0.292415,-0.283636,0.64151,-0.6826,-0.50683,-0.513765,-0.263798,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1
2544,-0.780725,-0.512219,-0.283636,0.02427,2.265264,-0.50683,-0.015604,-0.263798,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1
3853,0.639465,-0.050273,-0.199427,0.046764,-0.351486,-0.189019,-0.513765,-0.263798,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
9741,-0.780725,-0.512219,-0.283636,-0.27373,-0.543216,0.171582,-0.762845,-0.263798,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
6093,-0.780725,-0.512219,-0.283636,-0.575981,1.007689,-0.50683,-0.762845,-0.263798,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1


# Model Building with Hyper parameter tuning

# Logistic Regression : Model Building and Validation

In [62]:
from sklearn.linear_model import LogisticRegression

# Build a model with Base Algorithm and Original Data (Containing Class Imbalance)

In [63]:
classifier_wo = LogisticRegression() 
classifier_wo.fit(X_train,y_baseTrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Confusion Matrix of Original Data and Cross Validation

In [73]:
warnings.filterwarnings("ignore")
#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)


Training Accuracy : 0.8852971845672576
Testing Accuracy : 0.8807785888077859
ROC AUC Score : 0.6654782051995769
******************************************************
CONFUSION MATRIX
[[3055   69]
 [ 372  203]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3124
           1       0.75      0.35      0.48       575

    accuracy                           0.88      3699
   macro avg       0.82      0.67      0.71      3699
weighted avg       0.87      0.88      0.86      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8848365602070792
Standard Deviation : 0.009070549134597861
Mean precision score : 0.7456834105695713
Standard Deviation precision score : 0.04275105678498037
Mean recall score : 0.3842273594433846
Standard Deviation recall score : 0.05415403190529826
Mean f1 score : 0

In [74]:
# AUC is high. Since Std Dev AUC ROC Score: 0.010, model may overfit.

# With upsampled data

In [80]:
classifier_up = LogisticRegression() 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Confusion Matrix of Upsampled Data and Cross Validation

In [79]:
#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)
warnings.filterwarnings("default")

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.7169505271695052
Testing Accuracy : 0.7172208705055421
ROC AUC Score : 0.8049153816177699
******************************************************
CONFUSION MATRIX
[[2117 1007]
 [  39  536]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.98      0.68      0.80      3124
           1       0.35      0.93      0.51       575

    accuracy                           0.72      3699
   macro avg       0.66      0.80      0.65      3699
weighted avg       0.88      0.72      0.76      3699

******************************************************




CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8148043558720477
Standard Deviation : 0.00951348249278598
Mean precision score : 0.8519383647670129
Standard Deviation precision score : 0.012357093764549713
Mean recall score : 0.7611310972383875
Standard Deviation recall score : 0.018276950744725944
Mean f1 score : 0.8038110143358418
Standard Deviation f1 score : 0.011187013717772667




Mean AUC ROC Score: 0.9047526347522156
Std Dev AUC ROC Score: 0.0038167401521151403


# LG : Model Tuning

In [82]:
from sklearn.model_selection import GridSearchCV

classifier = LogisticRegression(penalty='l2')
params = [
    {
     #'model_gs' : [LogisticRegression(class_weight='balanced')],
     #'C' : [0.001, 0.01, 0.1, 1, 10, 40, 50, 200],
    'C' : [0.1, 1],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
]

classifier = GridSearchCV(classifier, param_grid=params, n_jobs=-1)

#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.1, 1],
                          'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                     'saga']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [83]:
#The best hyper parameters set
print("Best Hyper Parameters:",classifier.best_params_)

Best Hyper Parameters: {'C': 1, 'solver': 'newton-cg'}


# Logistic Regression : With Best Hyperparameters (original data)

In [84]:
classifier_wo = LogisticRegression(C=1.0,solver='newton-cg') 
classifier_wo.fit(X_train,y_baseTrain)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.8854130459969876
Testing Accuracy : 0.8810489321438226
ROC AUC Score : 0.6663477704169682
******************************************************
CONFUSION MATRIX
[[3055   69]
 [ 371  204]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3124
           1       0.75      0.35      0.48       575

    accuracy                           0.88      3699
   macro avg       0.82      0.67      0.71      3699
weighted avg       0.87      0.88      0.86      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8850681758029764
Standard Deviation : 0.008871893040880279
Mean precision score : 0.7466448922479012
Standard Deviation precision score : 0.04164592816838322
Mean recall score : 0.3857255077993491
Standard Deviation recall score : 0.052827726750092646
Mean f1 score : 

# Logistic Regression : With Best Hyperparameters (Upsampled data)

In [85]:
classifier_up = LogisticRegression(C=1.0,solver='newton-cg') 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.7166029428803151
Testing Accuracy : 0.7172208705055421
ROC AUC Score : 0.8049153816177699
******************************************************
CONFUSION MATRIX
[[2117 1007]
 [  39  536]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.98      0.68      0.80      3124
           1       0.35      0.93      0.51       575

    accuracy                           0.72      3699
   macro avg       0.66      0.80      0.65      3699
weighted avg       0.88      0.72      0.76      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8150100701792523
Standard Deviation : 0.009450574296018805
Mean precision score : 0.8522223449860243
Standard Deviation precision score : 0.012394443746953991
Mean recall score : 0.7612686488202306
Standard Deviation recall score : 0.01795465052093721
Mean f1 score : 

# Naive Bayes

# With original data

In [86]:
from sklearn.naive_bayes import GaussianNB

classifier_wo = GaussianNB() 
classifier_wo.fit(X_train,y_baseTrain)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.8112617309697602
Testing Accuracy : 0.8031900513652338
ROC AUC Score : 0.7309372042531871
******************************************************
CONFUSION MATRIX
[[2611  513]
 [ 215  360]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.92      0.84      0.88      3124
           1       0.41      0.63      0.50       575

    accuracy                           0.80      3699
   macro avg       0.67      0.73      0.69      3699
weighted avg       0.84      0.80      0.82      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.809754304817584
Standard Deviation : 0.011463008426489911
Mean precision score : 0.4230661395376457
Standard Deviation precision score : 0.024176951758353098
Mean recall score : 0.6370104365391089
Standard Deviation recall score : 0.052261460990708164
Mean f1 score : 

# With upsampled data

In [87]:
classifier_up = GaussianNB() 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)
warnings.filterwarnings("default")

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.7045533541883907
Testing Accuracy : 0.709110570424439
ROC AUC Score : 0.7766998830930247
******************************************************
CONFUSION MATRIX
[[2120 1004]
 [  72  503]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.97      0.68      0.80      3124
           1       0.33      0.87      0.48       575

    accuracy                           0.71      3699
   macro avg       0.65      0.78      0.64      3699
weighted avg       0.87      0.71      0.75      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.7546242036191958
Standard Deviation : 0.00689926986548356
Mean precision score : 0.7446356819036304
Standard Deviation precision score : 0.010920642977614735
Mean recall score : 0.7736388439620903
Standard Deviation recall score : 0.015408031288820706
Mean f1 score : 0

# KNN With original data

In [88]:
from sklearn.neighbors import KNeighborsClassifier

classifier_wo = KNeighborsClassifier() 
classifier_wo.fit(X_train,y_baseTrain)

  return f(*args, **kwds)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [89]:
y_pred=classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
cvs = cross_validate(estimator = classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))

mean_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).mean()
std_roc_auc_score = cross_val_score(estimator = classifier_wo, X = X_train, y = y_baseTrain, scoring="roc_auc", cv = 7).std()

print("Mean AUC ROC Score:",mean_roc_auc_score)
print("Std Dev AUC ROC Score:",std_roc_auc_score)

Training Accuracy : 0.9084694705132661
Testing Accuracy : 0.8751013787510138
ROC AUC Score : 0.6855310916884707
******************************************************
CONFUSION MATRIX
[[3001  123]
 [ 339  236]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      3124
           1       0.66      0.41      0.51       575

    accuracy                           0.88      3699
   macro avg       0.78      0.69      0.72      3699
weighted avg       0.86      0.88      0.86      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8807824164686876
Standard Deviation : 0.01340017748462996
Mean precision score : 0.6871805804123164
Standard Deviation precision score : 0.07090259484661401
Mean recall score : 0.4201829199865335
Standard Deviation recall score : 0.04891935389590114
Mean f1 score : 0.

# KNN with upsampled data

In [95]:
classifier_up = KNeighborsClassifier() 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.7895956436102421
Testing Accuracy : 0.7929170045958367
ROC AUC Score : 0.8539868062127707
******************************************************
CONFUSION MATRIX
[[2391  733]
 [  33  542]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.99      0.77      0.86      3124
           1       0.43      0.94      0.59       575

    accuracy                           0.79      3699
   macro avg       0.71      0.85      0.72      3699
weighted avg       0.90      0.79      0.82      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8466071134025039
Standard Deviation : 0.008500268959792475
Mean precision score : 0.804857042970229
Standard Deviation precision score : 0.00990555855159433
Mean recall score : 0.9142399897214203
Standard Deviation recall score : 0.012054230387079366
Mean f1 score : 0

# KNN HYperparameter tuning

In [100]:
#Default : algorithm='auto', leaf_size=30, metric='minkowski',metric_params=None, n_jobs=None, n_neighbors=5, p=2,weights='uniform'
        
classifier = KNeighborsClassifier()

#Hyper Parameters Set
params = {'n_neighbors':[5,6,7,8,9,10],
          'leaf_size':[30,35,40],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]} ## use entire CPU to execute
#Making models with hyper parameters sets
classifier = GridSearchCV(classifier, param_grid=params, n_jobs=1)
#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set
print("Best Hyper Parameters:\n",classifier.best_params_)



Best Hyper Parameters:
 {'algorithm': 'auto', 'leaf_size': 30, 'n_jobs': -1, 'n_neighbors': 6, 'weights': 'distance'}


# KNN : with Best Hyperparameters (Upsampled data) improve ?


In [101]:
classifier_up = KNeighborsClassifier(algorithm='auto', leaf_size = 30, n_jobs = -1, n_neighbors = 6, weights = 'distance') 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.8267871625535859
Testing Accuracy : 0.8331981616653149
ROC AUC Score : 0.8906056894728053
******************************************************
CONFUSION MATRIX
[[2522  602]
 [  15  560]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      3124
           1       0.48      0.97      0.64       575

    accuracy                           0.83      3699
   macro avg       0.74      0.89      0.77      3699
weighted avg       0.91      0.83      0.85      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8949276642919314
Standard Deviation : 0.008972837893443128
Mean precision score : 0.830979322703777
Standard Deviation precision score : 0.011121910511365662
Mean recall score : 0.9910657602370119
Standard Deviation recall score : 0.004651355711872318
Mean f1 score : 

MAssive improvement in accuracy WOw!!!

# Support Vector Machine original data

In [102]:
from sklearn.svm import SVC

classifier_wo = SVC() 
classifier_wo.fit(X_train,y_baseTrain)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [103]:
y_pred=classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.9014019232997336
Testing Accuracy : 0.8915923222492566
ROC AUC Score : 0.7243842899292992
******************************************************
CONFUSION MATRIX
[[3021  103]
 [ 298  277]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      3124
           1       0.73      0.48      0.58       575

    accuracy                           0.89      3699
   macro avg       0.82      0.72      0.76      3699
weighted avg       0.88      0.89      0.88      3699

******************************************************




CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.895843338393212
Standard Deviation : 0.008855928326330366
Mean precision score : 0.748901465144362
Standard Deviation precision score : 0.05456076026848492
Mean recall score : 0.49367074402423966
Standard Deviation recall score : 0.0303864846566098
Mean f1 score : 0.594175253210864
Standard Deviation f1 score : 0.03229001170528599
Mean AUC ROC Score : 0.7314895238808835
Standard Deviation AUC ROC Score : 0.016074295501804992


# SVM with upsampled data

In [104]:
classifier_up = SVC() 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))



Training Accuracy : 0.8167072181670721
Testing Accuracy : 0.8194106515274399
ROC AUC Score : 0.8540625173968713
******************************************************
CONFUSION MATRIX
[[2511  613]
 [  55  520]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.98      0.80      0.88      3124
           1       0.46      0.90      0.61       575

    accuracy                           0.82      3699
   macro avg       0.72      0.85      0.75      3699
weighted avg       0.90      0.82      0.84      3699

******************************************************




CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.8360518884601479
Standard Deviation : 0.01000285319016244
Mean precision score : 0.8807832013550698
Standard Deviation precision score : 0.016703942127305263
Mean recall score : 0.7768027192889642
Standard Deviation recall score : 0.018340283934152705
Mean f1 score : 0.8253006625875475
Standard Deviation f1 score : 0.01103858835700299
Mean AUC ROC Score : 0.8358980518020924
Standard Deviation AUC ROC Score : 0.010001059850461489


# SVM Hyper parameter tuning

In [105]:
#Default:C=1.0, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape='ovr', degree=3, gamma='auto_deprecated',    kernel='rbf', max_iter=-1, probability=False, random_state=None,shrinking=True, tol=0.001, verbose=False

        
classifier = SVC()

#Hyper Parameters Set
params = {'C': [0.1,1,5,7,8,9,10, 100, 1000], 'gamma': [1,0.1,0.01,0.002,0.0005,0.001,0.0001], 
          'kernel': ['rbf']}
#Making models with hyper parameters sets
classifier = GridSearchCV(classifier, param_grid=params, n_jobs=1)
#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set
print("Best Hyper Parameters:\n",classifier.best_params_)
print(classifier.best_estimator_)



Best Hyper Parameters:
 {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


# SVM : with Best Hyperparameters (Upsampled data) improve ?

In [106]:
classifier_up = SVC(C=1000, gamma=1,kernel='rbf') 
classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

#classifier.fit(X_train_upsampled,y_train_upsampled)
y_pred=classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy Logistic Regression:", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.8698876144131619
Testing Accuracy : 0.8664503919978372
ROC AUC Score : 0.8457262149974949
******************************************************
CONFUSION MATRIX
[[2736  388]
 [ 106  469]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.96      0.88      0.92      3124
           1       0.55      0.82      0.66       575

    accuracy                           0.87      3699
   macro avg       0.75      0.85      0.79      3699
weighted avg       0.90      0.87      0.88      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy Logistic Regression: 0.946881639817771
Standard Deviation : 0.0068940065867908835
Mean precision score : 0.9162688515721484
Standard Deviation precision score : 0.007154853307981309
Mean recall score : 0.9833685399882098
Standard Deviation recall score : 0.007288728514521724
Mean f1 score :

# Ensemble, Bagging and Boosting algorithms

# AdaBoost Classifier original data

In [114]:
from sklearn. ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

adb_classifier_wo = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 3, learning_rate = 0.001)
adb_classifier_wo.fit(X_train,y_baseTrain)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                          

In [115]:
y_pred=adb_classifier_wo.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", adb_classifier_wo.score(X_train, y_baseTrain))
print("Testing Accuracy :", adb_classifier_wo.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = adb_classifier_wo, X = X_train, y = y_baseTrain, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 1.0
Testing Accuracy : 0.8556366585563666
ROC AUC Score : 0.7222543561765852
******************************************************
CONFUSION MATRIX
[[2861  263]
 [ 271  304]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      3124
           1       0.54      0.53      0.53       575

    accuracy                           0.86      3699
   macro avg       0.72      0.72      0.72      3699
weighted avg       0.85      0.86      0.86      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.8608515237911879
Standard Deviation : 0.0071848081434893575
Mean precision score : 0.5493895977343644
Standard Deviation precision score : 0.026364367571059623
Mean recall score : 0.5611323083828974
Standard Deviation recall score : 0.03346532500082968
Mean f1 score : 0.5544886066684707
Standard Devia

# Adaboost classifier Upsampled data

In [116]:
adb_classifier_up = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 3, learning_rate = 0.001)
adb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=adb_classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", adb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", adb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = adb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.49959448499594483
Testing Accuracy : 0.5298729386320628
ROC AUC Score : 0.6017630685297556
******************************************************
CONFUSION MATRIX
[[1554 1570]
 [ 169  406]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.90      0.50      0.64      3124
           1       0.21      0.71      0.32       575

    accuracy                           0.53      3699
   macro avg       0.55      0.60      0.48      3699
weighted avg       0.79      0.53      0.59      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9500349559383533
Standard Deviation : 0.006963566607892379
Mean precision score : 0.9162348099119285
Standard Deviation precision score : 0.013420884193971111
Mean recall score : 0.9906529165470019
Standard Deviation recall score : 0.0038804917682468904
Mean f1 score : 0.95191896427658

# Adaboost Hyper parameter tuning

In [131]:
#Default algorithm='SAMME.R',
                   #base_estimator=DecisionTreeClassifier(class_weight=None,criterion='gini',max_depth=None,max_features=None,
                    #                                     max_leaf_nodes=None,min_impurity_decrease=0.0,min_impurity_split=None,
                    #                                    min_samples_leaf=1,min_samples_split=2,
                    #                                   min_weight_fraction_leaf=0.0,presort=False,random_state=None,
                    #                                  splitter='best'),learning_rate=0.001, n_estimators=3, random_state=None
                    
                    
classifier = AdaBoostClassifier()

#Hyper Parameters Set
params = {'n_estimators': [3,5,10,25,50,100,200,500],
          'learning_rate': [1,0.1,0.01,0.001],
          'base_estimator':[DecisionTreeClassifier(max_depth=1)]
         }
#Making models with hyper parameters sets
classifier = GridSearchCV(classifier, param_grid=params, n_jobs=-1)
#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set
print("Best Hyper Parameters:\n",classifier.best_params_)



Best Hyper Parameters:
 {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best'), 'learning_rate': 1, 'n_estimators': 500}


# Ada boost with upsampled data Hyperparameters Improve ?

In [132]:
adb_classifier_up = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best'), learning_rate = 1, n_estimators = 500)

adb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=adb_classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", adb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", adb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = adb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.3556945892712316
Testing Accuracy : 0.3833468505001352
ROC AUC Score : 0.6235709514001002
******************************************************
CONFUSION MATRIX
[[ 859 2265]
 [  16  559]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.98      0.27      0.43      3124
           1       0.20      0.97      0.33       575

    accuracy                           0.38      3699
   macro avg       0.59      0.62      0.38      3699
weighted avg       0.86      0.38      0.41      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.8572328058372047
Standard Deviation : 0.010601007490407413
Mean precision score : 0.8710758864682063
Standard Deviation precision score : 0.015012587623592848
Mean recall score : 0.8379615913659931
Standard Deviation recall score : 0.01035257313434504
Mean f1 score : 0.8541321751988047


# Gradient boosting classifier upsampled data

In [133]:
from sklearn.ensemble import GradientBoostingClassifier

gb_classifier_up = GradientBoostingClassifier()
gb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [134]:
y_pred=gb_classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", gb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", gb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = gb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.2427296952844398
Testing Accuracy : 0.25195998918626655
ROC AUC Score : 0.5564287702499582
******************************************************
CONFUSION MATRIX
[[ 358 2766]
 [   1  574]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      0.11      0.21      3124
           1       0.17      1.00      0.29       575

    accuracy                           0.25      3699
   macro avg       0.58      0.56      0.25      3699
weighted avg       0.87      0.25      0.22      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.8687463250514019
Standard Deviation : 0.011102896011624355
Mean precision score : 0.8654231252498675
Standard Deviation precision score : 0.014522138288031046
Mean recall score : 0.872734366733679
Standard Deviation recall score : 0.012540493286439122
Mean f1 score : 0.868987502978196


# Gradient Boosting Hyperparameter tuning

In [135]:
#Defaut : criterion='friedman_mse', init=None,
                           #learning_rate=0.1, loss='deviance', max_depth=3,
                           #max_features=None, max_leaf_nodes=None,
                           #min_impurity_decrease=0.0, min_impurity_split=None,
                           #min_samples_leaf=1, min_samples_split=2,
                           #min_weight_fraction_leaf=0.0, n_estimators=100,
                           #n_iter_no_change=None, presort='auto',
                           #random_state=None, subsample=1.0, tol=0.0001,
                           #validation_fraction=0.1, verbose=0,
                           #warm_start=False          
                    
                    
classifier = GradientBoostingClassifier()

#Hyper Parameters Set
params = {'n_estimators': [100,150,200],
          'learning_rate': [1,0.1,0.01,0.001]
         }
#Making models with hyper parameters sets
classifier = GridSearchCV(classifier, param_grid=params, n_jobs=-1)
#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set
print("Best Hyper Parameters:\n",classifier.best_params_)



Best Hyper Parameters:
 {'learning_rate': 1, 'n_estimators': 200}


# Gradient Boosting with hyperparameters upsampled data Improve ?

In [136]:
gb_classifier_up = GradientBoostingClassifier(n_estimators=200,learning_rate=1)
gb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=gb_classifier_up.predict(X_test)

# evaluating the model
from sklearn.metrics import roc_auc_score
print("Training Accuracy :", gb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", gb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

from sklearn.metrics import classification_report,confusion_matrix
print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate


# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = gb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.391959216776735
Testing Accuracy : 0.3982157339821573
ROC AUC Score : 0.5912219562433891
******************************************************
CONFUSION MATRIX
[[ 972 2152]
 [  74  501]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.93      0.31      0.47      3124
           1       0.19      0.87      0.31       575

    accuracy                           0.40      3699
   macro avg       0.56      0.59      0.39      3699
weighted avg       0.81      0.40      0.44      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9304315880987646
Standard Deviation : 0.0053302015226242795
Mean precision score : 0.8981614308061483
Standard Deviation precision score : 0.009934894896618767
Mean recall score : 0.9707256979609113
Standard Deviation recall score : 0.00539452422644511
Mean f1 score : 0.9329864938438407


# Bagging Tree Classifier with upsampled data

In [138]:
from sklearn import tree
bt_classifier_up = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
bt_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=1,
                                                        splitter='best'),
       

In [139]:
y_pred=bt_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", bt_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", bt_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = bt_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.5625072413393581
Testing Accuracy : 0.5696134090294674
ROC AUC Score : 0.6657328953960919
******************************************************
CONFUSION MATRIX
[[1644 1480]
 [ 112  463]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.94      0.53      0.67      3124
           1       0.24      0.81      0.37       575

    accuracy                           0.57      3699
   macro avg       0.59      0.67      0.52      3699
weighted avg       0.83      0.57      0.63      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9498291944926873
Standard Deviation : 0.006471398435363383
Mean precision score : 0.9170613506608797
Standard Deviation precision score : 0.010983937124609006
Mean recall score : 0.9890041870096891
Standard Deviation recall score : 0.004260057940026731
Mean f1 score : 0.9516329860012496

# Bagging tree hyper parameter tuning

In [140]:
#Defaut : base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        #criterion='gini',
                                                        #max_depth=None,
                                                        #max_features=None,
                                                        #max_leaf_nodes=None,
                                                        #min_impurity_decrease=0.0,
                                                        #min_impurity_split=None,
                                                        #min_samples_leaf=1,
                                                        #min_samples_split=2,
                                                        #min_weight_fraction_leaf=0.0,
                                                        #presort=False,
                                                        #random_state=1,
                                                        #splitter='best'),
                  #bootstrap=True, bootstrap_features=False, max_features=1.0,
                  #max_samples=1.0, n_estimators=10, n_jobs=None,
                  #oob_score=False, random_state=None, verbose=0,
                  #warm_start=False)      
                    
                    
classifier = BaggingClassifier()

#Hyper Parameters Set
params = {'n_estimators': [10,20,50]
         }
#Making models with hyper parameters sets
classifier = GridSearchCV(classifier, param_grid=params, n_jobs=-1)
#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set
print("Best Hyper Parameters:\n",classifier.best_params_)



Best Hyper Parameters:
 {'n_estimators': 50}


# Bagging tree classifier with hyperparameters upsampled data improve?

In [151]:
bt_classifier_up = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1),n_estimators=50)
bt_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=bt_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", bt_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", bt_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = bt_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.48905109489051096
Testing Accuracy : 0.5025682616923492
ROC AUC Score : 0.6657729777876746
******************************************************
CONFUSION MATRIX
[[1340 1784]
 [  56  519]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.96      0.43      0.59      3124
           1       0.23      0.90      0.36       575

    accuracy                           0.50      3699
   macro avg       0.59      0.67      0.48      3699
weighted avg       0.85      0.50      0.56      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.951542086925049
Standard Deviation : 0.007072764967788602
Mean precision score : 0.9169145291799401
Standard Deviation precision score : 0.01183648841170082
Mean recall score : 0.9929899708269723
Standard Deviation recall score : 0.0027122465796855866
Mean f1 score : 0.9533957735380468

# Decision Tree classifier upsampled data

In [142]:
dt_classifier_up = DecisionTreeClassifier()
dt_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [143]:
y_pred=dt_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", dt_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", dt_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = dt_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.5143088865716603
Testing Accuracy : 0.5501486888348203
ROC AUC Score : 0.6130573957579468
******************************************************
CONFUSION MATRIX
[[1630 1494]
 [ 170  405]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.91      0.52      0.66      3124
           1       0.21      0.70      0.33       575

    accuracy                           0.55      3699
   macro avg       0.56      0.61      0.49      3699
weighted avg       0.80      0.55      0.61      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9481155507427961
Standard Deviation : 0.00554469281006839
Mean precision score : 0.9131310702042947
Standard Deviation precision score : 0.011224012778789823
Mean recall score : 0.9903781912722766
Standard Deviation recall score : 0.0034798166336836647
Mean f1 score : 0.9501327113348499

# Decision Tree Hyperparameter tuning 

In [144]:
#making the instance
model= DecisionTreeClassifier(random_state=1234)
#Hyper Parameters Set
params = { 
          'criterion':['gini','entropy'],
          'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
          'random_state':[123],
          'max_depth':[1,2,3,4,5,6]
        }
#Making models with hyper parameters sets
classifier = GridSearchCV(model, param_grid=params, n_jobs=-1)

#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set

print("Best Hyper Parameters:",classifier.best_params_)



Best Hyper Parameters: {'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 12, 'random_state': 123}


# Decision tree classifier with hyper parameters upsampled data

In [145]:
dt_classifier_up = DecisionTreeClassifier(criterion='gini',max_depth=6,
                                          max_features='auto',min_samples_leaf=5,min_samples_split=12,random_state=123)
dt_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=dt_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", dt_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", dt_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = dt_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.32441200324412
Testing Accuracy : 0.32684509326845096
ROC AUC Score : 0.5787680231587152
******************************************************
CONFUSION MATRIX
[[ 666 2458]
 [  32  543]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.95      0.21      0.35      3124
           1       0.18      0.94      0.30       575

    accuracy                           0.33      3699
   macro avg       0.57      0.58      0.33      3699
weighted avg       0.83      0.33      0.34      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.6985474409913719
Standard Deviation : 0.04278649094907263
Mean precision score : 0.6757644657229057
Standard Deviation precision score : 0.0730769219329766
Mean recall score : 0.8003323533412943
Standard Deviation recall score : 0.0929437171868205
Mean f1 score : 0.7253426113353685
Stand

# Random Forest upsampled data

In [146]:
rf_classifier_up = RandomForestClassifier(n_estimators=200)
rf_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [147]:
y_pred=rf_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", rf_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", rf_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = rf_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.5064303093500174
Testing Accuracy : 0.5252771019194377
ROC AUC Score : 0.7054692980014474
******************************************************
CONFUSION MATRIX
[[1387 1737]
 [  19  556]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.99      0.44      0.61      3124
           1       0.24      0.97      0.39       575

    accuracy                           0.53      3699
   macro avg       0.61      0.71      0.50      3699
weighted avg       0.87      0.53      0.58      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9544891714388886
Standard Deviation : 0.006229031739294945
Mean precision score : 0.9222246822323855
Standard Deviation precision score : 0.010224598864306317
Mean recall score : 0.9925778829148844
Standard Deviation recall score : 0.0037484476208835284
Mean f1 score : 0.956077453965474

# Random forest hyper parameter tuning

In [149]:
## Hyper Parameter Tuning
print('Parameters currently in use:\n')
print(rf_classifier_up.get_params())
print("\n")

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 300, num = 5)] ## play with start and stop

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 20,num = 5)] ## change 10,20 and 2
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,15]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,10]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(estimator = rf_classifier_up, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_upsampled,y_baseTrain_upsampled)
print("Best Parameters are:",rf_random.best_params_)

Parameters currently in use:

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


{'n_estimators': [50, 112, 175, 237, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 12, 15, 17, 20, None], 'min_samples_split': [2, 5, 10, 15], 'min_samples_leaf': [1, 2, 4, 10], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.4min finished


Best Parameters are: {'n_estimators': 112, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}


# Best hyper parameter tuned RF upsampled data 

In [150]:
best_random = rf_random.best_estimator_
best_random.fit(X_train_upsampled,y_baseTrain_upsampled)


y_pred=best_random.predict(X_test)

# evaluating the model
print("Training Accuracy :", best_random.score(X_train, y_baseTrain))
print("Testing Accuracy :", best_random.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = best_random, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.5210288494960028
Testing Accuracy : 0.539875642065423
ROC AUC Score : 0.7084359516784502
******************************************************
CONFUSION MATRIX
[[1449 1675]
 [  27  548]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.98      0.46      0.63      3124
           1       0.25      0.95      0.39       575

    accuracy                           0.54      3699
   macro avg       0.61      0.71      0.51      3699
weighted avg       0.87      0.54      0.59      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9548321070314062
Standard Deviation : 0.006852974138608084
Mean precision score : 0.9230462351465893
Standard Deviation precision score : 0.011117413111612072
Mean recall score : 0.992303157640159
Standard Deviation recall score : 0.00326374616739111
Mean f1 score : 0.9563891602866548
St

# Voting Classifier

In [152]:
## Bagging on multiple classifiers:
from sklearn.ensemble import VotingClassifier

lr_vc = LogisticRegression(C=1.0,solver='newton-cg')
svm_vc=SVC(C=1000, gamma=1,kernel='rbf')
knn_vc=KNeighborsClassifier(algorithm='auto', leaf_size = 30, n_jobs = -1, n_neighbors = 6, weights = 'distance')
nb_vc=GaussianNB()
dt_vc=DecisionTreeClassifier(criterion = 'gini')
rf_vc=RandomForestClassifier(n_estimators=112,min_samples_split=2,min_samples_leaf=1,max_features='sqrt', 
                          max_depth=None,bootstrap=True)

evc = VotingClassifier(estimators= [('lr',lr_vc),('dt',dt_vc),('svm',svm_vc),('knn',knn_vc),('nb',nb_vc),('rf',rf_vc)], 
                       voting = 'hard')
evc.fit(X_train_upsampled,y_baseTrain_upsampled)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=...
                                        

In [153]:
y_pred=evc.predict(X_test)

# evaluating the model
print("Training Accuracy :", evc.score(X_train, y_baseTrain))
print("Testing Accuracy :", evc.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = evc, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.8113775923994903
Testing Accuracy : 0.804271424709381
ROC AUC Score : 0.8656755553081333
******************************************************
CONFUSION MATRIX
[[2426  698]
 [  26  549]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      3124
           1       0.44      0.95      0.60       575

    accuracy                           0.80      3699
   macro avg       0.71      0.87      0.74      3699
weighted avg       0.90      0.80      0.83      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9559971478268467
Standard Deviation : 0.005375558595683861
Mean precision score : 0.9269945821988437
Standard Deviation precision score : 0.009281322648251865
Mean recall score : 0.989828740722826
Standard Deviation recall score : 0.0035967253946057863
Mean f1 score : 0.9573528520696168


# Xgboost with Upsampled data

In [154]:
from xgboost import XGBClassifier

xgb_classifier_up = XGBClassifier()
xgb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [156]:
y_pred=xgb_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", xgb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", xgb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = xgb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.2546634225466342
Testing Accuracy : 0.2654771559881049
ROC AUC Score : 0.5651408450704225
******************************************************
CONFUSION MATRIX
[[ 407 2717]
 [   0  575]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      0.13      0.23      3124
           1       0.17      1.00      0.30       575

    accuracy                           0.27      3699
   macro avg       0.59      0.57      0.26      3699
weighted avg       0.87      0.27      0.24      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.8665534170278008
Standard Deviation : 0.009579728176990309
Mean precision score : 0.8612953170889958
Standard Deviation precision score : 0.013600608345141528
Mean recall score : 0.8732828725607267
Standard Deviation recall score : 0.010884639203015398
Mean f1 score : 0.8671674271386914

# XGBoost Hyper parameter tuning

In [157]:
#Default base_score=0.5, booster='gbtree', colsample_bylevel=1,
              #colsample_bynode=1, colsample_bytree=1, gamma=0,
              #learning_rate=0.1, max_delta_step=0, max_depth=3,
              #min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              #nthread=None, objective='binary:logistic', random_state=0,
              #reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              #silent=None, subsample=1, verbosity=1

#making the instance
model= XGBClassifier()
#Hyper Parameters Set
params = { 
          'n_estimators': [100,150,200],
          'learning_rate': [1,0.1,0.01,0.001]
        }
#Making models with hyper parameters sets
classifier = GridSearchCV(model, param_grid=params, n_jobs=-1)

#Learning
classifier.fit(X_train_upsampled,y_baseTrain_upsampled)
#The best hyper parameters set

print("Best Hyper Parameters:",classifier.best_params_)



Best Hyper Parameters: {'learning_rate': 1, 'n_estimators': 200}


# XGBoost hyperparameter tuned model with upsampled data improve ?

In [158]:
xgb_classifier_up = XGBClassifier(learning_rate=1,n_estimators=200)
xgb_classifier_up.fit(X_train_upsampled,y_baseTrain_upsampled)

y_pred=xgb_classifier_up.predict(X_test)

# evaluating the model
print("Training Accuracy :", xgb_classifier_up.score(X_train, y_baseTrain))
print("Testing Accuracy :", xgb_classifier_up.score(X_test, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, y_pred))
print("******************************************************")

print("CONFUSION MATRIX")
print(confusion_matrix(y_baseTest,y_pred))
print("******************************************************")
print("CLASSIFICATION REPORT")
print(classification_report(y_baseTest,y_pred))
print("******************************************************")

# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'AUC ROC score' : make_scorer(roc_auc_score)}
cvs = cross_validate(estimator = xgb_classifier_up, X = X_train_upsampled, y = y_baseTrain_upsampled, cv = 10,scoring=scoring)
#print(cvs)
print("CROSS VALIDATION METRICS")
print("Mean Accuracy :", np.mean(cvs['test_accuracy']))
print("Standard Deviation :", np.std(cvs['test_accuracy']))
print("Mean precision score :", np.mean(cvs['test_precision']))
print("Standard Deviation precision score :", np.std(cvs['test_precision']))
print("Mean recall score :", np.mean(cvs['test_recall']))
print("Standard Deviation recall score :", np.std(cvs['test_recall']))
print("Mean f1 score :", np.mean(cvs['test_f1_score']))
print("Standard Deviation f1 score :", np.std(cvs['test_f1_score']))
print("Mean AUC ROC Score :", np.mean(cvs['test_AUC ROC score']))
print("Standard Deviation AUC ROC Score :", np.std(cvs['test_AUC ROC score']))

Training Accuracy : 0.39033715676051445
Testing Accuracy : 0.40524466071911325
ROC AUC Score : 0.5953832878695096
******************************************************
CONFUSION MATRIX
[[ 998 2126]
 [  74  501]]
******************************************************
CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.93      0.32      0.48      3124
           1       0.19      0.87      0.31       575

    accuracy                           0.41      3699
   macro avg       0.56      0.60      0.39      3699
weighted avg       0.82      0.41      0.45      3699

******************************************************
CROSS VALIDATION METRICS
Mean Accuracy : 0.9309802852013449
Standard Deviation : 0.007073306433178043
Mean precision score : 0.8993686402438644
Standard Deviation precision score : 0.01065692119945592
Mean recall score : 0.9703124763819397
Standard Deviation recall score : 0.005473999046897415
Mean f1 score : 0.933458020137253

# STACKING

In [168]:
## Algorithm 1: xgboost
pred_val_xgb=xgb_classifier_up.predict(X_train_upsampled)
test_pred_xgb=xgb_classifier_up.predict(X_test_upsampled )

## Algorithm 2: Random Forest
pred_val_rf=best_random.predict(X_train_upsampled)
test_pred_rf=best_random.predict(X_test_upsampled )

## Algorithm 3: Decision Tree
pred_val_dt=dt_classifier_up.predict(X_train_upsampled)
test_pred_dt=dt_classifier_up.predict(X_test_upsampled )

stacked_predictions=np.column_stack((pred_val_rf,pred_val_xgb,pred_val_dt)) ## Prediction by the algorithms on training data
#stacked_predictions[0:10]
stacked_test_predictions=np.column_stack((test_pred_rf,test_pred_xgb,test_pred_dt)) ## Prediction by the algorithms on testing data
#stacked_test_predictions[0:10]

## Building Meta Model
lr_stack = LogisticRegression()
lr_stack.fit(stacked_predictions,y_baseTrain_upsampled) ## stacked_predictions=X_train

y_pred_stack=lr_stack.predict(stacked_test_predictions) ## stacked_test_predictions=X_test
y_pred_stack


# evaluating the model
cm = confusion_matrix(y_baseTest_upsampled, y_pred_stack)
print(cm)
print(classification_report(y_baseTest_upsampled, y_pred_stack)) 

accuracies_lr= cross_val_score(estimator = lr_stack, X = stacked_predictions, y = y_baseTrain_upsampled, cv = 10) 
accuracies_lr_mean=accuracies_lr.mean()*100
print("Accuracy Stacking=",accuracies_lr_mean)

accuracies_lr_std=accuracies_lr_mean.std()*100
print("Standard Deviation lr=",accuracies_lr_std)



[[2841  267]
 [  10 3136]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.95      3108
           1       0.92      1.00      0.96      3146

    accuracy                           0.96      6254
   macro avg       0.96      0.96      0.96      6254
weighted avg       0.96      0.96      0.96      6254

Accuracy Stacking= 100.0
Standard Deviation lr= 0.0




# BLENDING

In [169]:
## Algorithm 1: xgboost
pred_val_xgb=xgb_classifier_up.predict(X_train_upsampled)
test_pred_xgb=xgb_classifier_up.predict(X_test_upsampled )

## Algorithm 2: Random Forest
pred_val_rf=best_random.predict(X_train_upsampled)
test_pred_rf=best_random.predict(X_test_upsampled )

## Algorithm 3: Decision Tree
pred_val_dt=dt_classifier_up.predict(X_train_upsampled)
test_pred_dt=dt_classifier_up.predict(X_test_upsampled )

stacked_predictions=np.column_stack((pred_val_rf,pred_val_xgb,pred_val_dt)) ## Prediction by the algorithms on training data
#stacked_predictions[0:10]
stacked_test_predictions=np.column_stack((test_pred_rf,test_pred_xgb,test_pred_dt)) ## Prediction by the algorithms on testing data
#stacked_test_predictions[0:10]

stacked_predictions=pd.DataFrame(stacked_predictions)
stacked_test_predictions=pd.DataFrame(stacked_test_predictions)


## Building Meta Model
lr_stack = LogisticRegression()
lr_stack.fit(stacked_predictions,y_baseTrain_upsampled) ## stacked_predictions=X_train

y_pred_stack=lr_stack.predict(stacked_test_predictions) ## stacked_test_predictions=X_test
y_pred_stack


# evaluating the model
cm = confusion_matrix(y_baseTest_upsampled, y_pred_stack)
print(cm)
print(classification_report(y_baseTest_upsampled, y_pred_stack)) 

accuracies_lr= cross_val_score(estimator = lr_stack, X = stacked_predictions, y = y_baseTrain_upsampled, cv = 10) 
accuracies_lr_mean=accuracies_lr.mean()*100
print("Accuracy Stacking=",accuracies_lr_mean)

accuracies_lr_std=accuracies_lr_mean.std()*100
print("Standard Deviation lr=",accuracies_lr_std)



[[2841  267]
 [  10 3136]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.95      3108
           1       0.92      1.00      0.96      3146

    accuracy                           0.96      6254
   macro avg       0.96      0.96      0.96      6254
weighted avg       0.96      0.96      0.96      6254

Accuracy Stacking= 100.0
Standard Deviation lr= 0.0


