# Importing Libraries/Connecting to GDrive

In [1]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Mounted at /gdrive
/gdrive


# Exploratory Data Analysis

In [3]:
trainfile = r'/gdrive/My Drive/CIS 508/Assignment 2/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS 508/Assignment 2/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile)  #creates a dataframe


print(trainData.shape)
print(testData.shape)



(4521, 17)
(45211, 17)


In [4]:
trainData.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [5]:
trainData.info()
trainData.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [6]:
trainData.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
#List of Features
Features = ["job", 'marital', "education", "default", "balance", 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData,testData], keys=[0,1])

#Do one Hot encoding for features
combined_Data = pd.get_dummies(combined_Data,columns=Features)

#Separate Train data and test data
trainData = combined_Data.xs(0)
testData = combined_Data.xs(1)

y_train = trainData['y']
X_train = trainData.drop(['y'], axis=1) #extracting training data without the target column
y_test = testData['y']
X_test = testData.drop(['y'], axis=1) #extracting testing data without the target column



# Decision Tree Modeling

In [10]:
#Decision Tree Classifier ========================================================================
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (testset) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))



accuracy Score (testset) for Decision Tree:0.880649
Confusion Matrix for Decision Tree
[[38068  1854]
 [ 3542  1747]]


In [None]:
print(clf.feature_importances_)

[0. 0. 0. ... 0. 0. 0.]


###Decision Tree Hyperparameter Tuning 1




In [11]:
#Hyperparameter tuning done for decision tree classifier

#do random search
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(100,1000,100),'max_depth': range(15,50,15),'criterion':['gini','entropy']}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=10,cv=10)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

RandomizedSearchCV-Decision tree
{'min_samples_leaf': 100, 'max_depth': 15, 'criterion': 'entropy'}


In [12]:
#Now do grid search
print("GridSearchCV-Decision tree")
parameters2={'min_samples_leaf' : range(10,50,10),'criterion':['gini','entropy'], 'splitter':['best','random']}
clf_grid = GridSearchCV(clf, parameters2, cv=5, verbose=1)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)

GridSearchCV-Decision tree
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.3min finished


{'criterion': 'gini', 'min_samples_leaf': 40, 'splitter': 'best'}


In [13]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)




In [14]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))


accuracy Score (testset) after hypertuning randomized search for Decision Tree:0.892858
accuracy Score (testset) after hypertuning grid search for Decision Tree:0.892261
Confusion Matrix after hypertuning for Decision Tree
[[39389   533]
 [ 4311   978]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.90      0.99      0.94     39922
         yes       0.65      0.18      0.29      5289

    accuracy                           0.89     45211
   macro avg       0.77      0.59      0.61     45211
weighted avg       0.87      0.89      0.87     45211



In [15]:
#cross validation using decision tree model
clf_cv_score = cross_val_score(clf, X_train, y_train, cv=5, 
                               scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')

[0.57470238 0.57423077 0.56230769 0.56230769 0.58572115]




###Decision Tree Hyperparameter Tuning 2




In [16]:
#Hyperparameter tuning done for decision tree classifier

#do random search
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(400,500,40),'max_depth': range(25,100,25),'criterion':['entropy'], 'max_leaf_nodes' : range(10,50,10)}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=4,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

RandomizedSearchCV-Decision tree
{'min_samples_leaf': 480, 'max_leaf_nodes': 20, 'max_depth': 50, 'criterion': 'entropy'}


In [17]:
#Now do grid search
print("GridSearchCV-Decision tree")
parameters2={'min_samples_leaf' : range(30,90,30),'criterion':['gini','entropy'], 'splitter':['random']}
clf_grid = GridSearchCV(clf, parameters2, cv=10, verbose=4, n_jobs=1)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)

GridSearchCV-Decision tree
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.883, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.894, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.894, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.0s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.892, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.883, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.894, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.892, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.892, total=   0.9s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=random, score=0.900, total=   1.0s
[CV] criterion=gini, min_samples_leaf=30, splitter=random ............
[CV]  criterion=gini, min_samples_leaf=30, splitter=

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   37.0s finished


{'criterion': 'gini', 'min_samples_leaf': 60, 'splitter': 'random'}


In [18]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


In [19]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

accuracy Score (testset) after hypertuning randomized search for Decision Tree:0.883015
accuracy Score (testset) after hypertuning grid search for Decision Tree:0.892858
Confusion Matrix after hypertuning for Decision Tree
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211



In [20]:
#cross validation using decision tree model
clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, 
                               verbose=8, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] .................................... , score=0.500, total=   1.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.8s remaining:    0.0s


[CV] .................................... , score=0.500, total=   1.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.8s remaining:    0.0s


[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.7s remaining:    0.0s


[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.7s remaining:    0.0s


[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.6s remaining:    0.0s


[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................
[CV] .................................... , score=0.500, total=   0.9s
[CV]  ................................................................
[CV] .................................... , score=0.500, total=   0.9s
[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]




[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.3s finished


###Decision Tree Hyperparameter Tuning 3


In [25]:
#Hyperparameter tuning done for decision tree classifier

#do random search
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_split' : range(10,1000,10),'max_depth': range(25,100,25),'criterion':['entropy'], 'max_leaf_nodes' : range( 20,300,20)}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=20,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

RandomizedSearchCV-Decision tree
{'min_samples_split': 990, 'max_leaf_nodes': 60, 'max_depth': 25, 'criterion': 'entropy'}


In [27]:
#Now do grid search
print("GridSearchCV-Decision tree")
parameters2={'min_samples_leaf' : range(30,500,30),'criterion':['gini'], 'splitter':['best'], 'max_leaf_nodes' : range( 20,100,20)}
clf_grid = GridSearchCV(clf, parameters2, cv=2, verbose=7)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)

GridSearchCV-Decision tree
Fitting 2 folds for each of 64 candidates, totalling 128 fits
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=30, splitter=best 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=30, splitter=best, score=0.887, total=   1.3s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=30, splitter=best 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=30, splitter=best, score=0.894, total=   1.5s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=60, splitter=best 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=60, splitter=best, score=0.892, total=   1.4s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=60, splitter=best 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.2s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=60, splitter=best, score=0.894, total=   1.2s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=90, splitter=best 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.4s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=90, splitter=best, score=0.885, total=   1.0s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=90, splitter=best 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.4s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=90, splitter=best, score=0.885, total=   1.0s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=120, splitter=best 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.4s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=120, splitter=best, score=0.885, total=   0.9s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=120, splitter=best 
[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=120, splitter=best, score=0.885, total=   0.9s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=150, splitter=best 
[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=150, splitter=best, score=0.885, total=   0.9s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=150, splitter=best 
[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=150, splitter=best, score=0.885, total=   0.9s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=180, splitter=best 
[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=180, splitter=best, score=0.885, total=   0.8s
[CV] criterion=gini, max_leaf_nodes=20, min_samples_leaf=180, splitter=best 
[CV]  criterion=gini, max_leaf_nodes=20, min_samples_leaf=180, splitter=best, score=0.885,

[Parallel(n_jobs=1)]: Done 128 out of 128 | elapsed:  1.8min finished


{'criterion': 'gini', 'max_leaf_nodes': 20, 'min_samples_leaf': 60, 'splitter': 'best'}


In [28]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)

In [29]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

accuracy Score (testset) after hypertuning randomized search for Decision Tree:0.891243
accuracy Score (testset) after hypertuning grid search for Decision Tree:0.892858
Confusion Matrix after hypertuning for Decision Tree
[[39264   658]
 [ 4259  1030]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.90      0.98      0.94     39922
         yes       0.61      0.19      0.30      5289

    accuracy                           0.89     45211
   macro avg       0.76      0.59      0.62     45211
weighted avg       0.87      0.89      0.87     45211



In [30]:
#cross validation using decision tree model
clf_cv_score = cross_val_score(clf, X_train, y_train, cv=4, 
                               scoring="balanced_accuracy", error_score=5)
print(clf_cv_score)
print('\n')

[0.58228626 0.56892308 0.56088462 0.58680769]




# Random Forest Modeling

In [41]:
#Normal randomforest==============================================================================
#=================================================================================================
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (testset) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))


accuracy Score (testset) for RandomForest:0.899670
Confusion Matrix for Random Forest:
[[39786   136]
 [ 4400   889]]


###Random Forest Hyperparameter Tuning 1

In [42]:
#Hyperparameter tuning done for random forest classifier

#do random search
print("RandomizedSearchCV-Random Forest")
r_parameters={'min_samples_split' : range(10,100,10),'max_depth': range(25,100,25),'criterion':['entropy'], 'max_leaf_nodes' : range( 20,200,20)}
rfc_random = RandomizedSearchCV(rfc,r_parameters,n_iter=5,cv=6)
rfc_random.fit(X_train, y_train)
grid_parmR=rfc_random.best_params_
print(grid_parmR)

RandomizedSearchCV-Random Forest
{'min_samples_split': 50, 'max_leaf_nodes': 140, 'max_depth': 75, 'criterion': 'entropy'}


In [43]:
#Now do grid search
print("GridSearchCV-Decision tree")
r_parameters2={'n_estimators': [5, 15, 30, 50],'criterion': ['gini', 'entropy']}
rfc_grid = GridSearchCV(rfc, r_parameters2, n_jobs=-1)
rfc_grid.fit(X_train, y_train)
grid_parmR1=rfc_grid.best_params_
print(grid_parmR1)

GridSearchCV-Decision tree
{'criterion': 'gini', 'n_estimators': 30}


In [44]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
#Construct Random Forest using the best parameters
rfc = RandomForestClassifier(**grid_parmR)
rfcr = RandomForestClassifier(**grid_parmR1)

rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)

In [45]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Random Forest:{0:6f}".format(rfcr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))

accuracy Score (testset) after hypertuning randomized search for Random Forest:0.884298
accuracy Score (testset) after hypertuning grid search for Random Forest:0.898255
Confusion Matrix after hypertuning for Random Forest
[[39914     8]
 [ 5223    66]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.89      0.01      0.02      5289

    accuracy                           0.88     45211
   macro avg       0.89      0.51      0.48     45211
weighted avg       0.89      0.88      0.83     45211



In [46]:
#cross validation using random forest model
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=4, 
                               scoring="balanced_accuracy", error_score=5)
print(rfc_cv_score)
print('\n')

[0.5        0.50384615 0.50769231 0.50384615]




###Random Forest Hyperparameter Tuning 2




In [47]:
#Hyperparameter tuning done for random forest classifier

#do random search
print("RandomizedSearchCV-Random Forest")
r_parameters={'min_samples_split' : range(100,1000,100),'max_depth': range(15,80,15),'criterion':['gini'], 'max_leaf_nodes' : range( 200,1000,200)}
rfc_random = RandomizedSearchCV(rfc,r_parameters,n_iter=5,cv=5)
rfc_random.fit(X_train, y_train)
grid_parmR=rfc_random.best_params_
print(grid_parmR)

RandomizedSearchCV-Random Forest
{'min_samples_split': 400, 'max_leaf_nodes': 600, 'max_depth': 30, 'criterion': 'gini'}


In [48]:
#Now do grid search
print("GridSearchCV-Decision tree")
r_parameters2={'n_estimators': [25, 55, 10, 20],'criterion': ['gini', 'entropy'],}
rfc_grid = GridSearchCV(rfc, r_parameters2, n_jobs=5, verbose=5)
rfc_grid.fit(X_train, y_train)
grid_parmR1=rfc_grid.best_params_
print(grid_parmR1)

GridSearchCV-Decision tree
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   24.0s
[Parallel(n_jobs=5)]: Done  40 out of  40 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  40 out of  40 | elapsed:  1.0min finished


{'criterion': 'entropy', 'n_estimators': 10}


In [49]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
#Construct Random Forest using the best parameters
rfc = RandomForestClassifier(**grid_parmR)
rfcr = RandomForestClassifier(**grid_parmR1)

rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)

In [50]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Decision Tree:{0:6f}".format(rfc.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Decision Tree:{0:6f}".format(rfcr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))

accuracy Score (testset) after hypertuning randomized search for Decision Tree:0.883015
accuracy Score (testset) after hypertuning grid search for Decision Tree:0.899648
Confusion Matrix after hypertuning for Decision Tree
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211



In [52]:
#cross validation using random forest model
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=5, 
                               scoring="balanced_accuracy", n_jobs=3, error_score=3)
print(rfc_cv_score)
print('\n')

[0.5 0.5 0.5 0.5 0.5]




###Random Forest Hyperparameter Tuning 3


In [53]:
#Hyperparameter tuning done for random forest classifier

#do random search
print("RandomizedSearchCV-Random Forest")
r_parameters={'min_samples_split' : range(200,1800,150),'max_depth': range(10,200,80),'criterion':['entropy'], 'max_leaf_nodes' : range( 25,90,20)}
rfc_random = RandomizedSearchCV(rfc,r_parameters,n_iter=7,cv=5, verbose=6)
rfc_random.fit(X_train, y_train)
grid_parmR=rfc_random.best_params_
print(grid_parmR)

RandomizedSearchCV-Random Forest
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy, score=0.884, total=   1.8s
[CV] min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy, score=0.885, total=   1.6s
[CV] min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV]  min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy, score=0.885, total=   1.6s
[CV] min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s


[CV]  min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy, score=0.885, total=   1.6s
[CV] min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.6s remaining:    0.0s


[CV]  min_samples_split=650, max_leaf_nodes=85, max_depth=10, criterion=entropy, score=0.885, total=   1.6s
[CV] min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.2s remaining:    0.0s


[CV]  min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy, score=0.884, total=   1.4s
[CV] min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy 
[CV]  min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy, score=0.885, total=   1.4s
[CV] min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy 
[CV]  min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy, score=0.885, total=   1.4s
[CV] min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy 
[CV]  min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy, score=0.885, total=   1.4s
[CV] min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy 
[CV]  min_samples_split=1400, max_leaf_nodes=25, max_depth=10, criterion=entropy, score=0.885, total=   1.4s
[CV] min_samples_split=1700, max_leaf_nodes=45, max_depth=90, criterion=entropy 
[CV]  min_samples_split=1700, max_leaf_nodes=45, m

[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  1.4min finished


{'min_samples_split': 650, 'max_leaf_nodes': 85, 'max_depth': 10, 'criterion': 'entropy'}


In [55]:
#Now do grid search
print("GridSearchCV-Decision tree")
r_parameters2={'n_estimators': [95, 15, 120, 200],'criterion': ['gini', 'entropy'], 'class_weight':['balanced', 'balanced_subsample']}
rfc_grid = GridSearchCV(rfc, r_parameters2, n_jobs=11, verbose=15, error_score=-2)
rfc_grid.fit(X_train, y_train)
grid_parmR1=rfc_grid.best_params_
print(grid_parmR1)

GridSearchCV-Decision tree
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done   1 tasks      | elapsed:   16.1s
[Parallel(n_jobs=11)]: Done   2 tasks      | elapsed:   16.2s
[Parallel(n_jobs=11)]: Done   3 tasks      | elapsed:   16.3s
[Parallel(n_jobs=11)]: Done   4 tasks      | elapsed:   17.0s
[Parallel(n_jobs=11)]: Done   5 tasks      | elapsed:   17.1s
[Parallel(n_jobs=11)]: Done   6 tasks      | elapsed:   28.8s
[Parallel(n_jobs=11)]: Done   7 tasks      | elapsed:   29.1s
[Parallel(n_jobs=11)]: Done   8 tasks      | elapsed:   29.2s
[Parallel(n_jobs=11)]: Done   9 tasks      | elapsed:   29.2s
[Parallel(n_jobs=11)]: Done  10 tasks      | elapsed:   29.5s
[Parallel(n_jobs=11)]: Done  11 tasks      | elapsed:   47.1s
[Parallel(n_jobs=11)]: Done  12 tasks      | elapsed:   47.4s
[Parallel(n_jobs=11)]: Done  13 tasks      | elapsed:   48.2s
[Parallel(n_jobs=11)]: Done  14 tasks      | elapsed:   48.5s
[Parallel(n_jobs=11)]: Done  15 tasks      | elapsed:  

{'class_weight': 'balanced', 'criterion': 'entropy', 'n_estimators': 120}


In [56]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
#Construct Random Forest using the best parameters
rfc = RandomForestClassifier(**grid_parmR)
rfcr = RandomForestClassifier(**grid_parmR1)

rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)

In [57]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after hypertuning randomized search for Decision Tree:{0:6f}".format(rfc.score(X_test,y_test)))

print("accuracy Score (testset) after hypertuning grid search for Decision Tree:{0:6f}".format(rfcr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))

accuracy Score (testset) after hypertuning randomized search for Decision Tree:0.883015
accuracy Score (testset) after hypertuning grid search for Decision Tree:0.901329
Confusion Matrix after hypertuning for Decision Tree
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211



In [58]:
#cross validation using random forest model
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=10, 
                               scoring="balanced_accuracy", n_jobs=9, 
                               error_score=-1, verbose=3)
print(rfc_cv_score)
print('\n')

[Parallel(n_jobs=9)]: Using backend LokyBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Done   5 out of  10 | elapsed:    7.6s remaining:    7.6s


[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]




[Parallel(n_jobs=9)]: Done  10 out of  10 | elapsed:   14.5s finished
