In [1]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change



In [2]:
trainfile = r'C:\Users\farha\Downloads\Portugese Bank Data - TEST.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'C:\Users\farha\Downloads\Portugese Bank Data - TRAIN.csv'
testData = pd.read_csv(testfile)  #creates a dataframe


print(trainData.shape)
print(testData.shape)


(45211, 17)
(4521, 17)


In [3]:
TrainCols = list(trainData.columns.values)
TestCols = list(testData.columns.values)
print(TrainCols)
print(TestCols)

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [4]:
# Seperate Target column from Train Data
Xtrain = trainData[TrainCols[0:len(TrainCols)-1]]
Ytrain = trainData[['y']]
print("Train Set shape:")
print(Xtrain.shape)
print(Ytrain.shape)
Xtest = testData[TestCols[0:len(TestCols)-1]]
Ytest = testData[['y']]
print("Test Set shape:")
print(Xtest.shape)
print(Ytest.shape)

Train Set shape:
(45211, 16)
(45211, 1)
Test Set shape:
(4521, 16)
(4521, 1)


In [5]:
print(Ytrain['y'].value_counts())

no     39922
yes     5289
Name: y, dtype: int64


In [6]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


In [7]:
col=Xtrain.columns.values
col

array(['age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration',
       'campaign', 'pdays', 'previous', 'poutcome'], dtype=object)

In [8]:
categoricalFeatures = ['job', 'marital', 'education', 'default','housing', 'loan', 'contact','month','poutcome']

In [9]:
# OneHotEncoding on Train (fit & transform)
# OneHotEncoding is to be done on Categorical variables.
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
Xcat = pd.DataFrame(ohe.fit_transform(Xtrain[categoricalFeatures]),columns=ohe.get_feature_names(),index=Xtrain.index)
Xtrain = pd.concat([Xtrain,Xcat],axis=1)
Xtrain.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtrain.sample(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,...,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
18402,39,284,31,180,8,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2740,42,243,14,46,5,-1,0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34936,32,290,6,150,1,-1,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
40649,25,557,5,813,2,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7848,40,0,30,376,3,-1,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# OneHotEncoding on Test (only transform)
# OneHotEncoding is to be done on Categorical variables.
Xcat = pd.DataFrame(ohe.transform(Xtest[categoricalFeatures]),columns=ohe.get_feature_names(),index=Xtest.index)
Xtest = pd.concat([Xtest,Xcat],axis=1)
Xtest.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtest.sample(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,...,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
4335,37,105,17,260,4,-1,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2884,23,5,12,413,1,-1,0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1159,31,62,18,175,1,293,5,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3667,30,35,30,185,4,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4063,32,485,26,98,14,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
dt = DecisionTreeClassifier()
dt.fit(Xtrain, Ytrain)

In [12]:
rf = RandomForestClassifier()
rf.fit(Xtrain, Ytrain)

In [13]:
X_Pred = dt.predict(Xtest)
XPred = dt.predict(Xtrain)
#Model Accuracy
print("Train Accuracy:", metrics.accuracy_score(Ytrain,XPred))
print("Test Accuracy:", metrics.accuracy_score(Ytest,X_Pred))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,X_Pred))
print("Max Depth",dt.get_depth())
print("Leaf",dt.get_n_leaves())
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, X_Pred))

Train Accuracy: 1.0
Test Accuracy: 1.0
Confusion Matrix for Decision Tree:
[[4000    0]
 [   0  521]]
Max Depth 39
Leaf 3622
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       1.00      1.00      1.00      4000
         yes       1.00      1.00      1.00       521

    accuracy                           1.00      4521
   macro avg       1.00      1.00      1.00      4521
weighted avg       1.00      1.00      1.00      4521



In [14]:
X_Pred1 = rf.predict(Xtest)
XPred1 = rf.predict(Xtrain)
#Model Accuracy
print("Train Accuracy:", metrics.accuracy_score(Ytrain,XPred1))
print("Test Accuracy:", metrics.accuracy_score(Ytest,X_Pred1))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,X_Pred1))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, X_Pred1))

Train Accuracy: 1.0
Test Accuracy: 1.0
Confusion Matrix for Decision Tree:
[[4000    0]
 [   0  521]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       1.00      1.00      1.00      4000
         yes       1.00      1.00      1.00       521

    accuracy                           1.00      4521
   macro avg       1.00      1.00      1.00      4521
weighted avg       1.00      1.00      1.00      4521



In [15]:
#Hyperparameter tuning done for decision tree classifier

#RANDOM SEARCH--------------------------------------------

import time
start_time = time.time()

print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,300,10),
            'max_depth': range(5,30,2),
            'criterion':['gini','entropy']}
dt_random = RandomizedSearchCV(dt,parameters,n_iter=25,cv=5)
dt_random.fit(Xtrain, Ytrain)
dt_random.cv_results_
df1=pd.DataFrame(dt_random.cv_results_)
grid_parm=dt_random.best_params_
print(grid_parm)
print("accuracy Score for Decision Tree:{0:6f}".
      format(dt_random.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

RandomizedSearchCV-Decision tree
{'min_samples_leaf': 250, 'max_depth': 23, 'criterion': 'entropy'}
accuracy Score for Decision Tree:0.901570
--- 53.29457688331604 seconds ---


In [16]:
df1[['params','rank_test_score']]

Unnamed: 0,params,rank_test_score
0,"{'min_samples_leaf': 250, 'max_depth': 23, 'cr...",1
1,"{'min_samples_leaf': 20, 'max_depth': 25, 'cri...",24
2,"{'min_samples_leaf': 160, 'max_depth': 19, 'cr...",17
3,"{'min_samples_leaf': 200, 'max_depth': 19, 'cr...",12
4,"{'min_samples_leaf': 180, 'max_depth': 29, 'cr...",16
5,"{'min_samples_leaf': 60, 'max_depth': 21, 'cri...",19
6,"{'min_samples_leaf': 40, 'max_depth': 13, 'cri...",20
7,"{'min_samples_leaf': 160, 'max_depth': 13, 'cr...",13
8,"{'min_samples_leaf': 190, 'max_depth': 7, 'cri...",4
9,"{'min_samples_leaf': 50, 'max_depth': 29, 'cri...",22


In [18]:
#GRID SEARCH----------------------------------------

import time
start_time = time.time()

print("GridSearchCV-Decision tree")
dt_grid = GridSearchCV(dt,parameters,cv=2)
dt_grid.fit(Xtrain, Ytrain)
grid_parm1=dt_grid.best_params_
#dt_grid.cv_results_
#df2=pd.DataFrame(dt_grid.cv_results_)
print(grid_parm1)
print("accuracy Score for Decision Tree:{0:6f}".
      format(dt_grid.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 260}
accuracy Score for Decision Tree:0.899137
--- 517.3458313941956 seconds ---


In [19]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
dtRand = DecisionTreeClassifier(**grid_parm)
dtGrid = DecisionTreeClassifier(**grid_parm1)

dtRand.fit(Xtrain,Ytrain)
dtRand_predict = dtRand.predict(Xtest)
dtGrid.fit(Xtrain,Ytrain)
dtGrid_predict = dtGrid.predict(Xtest)

In [20]:
# Accuracy for Decision Tree using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,dtRand_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,dtRand_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, dtRand_predict))
clf_cv_score = cross_val_score(dtRand, Xtrain, Ytrain, cv=5, scoring="balanced_accuracy")
print(clf_cv_score)

Test Accuracy: 0.9015704490157045
Confusion Matrix for Decision Tree:
[[3860  140]
 [ 305  216]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.93      0.96      0.95      4000
         yes       0.61      0.41      0.49       521

    accuracy                           0.90      4521
   macro avg       0.77      0.69      0.72      4521
weighted avg       0.89      0.90      0.89      4521

[0.5415879  0.48124073 0.52829406 0.51698814 0.71263321]


In [21]:
#Hyperparameter tuning done for random forest classifier

#RANDOM SEARCH--------------------------------------------

import time
start_time = time.time()

print("RandomizedSearchCV-Random forest")
rand_parameters={'min_samples_leaf' : range(10,100,10),
                 'max_depth': range(1,10,2),
                 'max_features':[10,20,30],
                 'n_estimators':[20,30,40]}
rf_random = RandomizedSearchCV(rf,rand_parameters,n_iter=25,cv=5)
rf_random.fit(Xtrain, Ytrain)
grid_parm=rf_random.best_params_
print(grid_parm)
print("accuracy Score for Decision Tree:{0:6f}".
      format(rf_random.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

RandomizedSearchCV-Random forest
{'n_estimators': 40, 'min_samples_leaf': 30, 'max_features': 10, 'max_depth': 1}
accuracy Score for Decision Tree:0.884760
--- 202.7193467617035 seconds ---


In [26]:
import time
start_time = time.time()

print("GridSearchCV-Random Forest")
rf_grid = GridSearchCV(rf,rand_parameters,cv=2)
rf_grid.fit(Xtrain, Ytrain)
grid_parm1=rf_grid.best_params_
print(grid_parm1)
print("accuracy Score for Decision Tree:{0:6f}".
      format(rf_grid.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

GridSearchCV-Random Forest
{'max_depth': 1, 'max_features': 10, 'min_samples_leaf': 70, 'n_estimators': 30}
accuracy Score for Decision Tree:0.884760
--- 842.5328571796417 seconds ---


In [27]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier 
rfRand = RandomForestClassifier(**grid_parm)
rfGrid = RandomForestClassifier(**grid_parm1)

rfRand.fit(Xtrain,Ytrain)
rfRand_predict = rfRand.predict(Xtest)
rfGrid.fit(Xtrain,Ytrain)
rfGrid_predict = rfGrid.predict(Xtest)

In [28]:
# Accuracy for Random Forest using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,rfRand_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,rfRand_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, rfRand_predict))
clf_cv_score = cross_val_score(rfRand, Xtrain, Ytrain, cv=5, scoring="balanced_accuracy")
print(clf_cv_score)

Test Accuracy: 0.8847600088476001
Confusion Matrix for Decision Tree:
[[4000    0]
 [ 521    0]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.88      1.00      0.94      4000
         yes       0.00      0.00      0.00       521

    accuracy                           0.88      4521
   macro avg       0.44      0.50      0.47      4521
weighted avg       0.78      0.88      0.83      4521

[0.5 0.5 0.5 0.5 0.5]


In [29]:
# Accuracy for Random Forest using Grid Search for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,rfGrid_predict))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,rfGrid_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, rfGrid_predict))

Test Accuracy: 0.8847600088476001
Confusion Matrix for Decision Tree:
[[4000    0]
 [ 521    0]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.88      1.00      0.94      4000
         yes       0.00      0.00      0.00       521

    accuracy                           0.88      4521
   macro avg       0.44      0.50      0.47      4521
weighted avg       0.78      0.88      0.83      4521

