# Loyalty of a Bank Customer - Case Study
Problem Statement:
    * Given a Bank customer, can we build a classifier which can determine whether they will leave or not, using Neural networks?

Case File:
    * bank.csv

The points distribution for this case is as follows:

In [1]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import warnings;
warnings.simplefilter('ignore')

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier

tf.__version__
tf.set_random_seed(100)

Using TensorFlow backend.


Q1. Read the dataset

In [2]:
bank=pd.read_csv("bank.csv")
bank.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
RowNumber,1,2,3,4,5,6,7,8,9,10
CustomerId,15634602,15647311,15619304,15701354,15737888,15574012,15592531,15656148,15792365,15592389
Surname,Hargrave,Hill,Onio,Boni,Mitchell,Chu,Bartlett,Obinna,He,H?
CreditScore,619,608,502,699,850,645,822,376,501,684
Geography,France,Spain,France,France,Spain,Spain,France,Germany,France,France
Gender,Female,Female,Female,Female,Female,Male,Male,Female,Male,Male
Age,42,41,42,39,43,44,50,29,44,27
Tenure,2,1,8,1,2,8,7,4,4,2
Balance,0,83807.9,159661,0,125511,113756,0,115047,142051,134604
NumOfProducts,1,1,3,2,1,2,2,4,2,1


In [3]:
bank.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RowNumber,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
CustomerId,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48


In [4]:
bank.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Q2. Drop the columns which are unique for all users like IDs (5 points)

In [5]:
bank.iloc[:,0].count()

10000

In [6]:
for i in range(np.size(bank.columns)):
    print(bank.columns[i],np.size(bank.iloc[:,i].value_counts()))

RowNumber 10000
CustomerId 10000
Surname 2932
CreditScore 460
Geography 3
Gender 2
Age 70
Tenure 11
Balance 6382
NumOfProducts 4
HasCrCard 2
IsActiveMember 2
EstimatedSalary 9999
Exited 2


In [7]:
bank.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True)
bank.head().transpose()

Unnamed: 0,0,1,2,3,4
CreditScore,619,608,502,699,850
Geography,France,Spain,France,France,Spain
Gender,Female,Female,Female,Female,Female
Age,42,41,42,39,43
Tenure,2,1,8,1,2
Balance,0,83807.9,159661,0,125511
NumOfProducts,1,1,3,2,1
HasCrCard,1,0,1,0,1
IsActiveMember,1,1,0,0,1
EstimatedSalary,101349,112543,113932,93826.6,79084.1


Q3. Distinguish the feature and target set (5 points)

In [8]:
bank.Exited=bank.Exited.astype('category')
bank.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [9]:
bank.NumOfProducts.value_counts()

1    5084
2    4590
3     266
4      60
Name: NumOfProducts, dtype: int64

In [10]:
bank.HasCrCard.value_counts()

1    7055
0    2945
Name: HasCrCard, dtype: int64

In [11]:
bank.IsActiveMember.value_counts()

1    5151
0    4849
Name: IsActiveMember, dtype: int64

In [12]:
bank.Gender.value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [13]:
bank.Geography.value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [14]:
lb = LabelEncoder() 

bank.Gender = lb.fit_transform(bank.Gender)
bank.Geography = lb.fit_transform(bank.Geography)

In [15]:
bank.dtypes

CreditScore           int64
Geography             int32
Gender                int32
Age                   int64
Tenure                int64
Balance             float64
NumOfProducts         int64
HasCrCard             int64
IsActiveMember        int64
EstimatedSalary     float64
Exited             category
dtype: object

In [16]:
bank.corr()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
CreditScore,1.0,0.007888,-0.002857,-0.003965,0.000842,0.006268,0.012238,-0.005458,0.025651,-0.001384
Geography,0.007888,1.0,0.004719,0.022812,0.003739,0.069408,0.003972,-0.008523,0.006724,-0.001369
Gender,-0.002857,0.004719,1.0,-0.027544,0.014733,0.012087,-0.021859,0.005766,0.022544,-0.008112
Age,-0.003965,0.022812,-0.027544,1.0,-0.009997,0.028308,-0.03068,-0.011721,0.085472,-0.007201
Tenure,0.000842,0.003739,0.014733,-0.009997,1.0,-0.012254,0.013444,0.022583,-0.028362,0.007784
Balance,0.006268,0.069408,0.012087,0.028308,-0.012254,1.0,-0.30418,-0.014858,-0.010084,0.012797
NumOfProducts,0.012238,0.003972,-0.021859,-0.03068,0.013444,-0.30418,1.0,0.003183,0.009612,0.014204
HasCrCard,-0.005458,-0.008523,0.005766,-0.011721,0.022583,-0.014858,0.003183,1.0,-0.011866,-0.009933
IsActiveMember,0.025651,0.006724,0.022544,0.085472,-0.028362,-0.010084,0.009612,-0.011866,1.0,-0.011421
EstimatedSalary,-0.001384,-0.001369,-0.008112,-0.007201,0.007784,0.012797,0.014204,-0.009933,-0.011421,1.0


In [17]:
bank.isnull().any()

CreditScore        False
Geography          False
Gender             False
Age                False
Tenure             False
Balance            False
NumOfProducts      False
HasCrCard          False
IsActiveMember     False
EstimatedSalary    False
Exited             False
dtype: bool

Q4. Divide the data set into Train and test sets

In [18]:
x=bank.drop(columns=['Exited'])
y=bank.loc[:,['Exited']]

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [20]:
x_train.shape
y_train.shape
x_test.shape
y_test.shape

(7000, 10)

(7000, 1)

(3000, 10)

(3000, 1)

In [21]:
x_train.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
x_test.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

Q5. Normalize the train and test data (5 points)

In [22]:
mmscale=MinMaxScaler()
mmscale=mmscale.fit(x)
x_train=mmscale.transform(x_train)
x_test=mmscale.transform(x_test)

In [23]:
sm=SMOTE(random_state=2)
x_train_res,y_train_res=sm.fit_sample(x_train,y_train)
x_train_res=pd.DataFrame(x_train_res)
y_train_res=pd.DataFrame(y_train_res)

In [24]:
y_train.Exited.value_counts()
y_train_res[0].value_counts()

0    5571
1    1429
Name: Exited, dtype: int64

1    5571
0    5571
Name: 0, dtype: int64

Q6. Initialize &amp; build the model (20 points)

In [25]:
SeqModel=Sequential()

SeqModel.add(Dense(32,activation='relu',kernel_initializer='random_normal',input_dim=10))
SeqModel.add(Dense(1,activation='sigmoid',kernel_initializer='random_normal'))

SeqModel.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

SeqModel.fit(x_train_res,y_train_res,epochs=22,batch_size=11,verbose=0)

predict = pd.DataFrame(data=x_test,columns=x.columns)
predict['Actual_Excited']=y_test
predict['Predicted_Excited']=SeqModel.predict(x_test)
predict.Predicted_Excited=predict.Predicted_Excited>0.5

loss,accuracy = SeqModel.evaluate(x_test,y_test,verbose=0)
print('Model Accuracy:',accuracy)

print('\nConfusion Matrix:')
pd.DataFrame(data=list(confusion_matrix(predict.Actual_Excited.values,predict.Predicted_Excited.values)),
             columns=['Predicted_0','Predicted_1'],index=['Actual_0','Actual_1'])

print('Precision - Recall - F1 Score - Support Matrix:')
pd.DataFrame(data=list(prfs(predict.Actual_Excited.values,predict.Predicted_Excited.values)),
             columns=['Class_0','Class_1'],index=['Precision','Recall','F1_Score','Support'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


<keras.callbacks.History at 0x2ab053b29e8>

Model Accuracy: 0.7736666668256124

Confusion Matrix:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,1880,512
Actual_1,167,441


Precision - Recall - F1 Score - Support Matrix:


Unnamed: 0,Class_0,Class_1
Precision,0.918417,0.462749
Recall,0.785953,0.725329
F1_Score,0.847038,0.565022
Support,2392.0,608.0


Q7. Optimize the model (10 points)

In [26]:
SeqModel_Optim=Sequential()

SeqModel_Optim.add(Dense(32,activation='relu',input_dim=10))
SeqModel_Optim.add(Dense(1,activation='sigmoid'))

sgd=SGD(lr=0.01,decay=1e-6,momentum=0.9)

SeqModel_Optim.compile(optimizer=sgd,loss='binary_crossentropy',metrics=['accuracy'])

SeqModel_Optim.fit(x_train_res,y_train_res,epochs=20,batch_size=10,verbose=0)

predict['Predicted_Excited_Optimised']=SeqModel_Optim.predict(x_test)
predict.Predicted_Excited_Optimised=predict.Predicted_Excited_Optimised>0.5

loss,accuracy = SeqModel_Optim.evaluate(x_test,y_test,verbose=0)
print('Model Accuracy:',accuracy)

print('\nConfusion Matrix:')
pd.DataFrame(data=list(confusion_matrix(predict.Actual_Excited.values,predict.Predicted_Excited_Optimised.values)),
             columns=['Predicted_0','Predicted_1'],index=['Actual_0','Actual_1'])

print('Precision - Recall - F1 Score - Support Matrix:')
pd.DataFrame(data=list(prfs(predict.Actual_Excited.values,predict.Predicted_Excited_Optimised.values)),
             columns=['Class_0','Class_1'],index=['Precision','Recall','F1_Score','Support'])

<keras.callbacks.History at 0x2ab05915be0>

Model Accuracy: 0.7493333333333333

Confusion Matrix:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,1778,614
Actual_1,138,470


Precision - Recall - F1 Score - Support Matrix:


Unnamed: 0,Class_0,Class_1
Precision,0.927975,0.433579
Recall,0.743311,0.773026
F1_Score,0.825441,0.555556
Support,2392.0,608.0


In [27]:
MLP_Class_Model=MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=12,random_state=0)

MLP_Class_Model.fit(x_train,y_train)

predict['Predicted_Excited_MLP']=MLP_Class_Model.predict(x_test)

accuracy = MLP_Class_Model.score(x_test,y_test)
print('Model Accuracy:',accuracy)

print('\nConfusion Matrix:')
pd.DataFrame(data=list(confusion_matrix(predict.Actual_Excited.values,predict.Predicted_Excited_MLP.values)),
             columns=['Predicted_0','Predicted_1'],index=['Actual_0','Actual_1'])

print('Precision - Recall - F1 Score - Support Matrix:')
pd.DataFrame(data=list(prfs(predict.Actual_Excited.values,predict.Predicted_Excited_MLP.values)),
             columns=['Class_0','Class_1'],index=['Precision','Recall','F1_Score','Support'])

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=12, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

Model Accuracy: 0.8596666666666667

Confusion Matrix:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,2302,90
Actual_1,331,277


Precision - Recall - F1 Score - Support Matrix:


Unnamed: 0,Class_0,Class_1
Precision,0.874288,0.754768
Recall,0.962375,0.455592
F1_Score,0.916219,0.568205
Support,2392.0,608.0


Q8. Predict the results using 0.5 as a threshold (10 points)

Q9. Print the Accuracy score and confusion matrix (5 points)

In [28]:
keras.layers.ReLU(threshold=0.5)
SeqModel_Optim=Sequential()

SeqModel_Optim.add(Dense(32,activation='relu',input_dim=10))
SeqModel_Optim.add(Dense(1,activation='sigmoid'))

sgd=SGD(lr=0.01,decay=1e-6,momentum=0.9)

SeqModel_Optim.compile(optimizer=sgd,loss='binary_crossentropy',metrics=['accuracy'])

SeqModel_Optim.fit(x_train_res,y_train_res,epochs=20,batch_size=10,verbose=0)

predict['Predicted_Excited_Optimised']=SeqModel_Optim.predict(x_test)
predict.Predicted_Excited_Optimised=predict.Predicted_Excited_Optimised>0.5

loss,accuracy = SeqModel_Optim.evaluate(x_test,y_test,verbose=0)
print('Model Accuracy:',accuracy)

print('\nConfusion Matrix:')
pd.DataFrame(data=list(confusion_matrix(predict.Actual_Excited.values,predict.Predicted_Excited_Optimised.values)),
             columns=['Predicted_0','Predicted_1'],index=['Actual_0','Actual_1'])

print('Precision - Recall - F1 Score - Support Matrix:')
pd.DataFrame(data=list(prfs(predict.Actual_Excited.values,predict.Predicted_Excited_Optimised.values)),
             columns=['Class_0','Class_1'],index=['Precision','Recall','F1_Score','Support'])

<keras.layers.advanced_activations.ReLU at 0x2ab05be2940>

<keras.callbacks.History at 0x2ab05be2e80>

Model Accuracy: 0.8173333334922791

Confusion Matrix:


Unnamed: 0,Predicted_0,Predicted_1
Actual_0,2054,338
Actual_1,210,398


Precision - Recall - F1 Score - Support Matrix:


Unnamed: 0,Class_0,Class_1
Precision,0.907244,0.540761
Recall,0.858696,0.654605
F1_Score,0.882302,0.592262
Support,2392.0,608.0


In [29]:
predict.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
CreditScore,0.558,0.568,0.404,0.66,0.734,0.468,0.474,0.512,0.158,0.692
Geography,1,0.5,0.5,0,0.5,1,0.5,1,0,0
Gender,0,0,1,0,0,0,0,0,0,1
Age,0.162162,0.148649,0.202703,0.189189,0.202703,0.405405,0.283784,0.472973,0.283784,0.135135
Tenure,0.2,0.8,0.3,0.5,1,0.2,0.6,0.1,0.6,0.4
Balance,0.135568,0.518283,0.577775,0,0.407279,0.849533,0.405949,0.435755,0.191408,0.688115
NumOfProducts,0,0.333333,0,0,0.333333,0,0.333333,0,0,0
HasCrCard,1,0,1,1,1,1,1,1,1,1
IsActiveMember,0,1,0,1,0,0,0,1,0,1
EstimatedSalary,0.0978046,0.349223,0.294194,0.753435,0.116113,0.375784,0.035464,0.379278,0.374333,0.582355


In [31]:
predict.iloc[:,10:14].drop_duplicates().sort_values(by=['Actual_Excited',
                                                        'Predicted_Excited',
                                                        'Predicted_Excited_Optimised',
                                                        'Predicted_Excited_MLP']).reset_index(drop=True)

Unnamed: 0,Actual_Excited,Predicted_Excited,Predicted_Excited_Optimised,Predicted_Excited_MLP
0,0,False,False,0
1,0,False,False,1
2,0,False,True,0
3,0,False,True,1
4,0,True,False,0
5,0,True,False,1
6,0,True,True,0
7,0,True,True,1
8,1,False,False,0
9,1,False,True,0
