In [0]:
# ANN
# dataset description :-
# fictional dataset of a bank having 10,000 customers and therefore, 10,000 rows
# churn rate (rate at which people are leaving the bank) - we need to predict which of the customers are
# at highest risk of leaving - classification problem
# columns correspond to diff features of the customers like gender, country, num of products, age, balance, etc
# last column is whether the customer exited or not

In [0]:
# MADE IN GOOGLE COLABORATORY
# final model saved as "FINAL_CLASSIFIER.h5"

DATA PREPROCESSING

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
dataset = pd.read_csv('Churn_Modelling.csv')

In [54]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [55]:
X = dataset.iloc[:, 3:13].values # rowNumber, customerId and surname play NO role in churn rate 
y = dataset.iloc[:, 13].values
X.shape, y.shape

((10000, 10), (10000,))

In [56]:
X[0]

array([619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
      dtype=object)

In [57]:
y

array([1, 0, 1, ..., 1, 1, 0])

In [58]:
y[0] # y contains values 0 or 1

1

In [0]:
# encode categorical variables (country and gender) before splitting the data 

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#encode country
encoder1 = LabelEncoder()
X[:,1] = encoder1.fit_transform(X[:,1])

# encode gender - we'll not onehot encode this cz it only has 2 categories - since we'll be removing one of the columns to avoid 
# dummy variable trap, it'll be no use to one hot encode this
encoder2 = LabelEncoder()
X[:,2] = encoder2.fit_transform(X[:,2])

In [61]:
# one hot encode country
ohe = OneHotEncoder(categorical_features = [1])
X = ohe.fit_transform(X).toarray()
X[0], X[0].shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
        0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05]), (12,))

In [62]:
# remove dummy variable we get after ohe
X = X[:,1:] # will remove 1st column
X[0], X[0].shape

(array([0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
        4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0134888e+05]), (11,))

In [63]:
X.shape, y.shape

((10000, 11), (10000,))

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [65]:
X_train.shape, y_train.shape

((8000, 11), (8000,))

In [66]:
X_test.shape, y_test.shape

((2000, 11), (2000,))

In [0]:
# do feature scaling - we dont want to have one independent var dominating another one
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [68]:
X_train[0]

array([-0.5698444 ,  1.74309049,  0.16958176, -1.09168714, -0.46460796,
        0.00666099, -1.21571749,  0.8095029 ,  0.64259497, -1.03227043,
        1.10643166])

In [69]:
X_test[0]

array([ 1.75486502, -0.57369368, -0.55204276, -1.09168714, -0.36890377,
        1.04473698,  0.8793029 , -0.92159124,  0.64259497,  0.9687384 ,
        1.61085707])

BUILDING THE ANN MODEL

In [0]:
import keras
from keras.models import Sequential 
from keras.layers import Dense

In [21]:
classifier = Sequential() # initialising the model

W0728 07:55:27.237307 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [22]:
# add input layer and first hidden layer
# no of nodes in input layer = 11 ( X has 11 features)
# no of nodes in output layer = 1 ( coresponding to y value)
# choose 6 for no of nodes in hidden layer ( average of 11 and 1)
classifier.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu', input_dim = 11)) 

W0728 07:55:27.299245 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0728 07:55:27.323942 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [0]:
# add second hidden layer
classifier.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu')) 

In [0]:
# add output layer
classifier.add(Dense(units = 1, kernel_initializer='uniform', activation = 'sigmoid'))

In [25]:
# compile the ann
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# optimizer - the algo we want to use to find the optimal set of weights in the nn - adam optimizer is a v efficient type
# of stochastic grad optimization
# loss - the loss function used within the sgd adam algo 
# binary_crossentropy - used with binary classification
# categorical_crossentropy - used with multiclass classification
# metrics - metrics used by algo after every iteration to improve performance

W0728 07:55:27.428335 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0728 07:55:27.457488 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0728 07:55:27.466404 139705763219328 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
# fitting the ann to the training set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

W0728 07:55:27.768816 139705763219328 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f0f75408e10>

In [0]:
# save the model so we dont have to train again and again
import pickle
classifier.save("classifier.h5")

In [0]:
# accuracy converging at about 86%

MAKING PREDICTIONS AND EVALUATING THE MODEL

In [0]:
y_pred = classifier.predict(X_test)

In [30]:
y_pred # gives probabilities that a customer will leave the bank

array([[0.19588643],
       [0.32081297],
       [0.09965912],
       ...,
       [0.21925613],
       [0.11504003],
       [0.27958232]], dtype=float32)

In [31]:
# convert y_pred in the form true/false
y_pred = (y_pred > 0.5) # return true if y_pred > 0.5
y_pred, y_pred.shape

(array([[False],
        [False],
        [False],
        ...,
        [False],
        [False],
        [False]]), (2000, 1))

In [32]:
# make the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1507,   88],
       [ 196,  209]])

In [33]:
# out of 2000 observations, we get 1544+146 correct predictions and 51+259 wrong predictions
# accuracy = (no of correct preds)/(no of total preds)
accuracy = (1544+146)/2000
accuracy # on test set

0.845

In [0]:
# results
# training set accuracy = ~86%
# test set accuracy = ~84%

PREDICTING RESULTS FOR NEW TEST POINT

In [0]:
new_test_point = np.array([[0,0,600,1,40,3,60000,2,1,1,50000]])
new_test_point = sc.transform(new_test_point) # scaling
#new_test_point = new_test_point.reshape(-1,1)
new_pred = classifier.predict(new_test_point)

In [36]:
new_pred

array([[0.03879036]], dtype=float32)

In [37]:
new_pred = (new_pred > 0.5)
new_pred

array([[False]])

In [0]:
# the new customer doesnt leave the bank

EVALUATING THE MODEL

In [0]:
# apply kfold cross validation

In [0]:
from keras.wrappers.scikit_learn import KerasClassifier
# we are training the model using keras and using kfold cross validation which belongs to scklearn - KerasClassifier is a wrapper
# that helps us combine these 2 libraries"

In [0]:
from sklearn.model_selection import cross_val_score

In [0]:
def build_classifier():
    clf = Sequential()
    clf.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu', input_dim = 11)) 
    clf.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu')) 
    clf.add(Dense(units = 1, kernel_initializer='uniform', activation = 'sigmoid'))
    clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return clf

In [0]:
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 10, epochs = 100)

In [0]:
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 2, n_jobs  =-1)

In [0]:
accuracies2 = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs  =-1)
# n_jobs = -1 means all CPUs are used
# cv - 10 fold

In [46]:
accuracies2

array([0.85749999, 0.83749999, 0.88      , 0.82625   , 0.86999999,
       0.83      , 0.83375   , 0.85875   , 0.81125   , 0.84749999])

In [47]:
mean = accuracies2.mean()
mean

0.8452499948441983

In [48]:
variance = accuracies2.std()
variance

0.020246912674531055

In [0]:
# save the accuracies array
np.save('10_fold_cross_val_accuracies', accuracies2)

In [0]:
# load the accuracies object
accuracies2 = np.load('10_fold_cross_val_accuracies.npy')

In [0]:
# no need to apply dropout regularization since all accuracies in cross validation quite close to each other - no 
# overfitting is there

TUNING THE ANN

In [0]:
# apply parameter tuning
# grid search - will find out best values for the hyperparameters by trying out different combinations

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
def build_classifier_gs():
    clf = Sequential()
    clf.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu', input_dim = 11)) 
    clf.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu')) 
    clf.add(Dense(units = 1, kernel_initializer='uniform', activation = 'sigmoid'))
    clf.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return clf

In [0]:
clf_tune = KerasClassifier(build_fn = build_classifier_gs)

In [0]:
def build_classifier_grid_search():
  # dictionary consisting of different hyperparameters and their values we want to test out - grid search will return the best combinations out of these values
  parameters = {'batch_size' : [10, 25, 32],
             'nb_epoch' : [100,250, 500]}
  grid_search = GridSearchCV(estimator = clf_tune, param_grid = parameters, scoring = 'accuracy', cv = 10)
  return grid_search

In [107]:
grid_search = build_classifier_grid_search()
grid_search

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f582ccc87b8>,
             iid='warn', n_jobs=None,
             param_grid={'batch_size': [10, 25, 32],
                         'nb_epoch': [100, 250, 500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [0]:
grid_search = grid_search.fit(X_train, y_train)

In [0]:
best_parameters = grid_search.best_params_

In [0]:
best_accuracy = grid_search.best_score_

PARAMETERS SELECTED BY RUNNING GRID SEARCH WERE :
batch_size = 25
nb_epoch = 500

BUILDING THE ANN MODEL USING PARAMETERS SELECTED BY GRID SEARCH

In [109]:
clf_grid = Sequential()
clf_grid.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu', input_dim = 11))
clf_grid.add(Dense(units = 6, kernel_initializer='uniform', activation = 'relu')) 
clf_grid.add(Dense(units = 1, kernel_initializer='uniform', activation = 'sigmoid'))
clf_grid.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
clf_grid.fit(X_train, y_train, batch_size = 25, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f580b101f60>

Final accuracy obtained - 86.15%

In [0]:
import pickle
clf_grid.save("FINAL_CLASSIFIER.h5")

In [0]:
# end