In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

import keras # based on tensorflow
from keras.models import Sequential # initializes neural network
from keras.layers import Dense # for implementing ANN layers, it also takes care of weight initilization for us
from keras.layers import Dropout

from keras.wrappers.scikit_learn import KerasClassifier # keras wrapper for k-fold-cross validation
from sklearn.model_selection import cross_val_score # normal cross validation
from sklearn.model_selection import GridSearchCV # cross validation for hyperparameter tuning

In [8]:
dataset = pd.read_csv("Churn_Modelling.csv")
# Columns 0-2 (RowNumber, CustomerId, Surname) have no effect on whether a customer
# is going to leave the bank, so the feature matrix starts at column 3.
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values  # target: column 13

# Encode categorical data: first turn the non-numeric columns into integer codes.
labelencoder_X_1 = LabelEncoder()  # country: france/germany/spain
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

labelencoder_X_2 = LabelEncoder()  # gender: male/female
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# One-hot encode the country column only; gender has just two values, so 0/1 is enough.
# `OneHotEncoder(categorical_features=[1])` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so the column is encoded on its own and stitched back in.
# The resulting layout matches the old behaviour: dummy columns first, followed by
# the remaining feature columns in their original order, all as floats.
onehotencoder = OneHotEncoder()
country_dummies = onehotencoder.fit_transform(X[:, [1]].astype(int)).toarray()
X = np.hstack([country_dummies, np.delete(X, 1, axis=1)]).astype(float)

# Avoid the dummy variable trap: drop the first dummy column. When the two
# remaining country dummies are both zero, the row belongs to the dropped country.
X = X[:, 1:]

In [9]:
# Build the ANN. ReLU is usually the best activation for hidden layers;
# the output layer uses sigmoid (binary output) or softmax (more than two classes).
classifier = Sequential()

# Input layer + first hidden layer.
# units: number of neurons in the layer | kernel_initializer: initial weight
# distribution | input_dim: number of input features.
# (Keras 2 argument names; the Keras 1 names output_dim/init/p only work through
# a deprecated compatibility shim that emits warnings.)
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
# Dropout regularization: randomly drop 10% of this layer's outputs during training.
classifier.add(Dropout(rate=0.1))

# Second hidden layer, followed by the same dropout.
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
classifier.add(Dropout(rate=0.1))

# Output layer: the target is 0/1, so a single sigmoid node is enough.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compile the ANN: the adam optimizer (a stochastic gradient descent variant)
# minimizes the binary cross-entropy loss; accuracy is reported as a metric.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


  import sys
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  app.launch_new_instance()


In [10]:
# Scale features. The scaler learns mean and standard deviation from the training
# set only; the test set and any new sample must be transformed with those same
# training statistics. Refitting on the test set would scale it differently from
# the data the network was trained on.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit (train) the ANN on the training set using mini-batches of 10 samples.
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

# Predict churn probabilities for the test set and threshold them at 0.5.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)  # boolean array: True = predicted to leave

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Predict for a single new customer. The 11 values must follow the column order
# of X (the two remaining country dummies first, then the other features) and
# are scaled with the scaler fitted on the training set.
new_pred = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
new_pred = (new_pred > 0.5)


In [None]:
# Cross-validation of the neural network (used here first just to gauge the bias-variance trade-off):

# create classifier object and return it:
def build_classifier():
    """Build and compile a fresh, untrained ANN for binary classification.

    The network has 11 inputs, two hidden layers of 6 ReLU units (each followed
    by 10% dropout) and a single sigmoid output unit. It is compiled with the
    adam optimizer, binary cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Sequential model: a plain stack of layers.
    # ReLU is usually the best activation for hidden layers; the output layer
    # uses sigmoid (binary output) or softmax (more than two classes).
    classifier = Sequential()

    # Input layer + first hidden layer.
    # units: number of neurons | kernel_initializer: initial weight distribution
    # | input_dim: number of input features.
    # (Keras 2 argument names; the Keras 1 names output_dim/init/p are deprecated.)
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
    # Dropout regularization: randomly drop 10% of this layer's outputs during training.
    classifier.add(Dropout(rate=0.1))

    # Second hidden layer, followed by the same dropout.
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dropout(rate=0.1))

    # Output layer: the target is 0/1, so a single sigmoid node is enough.
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

    # The adam optimizer minimizes the binary cross-entropy loss; accuracy is tracked.
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return classifier

# Wrap the model builder so scikit-learn can treat the network as an estimator.
classifier = KerasClassifier(
    build_fn=build_classifier,
    batch_size=10,
    epochs=100,
)

# 10-fold cross-validation on the training set.
# n_jobs=1 runs the folds sequentially (n_jobs=-1 would use all CPU cores).
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)
print(accuracies)

# NOTE: `variance` actually holds the standard deviation of the fold accuracies.
# A large spread between folds is a sign of overfitting on the training set.
mean, variance = accuracies.mean(), accuracies.std()
print(mean, variance)

In [11]:
# Hyperparameter tuning:
def build_classifier(optimizer='adam'):
    """Build and compile a fresh ANN whose optimizer is a tunable hyperparameter.

    The network has 11 inputs, two hidden layers of 6 ReLU units and a single
    sigmoid output unit (no dropout), compiled with binary cross-entropy loss
    and the accuracy metric.

    Args:
        optimizer: Optimizer passed to ``compile``. Defaults to ``'adam'`` so
            the builder can also be used without a grid search supplying it.

    Returns:
        The compiled ``Sequential`` model.
    """
    classifier = Sequential()
    # Keras 2 argument names (units / kernel_initializer) replace the deprecated
    # Keras 1 names (output_dim / init).
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

# Batch size, epochs and optimizer are left unset here; the grid search supplies them.
classifier = KerasClassifier(build_fn=build_classifier)

# Hyperparameters to tune and the candidate values to try for each.
parameters = dict(
    batch_size=[25, 32],
    epochs=[100, 500],
    optimizer=['adam', 'rmsprop'],
)

# Exhaustive search over the grid, scoring each combination by accuracy
# with 5-fold cross-validation on the training set.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)  # fit() returns the same (now fitted) object

# Best hyperparameter combination and its mean cross-validated accuracy.
best_parameters, best_accuracy = grid_search.best_params_, grid_search.best_score_

In [13]:
# Report the winning hyperparameters and their cross-validated accuracy, one per line.
print(best_parameters, best_accuracy, sep="\n")