In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [2]:
data = pd.read_csv('Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
#drop irrelevant columns
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
#converting categorical variables to numerical using label encoding
label_encoder_gender = LabelEncoder()
data['Gender']= label_encoder_gender.fit_transform(data['Gender'])
data.head()


#here all female become 0 and all male become 1

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


**why we are applying this on geography column as well?**
->  because it is also a categorical variable and we need to convert it into numerical format for our model to understand it. We can use one-hot encoding for this column as it has more than two categories.

In [5]:
#one hot encoding for geography column
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder_geography = OneHotEncoder()
geo_encoder = one_hot_encoder_geography.fit_transform(data[['Geography']])
geo_encoder


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [6]:
geo_encoder.toarray()
# if we dont convert it to array it will be in sparse matrix format which is not suitable for our model 
# and it will be in compressed format which is not suitable for our model 10000 to 1 columns will be in sparse format and it will be in compressed format which is not suitable for our model

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [7]:
#check for with feature we perform one hot encoding
one_hot_encoder_geography.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [8]:
geo_encode_df = pd.DataFrame(geo_encoder.toarray(), columns=one_hot_encoder_geography.get_feature_names_out(['Geography']))
geo_encode_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [9]:
#combine the original data with the one hot encoded data
data = pd.concat([data.drop('Geography', axis=1), geo_encode_df], axis=1)
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [10]:
#save encoder to use it in future when we will deploy our model
with open('label_encoder_gender.pkl', 'wb') as f:
    pickle.dump(label_encoder_gender, f)
with open('one_hot_encoder_geography.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder_geography, f)

In [11]:
#divide data into dependent and independent variables
X = data.drop('Exited', axis=1) #independent variables
y = data['Exited'] #dependent variable

#split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
X_train_scaled

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [13]:
print(X_train.columns)

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France',
       'Geography_Germany', 'Geography_Spain'],
      dtype='object')


In [14]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# ANN Implementation

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime


In [16]:
X_train.shape[1]

12

In [17]:
#building ann model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)), #1st hidden layer with 16 neurons and relu activation function and connected to input layer with 12 features
    Dense(32, activation='relu'), #2nd hidden layer with 32 neurons and relu activation function
    Dense(1, activation='sigmoid') #output layer with 1 neuron and sigmoid activation function for binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
model.summary()

learning rate = fundametal hyperparameter that control how much models internal parameter(weights)
are adjusted with respect to loss gradient during training of the model. it helps to determine the steps taken to minimize the loss function values ranges from 0.1 to 0.00001

also their are various optimizer present like adam, adamW, adammax etccc

In [19]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy()

''' 
    optimizer: Adam is an optimization algorithm that can be used instead of the classical stochastic gradient descent procedure to update network weights iteratively based on training data. It combines the advantages of two other extensions of stochastic gradient descent, namely Adaptive Gradient Algorithm (AdaGrad) and Root Mean Square Propagation (RMSProp). Adam computes adaptive learning rates for each parameter.


    loss: Binary Crossentropy is a loss function commonly used in binary classification problems. It measures the difference between the predicted probabilities and the actual binary labels. The goal of training a model using binary crossentropy is to minimize this loss, which indicates how well the model's predictions match the true labels.

    for multiclass classification problems, you would typically use Categorical Crossentropy as the loss function instead of Binary Crossentropy.
'''

" \n    optimizer: Adam is an optimization algorithm that can be used instead of the classical stochastic gradient descent procedure to update network weights iteratively based on training data. It combines the advantages of two other extensions of stochastic gradient descent, namely Adaptive Gradient Algorithm (AdaGrad) and Root Mean Square Propagation (RMSProp). Adam computes adaptive learning rates for each parameter.\n\n\n    loss: Binary Crossentropy is a loss function commonly used in binary classification problems. It measures the difference between the predicted probabilities and the actual binary labels. The goal of training a model using binary crossentropy is to minimize this loss, which indicates how well the model's predictions match the true labels.\n\n    for multiclass classification problems, you would typically use Categorical Crossentropy as the loss function instead of Binary Crossentropy.\n"

In [20]:
#compile the model
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [21]:
#set up the tensorboard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

''' 
    Tenssorborad is just to visualize the logs of the training process, it helps us to see how the loss and accuracy are changing over time during training. It also allows us to compare different runs of the model and see which one is performing better. We can also visualize the weights and biases of the model and see how they are changing during training.
'''

' \n    Tenssorborad is just to visualize the logs of the training process, it helps us to see how the loss and accuracy are changing over time during training. It also allows us to compare different runs of the model and see which one is performing better. We can also visualize the weights and biases of the model and see how they are changing during training.\n'

In [22]:

tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [23]:
#setup early stopping 
''' 
    Early stopping is a regularization technique used to prevent overfitting in machine learning models. It works by monitoring the performance of the model on a validation set during training and stopping the training process when the performance starts to degrade. This helps to ensure that the model does not continue to learn from the training data to the point where it starts to memorize it, which can lead to poor generalization on new, unseen data.
'''

earlystopping_callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [24]:
#train the model
history =  model.fit(X_train_scaled , y_train, validation_data=(X_test_scaled, y_test), epochs=50, batch_size=32, callbacks=[earlystopping_callback, tensorflow_callback])

Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8056 - loss: 0.4413 - val_accuracy: 0.8520 - val_loss: 0.3640
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8569 - loss: 0.3597 - val_accuracy: 0.8605 - val_loss: 0.3418
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8638 - loss: 0.3377 - val_accuracy: 0.8575 - val_loss: 0.3488
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8638 - loss: 0.3402 - val_accuracy: 0.8545 - val_loss: 0.3531
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8617 - loss: 0.3399 - val_accuracy: 0.8600 - val_loss: 0.3423
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8624 - loss: 0.3316 - val_accuracy: 0.8535 - val_loss: 0.3460
Epoch 7/50
[1m250/250[0m 

In [25]:
model.save('ann_model.h5')



In [26]:
#load tensorboard extension
%reload_ext tensorboard
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 18612), started 23:17:28 ago. (Use '!kill 18612' to kill it.)

In [27]:
#load pickled model