In [26]:
import pandas as pd
import numpy as np

In [27]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Importing pickle for serializing and de-serializing Python object structures
import pickle

In [28]:
# Replace 'your_file.csv' with the actual file name
df = pd.read_csv('Churn_Modelling.csv')

In [29]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [30]:
# Display basic information about the dataframe
df_info = df.info()
df_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [31]:
# Display summary statistics of the dataframe
df_description = df.describe()
df_description

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [32]:
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [33]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the Gender column
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Display the first few rows to verify the encoding
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [34]:
df['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [35]:
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

### One Hot Encoding

One hot encoding is a technique used to convert categorical data into a format that can be provided to machine learning algorithms to improve predictions. This technique is particularly useful when dealing with categorical variables that have no ordinal relationship.

In one hot encoding, each category is converted into a new binary column. Each column corresponds to one possible category, and the value is 1 if the original category matches the column, and 0 otherwise.

For example, consider a categorical variable "Geography" with three possible values: "France", "Spain", and "Germany". One hot encoding will convert this variable into three binary columns:

| Geography_France | Geography_Spain | Geography_Germany |
|------------------|-----------------|-------------------|
| 1                | 0               | 0                 |
| 0                | 1               | 0                 |
| 1                | 0               | 0                 |
| 0                | 0               | 1                 |

This transformation allows the machine learning algorithm to understand the categorical data without assuming any ordinal relationship between the categories.

In [36]:
one_hot_encoder = OneHotEncoder()

In [38]:
geo_encoded = one_hot_encoder.fit_transform(df['Geography'].values.reshape(-1, 1)).toarray()
"""
Encodes the 'Geography' column of the DataFrame using one-hot encoding.

The 'Geography' column is reshaped and transformed into a one-hot encoded array.

Note:
    Ensure that 'one_hot_encoder' is an instance of a one-hot encoder (e.g., OneHotEncoder from sklearn) 
    and 'df' is a pandas DataFrame containing the 'Geography' column.

Returns:
    numpy.ndarray: A 2D array with one-hot encoded values of the 'Geography' column.
"""

"\nEncodes the 'Geography' column of the DataFrame using one-hot encoding.\n\nThe 'Geography' column is reshaped and transformed into a one-hot encoded array.\n\nNote:\n    Ensure that 'one_hot_encoder' is an instance of a one-hot encoder (e.g., OneHotEncoder from sklearn) \n    and 'df' is a pandas DataFrame containing the 'Geography' column.\n\nReturns:\n    numpy.ndarray: A 2D array with one-hot encoded values of the 'Geography' column.\n"

In [39]:
geo_encoded

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [42]:
one_hot_encoder.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [43]:
geo_encoded_df = pd.DataFrame(geo_encoded, columns=one_hot_encoder.get_feature_names_out(['Geography']))
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [44]:
geo_encoded_df.shape

(10000, 3)

In [45]:
df = pd.concat([df, geo_encoded_df], axis=1)

In [46]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,France,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,France,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [47]:
df.drop('Geography', axis=1, inplace=True)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [48]:
df.dtypes

CreditScore            int64
Gender                 int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_France     float64
Geography_Germany    float64
Geography_Spain      float64
dtype: object

In [50]:
# Save the LabelEncoder for Gender
with open('gender_label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

# Save the OneHotEncoder for Geography
with open('geography_one_hot_encoder.pkl', 'wb') as file:
    pickle.dump(one_hot_encoder, file)

In [51]:
# Split the data into features and target
X = df.drop('Exited', axis=1)
y = df['Exited']

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Why Scaling is Done

Scaling is an essential preprocessing step in many machine learning algorithms. The primary reasons for scaling are:

1. **Improving Convergence Speed**: Many optimization algorithms, such as gradient descent, converge faster when the features are on a similar scale. If the features have vastly different scales, the algorithm may take longer to find the optimal solution.

2. **Avoiding Dominance of Features**: Features with larger scales can dominate the learning process, leading to biased models. Scaling ensures that each feature contributes equally to the model.

3. **Enhancing Model Performance**: Some machine learning algorithms, such as Support Vector Machines (SVM) and K-Nearest Neighbors (KNN), are sensitive to the scale of the data. Scaling can improve the performance and accuracy of these models.

4. **Ensuring Consistency**: Scaling ensures that the model treats all features consistently, which is particularly important when combining features with different units or magnitudes.

Common scaling techniques include:
- **Standardization**: Subtracting the mean and dividing by the standard deviation, resulting in features with zero mean and unit variance.
- **Normalization**: Scaling the features to a fixed range, typically [0, 1] or [-1, 1].

In this notebook, we used `StandardScaler` from `sklearn.preprocessing` to standardize the features, ensuring that they have a mean of 0 and a standard deviation of 1.

In [53]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [55]:
# Save the scaler in pickle file
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [56]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


### Artificial Neural Networks (ANN)

Artificial Neural Networks (ANN) are computational models inspired by the human brain. They are designed to recognize patterns and make decisions based on data. ANNs consist of interconnected layers of nodes (neurons), where each connection has an associated weight.

#### Key Components of ANN:

1. **Input Layer**: 
    - The input layer receives the input data. Each neuron in this layer represents a feature in the dataset.

2. **Hidden Layers**: 
    - Hidden layers are intermediate layers between the input and output layers. They perform computations and extract features from the input data. An ANN can have one or more hidden layers.

3. **Output Layer**: 
    - The output layer produces the final output of the network. The number of neurons in this layer depends on the type of task (e.g., binary classification, multi-class classification, regression).

4. **Weights and Biases**: 
    - Weights are parameters that determine the strength of the connection between neurons. Biases are additional parameters that allow the model to fit the data better.

5. **Activation Functions**: 
    - Activation functions introduce non-linearity into the network, enabling it to learn complex patterns. Common activation functions include ReLU (Rectified Linear Unit), Sigmoid, and Tanh.

6. **Loss Function**: 
    - The loss function measures the difference between the predicted output and the actual output. The goal of training is to minimize this loss. Common loss functions include Mean Squared Error (MSE) for regression and Cross-Entropy Loss for classification.

7. **Optimization Algorithm**: 
    - Optimization algorithms, such as Gradient Descent, are used to update the weights and biases to minimize the loss function. Variants like Stochastic Gradient Descent (SGD) and Adam are commonly used.

#### Training Process:

1. **Forward Propagation**: 
    - Input data is passed through the network, and the output is computed.

2. **Loss Calculation**: 
    - The loss function calculates the error between the predicted output and the actual output.

3. **Backward Propagation**: 
    - The error is propagated back through the network, and the gradients of the loss function with respect to the weights and biases are computed.

4. **Weight Update**: 
    - The weights and biases are updated using the optimization algorithm to minimize the loss.

5. **Iteration**: 
    - Steps 1-4 are repeated for a specified number of epochs or until the loss converges.

#### Applications of ANN:

- Image and Speech Recognition
- Natural Language Processing (NLP)
- Financial Forecasting
- Medical Diagnosis
- Autonomous Vehicles

ANNs are powerful tools for solving complex problems, and their performance can be enhanced by using techniques like regularization, dropout, and batch normalization.

In [58]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [59]:
# Build the ANN model
model = Sequential(
    [
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Hidden layer 1 connected to input layer
        Dense(32, activation='relu'), # Hidden layer 2 
        Dense(1, activation='sigmoid') # Output layer
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [60]:
# Summary of the model
model.summary()

In [61]:
# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

In [62]:
# Loss function
loss = tf.keras.losses.BinaryCrossentropy()

In [63]:
# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [64]:
# Setup TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [65]:
# Callbacks
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [66]:
# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [67]:
# Train the model
history = model.fit(
    X_train, y_train, validation_data=(X_test, y_test), 
    epochs=100, 
    callbacks=[tensorboard_callback, early_stopping]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8143 - loss: 0.4245 - val_accuracy: 0.8485 - val_loss: 0.3645
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8578 - loss: 0.3498 - val_accuracy: 0.8505 - val_loss: 0.3529
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8623 - loss: 0.3352 - val_accuracy: 0.8675 - val_loss: 0.3445
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8611 - loss: 0.3421 - val_accuracy: 0.8545 - val_loss: 0.3400
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8553 - loss: 0.3408 - val_accuracy: 0.8620 - val_loss: 0.3332
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8608 - loss: 0.3355 - val_accuracy: 0.8600 - val_loss: 0.3464
Epoch 7/100
[1m250/25

In [68]:
# Save the model
model.save('customer_churn_model.h5')



In [69]:
# Load TensorBoard extension
%load_ext tensorboard

In [73]:
# Start TensorBoard
%tensorboard --logdir logs/fit/20250308-220917

Reusing TensorBoard on port 6007 (pid 40552), started 0:06:20 ago. (Use '!kill 40552' to kill it.)