In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv(r"C:\Users\Asus\Downloads\ML DATSETS\BAnk\Churn_Modelling.csv")

# Inspect the data
print(df.head())
print(df.info())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [5]:
# 1. Define Target (y) and Features (X)
y = df['Exited']
# Drop irrelevant columns and the target variable
X = df.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)

# 2. Encode Categorical Features
# 'Geography' (3 values) and 'Gender' (2 values) must be converted to numbers.
# We'll use one-hot encoding for this.
X = pd.get_dummies(X, columns=['Geography', 'Gender'], drop_first=True)

# After encoding, X will have new columns like 'Geography_Germany', 
# 'Geography_Spain', and 'Gender_Male' (all 0s or 1s).

# 3. Split the data into training and test sets
# We use an 80/20 split. 
# 'stratify=y' ensures the proportion of churners (Exited=1) is the
# same in both the train and test sets, which is crucial for imbalanced datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (8000, 11)
Test set shape: (2000, 11)


In [6]:
# Identify columns that need scaling (all except the one-hot encoded dummies)
# Get a list of the dummy columns
dummy_cols = [col for col in X_train.columns if 'Geography_' in col or 'Gender_' in col]
# Get a list of columns to scale
cols_to_scale = [col for col in X_train.columns if col not in dummy_cols]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# Only transform the test data (using the scaler fit on X_train)
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Display the scaled data
print(X_train.head())

      CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
2151     1.058568  1.715086  0.684723 -1.226059      -0.910256   0.641042   
8392     0.913626 -0.659935 -0.696202  0.413288      -0.910256   0.641042   
5006     1.079274 -0.184931 -1.731895  0.601687       0.808830   0.641042   
4117    -0.929207 -0.184931 -0.005739 -1.226059       0.808830   0.641042   
7182     0.427035  0.955079  0.339492  0.548318       0.808830  -1.559960   

      IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
2151       -1.030206         1.042084              False            False   
8392       -1.030206        -0.623556               True            False   
5006        0.970680         0.308128               True            False   
4117       -1.030206        -0.290199              False            False   
7182        0.970680         0.135042               True            False   

      Gender_Male  
2151         True  
8392         True  
5006        Fa

In [7]:
# Get the number of input features
input_dim = X_train.shape[1] 

# Build the Sequential model
model = keras.Sequential([
    # Input layer
    keras.layers.Input(shape=(input_dim,)),
    
    # First hidden layer
    keras.layers.Dense(12, activation='relu'),
    
    # Second hidden layer
    keras.layers.Dense(8, activation='relu'),
    
    # Output layer
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer='adam',                # 'adam' is a good default optimizer
    loss='binary_crossentropy',      # Standard loss function for binary classification
    metrics=['accuracy']             # Metric to monitor
)

# Display the model's architecture
model.summary()

In [8]:
# Train the model
# We use 'validation_split' to automatically set aside a portion of the
# training data (e.g., 10%) to monitor validation loss and check for overfitting.
history = model.fit(
    X_train, 
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

# Evaluate the model on the test set
print("\n--- Model Evaluation ---")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Get detailed predictions
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int) # Convert probabilities to 0 or 1

# Print a detailed classification report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Did not Leave (0)', 'Left (1)']))

Epoch 1/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7867 - loss: 0.5058 - val_accuracy: 0.8100 - val_loss: 0.4469
Epoch 2/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8015 - loss: 0.4526 - val_accuracy: 0.8263 - val_loss: 0.4235
Epoch 3/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8126 - loss: 0.4362 - val_accuracy: 0.8375 - val_loss: 0.4129
Epoch 4/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8179 - loss: 0.4277 - val_accuracy: 0.8375 - val_loss: 0.4073
Epoch 5/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8224 - loss: 0.4206 - val_accuracy: 0.8413 - val_loss: 0.4006
Epoch 6/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8274 - loss: 0.4141 - val_accuracy: 0.8375 - val_loss: 0.3943
Epoch 7/100
[1m225/22