## ANN Implementation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras import regularizers
import datetime
import numpy as np
import pickle

# Seed every random number generator the pipeline can draw from in one call:
# Python's built-in `random`, NumPy, and TensorFlow. Seeding only NumPy and
# TensorFlow leaves Python's `random` module unseeded.
tf.keras.utils.set_random_seed(42)

## Load the dataset
# Read the workbook and, in the same expression, keep only the columns whose
# name does not start with "Unnamed".
data = pd.read_excel("Churn_Modelling_sample.csv.xlsx").loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]


## Preprocess the data
# Drop the columns that are not used as model inputs
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

# Label-encode Gender into the integer column "Gender_encoded".
# The fitted encoder is pickled inside the guard: when the column is absent
# there is no encoder object, and saving it unconditionally would raise a
# NameError.
if "Gender" in data.columns:
    label_encoder_gender = LabelEncoder()
    data["Gender_encoded"] = label_encoder_gender.fit_transform(data["Gender"])
    data = data.drop("Gender", axis=1)

    with open("label_encoder_gender.pkl", "wb") as file:
        pickle.dump(label_encoder_gender, file)
else:
    print("Gender column not found, skipping encoding")

# One-hot encode Geography into one 0/1 column per category.
if "Geography" in data.columns:
    onehot_encoder_geo = OneHotEncoder(sparse_output=False)
    geo_encoded = onehot_encoder_geo.fit_transform(data[["Geography"]])
    geo_encoded_df = pd.DataFrame(
        geo_encoded,
        columns=onehot_encoder_geo.get_feature_names_out(["Geography"]),
        # Reuse the frame's own index so the column-wise concat below lines
        # rows up by label even if `data` does not carry a default RangeIndex.
        index=data.index,
    )
    data = pd.concat([data.drop("Geography", axis=1), geo_encoded_df], axis=1)

    # Saved only when the encoder was actually fitted (same reasoning as above).
    with open("onehot_encoder_geo.pkl", "wb") as file:
        pickle.dump(onehot_encoder_geo, file)
else:
    print("Geography column not found, skipping encoding")

# Separate the target from the predictors. "Exited" is the label, so it is
# kept out of X and is therefore never passed through the feature scaler.
y = data["Exited"]

# Predictors: everything except the label, restricted to numeric columns so
# that any column still holding non-numeric values is left out.
X = data.drop(columns="Exited").select_dtypes(include=[np.number])

print(f"Feature columns: {X.columns.tolist()}")
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

## Build the ANN model
# Two hidden ReLU layers, each followed by dropout, then a single sigmoid
# unit that outputs the probability for the binary target.
# The input width is declared with an explicit Input layer instead of the
# `input_shape=` keyword on the first Dense layer; recent Keras versions emit
# a UserWarning for the keyword form in Sequential models.
n_features = X_train_scaled.shape[1]
model = Sequential([
    tf.keras.Input(shape=(n_features,)),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])

# Compile the model: Adam optimiser, binary cross-entropy loss, and
# accuracy / precision / recall tracked during training
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy", "precision", "recall"]
)

# Print model summary
model.summary()

## Set up TensorBoard logging
# Each run writes to its own timestamped sub-directory of logs/fit. The
# trailing slash matters: without it the timestamp is glued onto the
# directory name ("logs/fit2025...") and runs are not grouped under logs/fit.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,  # Log histograms every epoch
    write_graph=True,
    write_images=True,
    update_freq='epoch'
)

## Set up Early Stopping
# Stop once val_loss has failed to improve by at least min_delta for
# `patience` consecutive epochs, and roll the weights back to the best epoch.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
    verbose=1
)

# Train the model
# NOTE(review): the held-out test split is also the validation set that
# drives early stopping and best-weight restoration, so a later score on that
# same split would not be an independent estimate - confirm this is intended.
print("Starting training...")
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=32,
    epochs=100,  # upper bound; early stopping may end training sooner
    verbose=1,
    callbacks=[tensorboard_callback, early_stopping_callback],
    validation_data=(X_test_scaled, y_test),
)

# Persist the trained network to an HDF5 file and print its layer summary
model.save('churn_model.h5')
model.summary()



Feature columns: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Gender_encoded', 'Geography_France', 'Geography_Spain']
X shape: (5, 11), y shape: (5,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Starting training...
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.7500 - loss: 0.5783 - precision: 1.0000 - recall: 0.6667 - val_accuracy: 0.0000e+00 - val_loss: 0.6995 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.5000 - loss: 0.7688 - precision: 0.6667 - recall: 0.6667 - val_accuracy: 1.0000 - val_loss: 0.6927 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.7500 - loss: 0.5517 - precision: 0.7500 - recall: 1.0000 - val_accuracy: 1.0000 - val_loss: 0.6831 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 0.5000 - loss: 0.5145 - precision: 0.6667 - recall: 0.6667 - val_accuracy: 1.0000 - val_loss: 0.6741 - val_precision: 1.0000 - val_recall: 1.000



In [2]:
# Sanity check: after standardisation the first feature's training mean should be ~0
np.mean(X_train_scaled[:, 0])

np.float64(0.0)

In [4]:
# Sanity check: after standardisation the first feature's training std should be ~1
np.std(X_train_scaled[:, 0])

np.float64(1.0)

In [6]:
%load_ext tensorboard
%tensorboard --logdir logs/fit20250815-110854

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 15088), started 0:00:03 ago. (Use '!kill 15088' to kill it.)

In [15]:
# Shape of the scaled training matrix as (rows, features)
X_train_scaled.shape

(4, 11)

In [16]:
# Shape of the unscaled training frame as (rows, features)
X_train.shape

(4, 11)