# Preprocessing

In [103]:
import pandas as pd
import numpy as np
import pickle
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("Churn_Modelling.csv",sep=",",header=0)

In [3]:
cleaned_df = df.drop(["CustomerId","RowNumber","Surname"],axis=1)

In [4]:
cleaned_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
cleaned_df["Geography"]

0        France
1         Spain
2        France
3        France
4         Spain
         ...   
9995     France
9996     France
9997     France
9998    Germany
9999     France
Name: Geography, Length: 10000, dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

cleaned_df["Gender"] = le.fit_transform(cleaned_df['Gender'])

In [7]:
cleaned_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [8]:
with open("label_encoding_gender.pkl","wb") as file:
    pickle.dump(le,file)

In [9]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
geo_data = ohe.fit_transform(cleaned_df[["Geography"]])

In [10]:
with open("one_hot_encoding_geography.pkl","wb") as file:
    pickle.dump(ohe,file)

In [11]:
geo_data[0:5]

<5x3 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [12]:
geo_columns = ohe.get_feature_names_out(["Geography"])

In [13]:
geography_df = pd.DataFrame(geo_data.toarray(), columns=geo_columns)

In [14]:
final_df = pd.concat([geography_df,cleaned_df.drop(["Geography"],axis=1)],axis=1)

In [15]:
x = final_df.drop(["Exited"],axis=1)
y = final_df["Exited"]

In [16]:
x.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1


In [17]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.8,random_state=10)

In [54]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

In [20]:
print(x_train[0:2],"\n\n",x_test[0:2])

[[ 0.99104014 -0.56734211 -0.58042949  1.63085958 -1.10442556 -1.38800171
  -0.35992997 -1.21559357  0.82307335  0.65309534 -0.99401789  1.62010109]
 [-1.00904087 -0.56734211  1.72286214  0.04489734  0.90544807  2.65961102
  -0.01116062  0.70906699  0.82307335  0.65309534  1.00601811  0.11092199]] 

 [[-1.00125078  1.7194414  -0.57215401  2.06536225 -1.09388997  0.48162043
  -0.00276035  0.83149559 -0.91161489 -1.55196866  0.9614909   1.23683344]
 [-1.00125078 -0.58158423  1.74778116  0.91916214 -1.09388997  0.09893591
   1.03237274 -1.22843833 -0.91161489  0.64434291 -1.04005144  1.16629354]]


In [None]:
with open("standar_scalar.pkl","wb") as file:
    pickle.dump(ss,file)

# Nueral Network

In [106]:
import torch
import torch.nn as nn
from torch import optim

In [159]:
class NNet(nn.Module):

    def __init__(self,input_dim,output_dim,num_hidden_layers, neurons_per_layer, dropout_rate):
        
        super().__init__()

        layers = []

        for i in range(num_hidden_layers):

            layers.append(nn.Linear(input_dim, neurons_per_layer))
            layers.append(nn.BatchNorm1d(neurons_per_layer))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            input_dim = neurons_per_layer

        layers.append(nn.Linear(neurons_per_layer, output_dim))
        layers.append(nn.Sigmoid())
        
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)



In [160]:
y_train  = np.array(y_train).reshape(-1,1)
y_test =   np.array(y_test).reshape(-1,1)

In [161]:
print(x_train.shape,type(x_train))
print(x_test.shape,type(x_test))
print(y_train.shape,type(y_train))
print(y_test.shape,type(y_test))

torch.Size([2000, 12]) <class 'torch.Tensor'>
torch.Size([8000, 12]) <class 'torch.Tensor'>
(2000, 1) <class 'numpy.ndarray'>
(8000, 1) <class 'numpy.ndarray'>


In [162]:
def objective(trial):
    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 5)
    epochs = trial.suggest_int("epochs", 10, 50, step=10)
    neurons_per_layer = trial.suggest_int("neurons_per_layer", 8, 128, step=8)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
    optimizer_name = trial.suggest_categorical("optimizer", ['Adam', 'SGD', 'RMSprop'])
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)

    input_dim = 12
    output_dim = 1

    model = NNet(input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate)
    device = torch.device("cpu")
    model.to(device)

    # Loss function (Binary Cross Entropy)
    criterion = nn.BCELoss()

    # Optimizer selection
    optimizer = {
        'Adam': optim.Adam,
        'SGD': optim.SGD,
        'RMSprop': optim.RMSprop
    }[optimizer_name](model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Convert data to PyTorch tensors
    x_train_torch = torch.tensor(x_train, dtype=torch.float32).to(device)
    y_train_torch = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)
    x_test_torch = torch.tensor(x_test, dtype=torch.float32).to(device)
    y_test_torch = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        outputs = model(x_train_torch)
        loss = criterion(outputs, y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(x_test_torch)
        predictions = (outputs >= 0.5).float()  # Convert to binary predictions (0 or 1)
        correct = (predictions == y_test_torch).sum().item()
        total = y_test_torch.size(0)

    accuracy = correct / total
    return accuracy


In [163]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-03-01 22:55:27,886] A new study created in memory with name: no-name-ee566231-13be-4562-a588-ddb656d0d563
  x_train_torch = torch.tensor(x_train, dtype=torch.float32).to(device)
  x_test_torch = torch.tensor(x_test, dtype=torch.float32).to(device)
[I 2025-03-01 22:55:28,475] Trial 0 finished with value: 0.6875 and parameters: {'num_hidden_layers': 5, 'epochs': 50, 'neurons_per_layer': 128, 'learning_rate': 0.0019303281195462644, 'dropout_rate': 0.5, 'optimizer': 'SGD', 'weight_decay': 3.438467365234484e-05}. Best is trial 0 with value: 0.6875.
[I 2025-03-01 22:55:28,575] Trial 1 finished with value: 0.794625 and parameters: {'num_hidden_layers': 5, 'epochs': 10, 'neurons_per_layer': 104, 'learning_rate': 5.5877026234150534e-05, 'dropout_rate': 0.1, 'optimizer': 'RMSprop', 'weight_decay': 0.0003001907404256757}. Best is trial 1 with value: 0.794625.
[I 2025-03-01 22:55:28,678] Trial 2 finished with value: 0.77575 and parameters: {'num_hidden_layers': 3, 'epochs': 20, 'neurons_pe

In [165]:
# Retrieve the best trial
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

Best trial parameters: {'num_hidden_layers': 2, 'epochs': 50, 'neurons_per_layer': 32, 'learning_rate': 0.005953027573821317, 'dropout_rate': 0.5, 'optimizer': 'RMSprop', 'weight_decay': 0.00012560049321748383}
Best trial accuracy: 0.855375


In [166]:
model = NNet(12,1,2,32,0.5)
criterion = nn.BCELoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.005)

In [167]:
from torch.utils.tensorboard import SummaryWriter
import datetime

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

writer = SummaryWriter(log_dir)

In [168]:
from torch import tensor


x_train = tensor(x_train,dtype=torch.float32)
y_train = tensor(y_train,dtype=torch.float32)
x_test = tensor(x_test,dtype=torch.float32)
y_test = tensor(y_test,dtype=torch.float32)


  x_train = tensor(x_train,dtype=torch.float32)
  x_test = tensor(x_test,dtype=torch.float32)


In [169]:
epochs = 50
patience = 5
best_loss = float('inf')

for epoch in range(epochs):
    optimizer.zero_grad()

    output = model(x_train)
    loss = criterion(output,y_train)

    predictions = output.round()
    accuracy = (predictions == y_train).float().mean().item() * 100

    loss.backward()
    optimizer.step()

    writer.add_scalar("Loss/train", loss.item(), epoch)
    writer.add_scalar("Accuracy/train",accuracy, epoch)

    if loss.item() < best_loss:
        best_loss = loss.item()
        counter = 0
    else:
        counter+=1
    
    if counter >= patience:
        print(f"Early stopping at epoch {epoch+1} with best accuracy {accuracy:.4f}")
        break

    if epoch % 5 == 0:
        print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

writer.close()

Epoch 1: Loss = 0.6898
Epoch 6: Loss = 0.4428
Epoch 11: Loss = 0.4161
Epoch 16: Loss = 0.4074
Epoch 21: Loss = 0.3921
Epoch 26: Loss = 0.3968
Epoch 31: Loss = 0.3861
Epoch 36: Loss = 0.3675
Epoch 41: Loss = 0.3781
Early stopping at epoch 45 with best accuracy 83.7500


In [175]:
y_pred = model(x_test).round()

In [178]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_pred.detach().numpy(),y_test))

0.827


In [179]:
torch.save(model.state_dict(), "model_complete.pth")