In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# Load the raw dataset; the path is relative, so the CSV must sit in the notebook's working directory.
data = pd.read_csv('Churn_Modelling.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Remove the identifier-style columns (row number, customer id, surname) before modelling.
# Rebinding `data` instead of mutating it in place, together with errors='ignore', keeps this
# cell idempotent: re-running it after the columns are already gone no longer raises KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [4]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [5]:
# Encode the Gender column as integers; the fitted encoder is kept because it is pickled further down.
# Gotcha: the column the encoder is fitted on is overwritten with the encoded values, so re-running
# this cell without reloading `data` would refit the encoder on the already-encoded integers.
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

In [6]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [7]:
type(data['Geography'])   # Series

pandas.core.series.Series

In [8]:
type(data[['Geography']])  # DataFrame

pandas.core.frame.DataFrame

In [9]:
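# Kept for reference only: passing the 1-D Series data['Geography'] does not work here.
# Many scikit-learn transformers (e.g., OneHotEncoder, StandardScaler) expect 2-D input,
# so they need a DataFrame or a 2-D array such as data[['Geography']].
# onehot_encoder = OneHotEncoder()
# geo_encoder = onehot_encoder.fit_transform(data['Geography']).toarray()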

In [10]:
# One-hot encode Geography. The double brackets select a one-column DataFrame (2-D input).
onehot_encoder_geo = OneHotEncoder()
# fit_transform's result is converted to a dense NumPy array with .toarray() so it can be wrapped in a DataFrame later.
geo_encoder = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()
geo_encoder


array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], shape=(10000, 3))

In [11]:
onehot_encoder_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [12]:
# Wrap the dense one-hot array in a DataFrame whose column names come from the fitted encoder.
geo_encoded_df = pd.DataFrame(geo_encoder, columns = onehot_encoder_geo.get_feature_names_out(['Geography']), index = data.index)   # index = data.index to align with original data
geo_encoded_df    

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [13]:
data['Geography']

0        France
1         Spain
2        France
3        France
4         Spain
         ...   
9995     France
9996     France
9997     France
9998    Germany
9999     France
Name: Geography, Length: 10000, dtype: object

In [14]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [15]:
# Swap the raw Geography column for its one-hot columns (column-wise concat on the shared index).
# Gotcha: `data` is rebound here, so re-running this cell on its own fails because
# the Geography column has already been dropped.
data = pd.concat([data.drop(['Geography'], axis = 1), geo_encoded_df], axis = 1)

In [16]:
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [17]:
## Save the fitted encoders so they can be reloaded later
## (the scaler is pickled in a separate cell further down, once it has been fitted)
import pickle
# Gender label encoder
with open('label_encoder_gender.pkl','wb') as file:
    pickle.dump(label_encoder_gender,file)

# Geography one-hot encoder
with open('onehot_encoder_geo.pkl','wb') as file:
    pickle.dump(onehot_encoder_geo,file)


In [18]:
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [19]:
# Separate the model inputs (every column except the target) from the target column `Exited`.
X = data.drop('Exited', axis = 1)
y = data['Exited']

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [21]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((8000, 12), (2000, 12), (8000,), (2000,))

In [22]:
x_train

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
9254,686,1,32,6,0.00,2,1,1,179093.26,1.0,0.0,0.0
1561,632,1,42,4,119624.60,2,1,1,195978.86,0.0,1.0,0.0
1670,559,1,24,3,114739.92,1,1,0,85891.02,0.0,0.0,1.0
6087,561,0,27,9,135637.00,1,1,0,153080.40,1.0,0.0,0.0
6669,517,1,56,9,142147.32,1,0,0,39488.04,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,768,1,54,8,69712.74,1,1,1,69381.05,1.0,0.0,0.0
5191,682,0,58,1,0.00,1,1,1,706.50,1.0,0.0,0.0
5390,735,0,38,1,0.00,3,0,0,92220.12,1.0,0.0,0.0
860,667,1,43,8,190227.46,1,1,0,97508.04,1.0,0.0,0.0


In [23]:
## Scale these features
# The scaler is fitted on the training split only and then reused, unchanged, on the test split.
# Naming: lowercase x_train / x_test are the unscaled DataFrames,
# uppercase X_train / X_test are the scaled NumPy arrays used for training.
scaler=StandardScaler()
X_train=scaler.fit_transform(x_train)
X_test=scaler.transform(x_test)

In [24]:
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]], shape=(8000, 12))

In [25]:
X_test

array([[-0.57749609,  0.91324755, -0.6557859 , ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.29729735,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.52560743, -1.09499335,  0.48508334, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.81311987, -1.09499335,  0.77030065, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.41876609,  0.91324755, -0.94100321, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.24540869,  0.91324755,  0.00972116, ..., -0.99850112,
         1.72572313, -0.57638802]], shape=(2000, 12))

In [26]:
# Persist the fitted scaler so the same scaling can be reapplied later.
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

In [27]:
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


### ANN Implementation

In [28]:
import torch 
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset


In [29]:
class ChurnDataset(Dataset):
    """Draft dataset holding float32 features and float32 targets of shape (n_samples, 1).

    NOTE: a later cell defines another class with the same name; once that cell
    has run, it replaces this definition.

    Args:
        features: 2-D array-like of numeric features, one row per sample.
        labels: pandas Series (anything exposing ``.values``) with one label per sample.
    """

    def __init__(self, features, labels):
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels.values, dtype=torch.float32).unsqueeze(1)  # Ensure y is of shape (n_samples, 1)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]



In [30]:
class ChurnDataset(Dataset):
    """In-memory dataset pairing a float32 feature matrix with integer class labels.

    Args:
        features: 2-D array-like of numeric features, one row per sample.
        labels: 1-D array-like of integer labels (pandas Series, NumPy array,
            list, ...), one per sample.

    Raises:
        ValueError: if ``features`` and ``labels`` contain a different number of samples.
    """

    def __init__(self,features,labels):
        self.features = torch.tensor(features,dtype=torch.float32)
        # np.asarray accepts a pandas Series as well as plain arrays and lists,
        # so callers are no longer required to pass an object with `.values`.
        self.labels = torch.tensor(np.asarray(labels),dtype=torch.long)
        # Fail early: a length mismatch would otherwise only surface later as an
        # IndexError, or as silently ignored feature rows.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self,idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [31]:
# Wrap the scaled feature arrays and their label Series as datasets.
train_dataset = ChurnDataset(X_train, y_train)
test_dataset = ChurnDataset(X_test, y_test)

In [32]:
# Mini-batches of 32 samples; only the training data is shuffled, the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [33]:
class Model(nn.Module):
    """Fully connected binary classifier: num_features -> 64 -> 32 -> 1.

    ReLU follows each hidden layer and a Sigmoid follows the output layer, so
    the forward pass returns one value in (0, 1) per input row.
    """

    def __init__(self , num_features):
        super().__init__()
        # Layers are created in the same order as they are applied, and stored
        # under `network`, so state_dict keys stay network.0 / network.2 / network.4.
        layers = []
        in_dim = num_features
        for out_dim in (64, 32):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(nn.ReLU())
            in_dim = out_dim
        layers.append(nn.Linear(in_dim, 1))
        layers.append(nn.Sigmoid())
        self.network = nn.Sequential(*layers)

    def forward(self, features):
        """Run `features` through the stack and return the sigmoid outputs."""
        probabilities = self.network(features)
        return probabilities

In [34]:
x_train.shape

(8000, 12)

In [35]:
# Seed PyTorch's global RNG before the model is built so that the weight initialisation,
# and the shuffled batch order drawn afterwards by the training loader, are repeatable
# after a kernel restart. The value matches the random_state used for the train/test split.
torch.manual_seed(42)
# The input width is taken from the scaled training matrix (one input unit per feature column).
model = Model(num_features=X_train.shape[1])

In [36]:
# Binary cross-entropy computed on probabilities; the model already ends in a Sigmoid layer.
loss = nn.BCELoss()
# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Number of full passes over the training loader.
epochs = 100

In [37]:
# Training loop: one pass over train_loader per epoch, printing the per-sample average loss.
for epoch in range(epochs):
    model.train()
    # Sum of per-sample losses seen so far in this epoch.
    running_loss = 0.0
    for features, labels in train_loader:
        outputs = model(features)
        # Labels are stored as integer tensors of shape (batch,); BCELoss needs float
        # targets with the same shape as the outputs, i.e. (batch, 1).
        loss_value = loss(outputs, labels.float().unsqueeze(1) )
        # Clear gradients left over from the previous step before backpropagating.
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()
        # loss_value is the batch mean, so weight it by the batch size to accumulate a sum.
        running_loss += loss_value.item() * features.size(0)
    # Dividing the accumulated sum by the dataset size gives the mean loss per sample.
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/100], Loss: 0.4082
Epoch [2/100], Loss: 0.3574
Epoch [3/100], Loss: 0.3474
Epoch [4/100], Loss: 0.3462
Epoch [5/100], Loss: 0.3426
Epoch [6/100], Loss: 0.3380
Epoch [7/100], Loss: 0.3376
Epoch [8/100], Loss: 0.3348
Epoch [9/100], Loss: 0.3336
Epoch [10/100], Loss: 0.3307
Epoch [11/100], Loss: 0.3311
Epoch [12/100], Loss: 0.3277
Epoch [13/100], Loss: 0.3256
Epoch [14/100], Loss: 0.3245
Epoch [15/100], Loss: 0.3226
Epoch [16/100], Loss: 0.3211
Epoch [17/100], Loss: 0.3201
Epoch [18/100], Loss: 0.3195
Epoch [19/100], Loss: 0.3156
Epoch [20/100], Loss: 0.3163
Epoch [21/100], Loss: 0.3137
Epoch [22/100], Loss: 0.3113
Epoch [23/100], Loss: 0.3109
Epoch [24/100], Loss: 0.3065
Epoch [25/100], Loss: 0.3067
Epoch [26/100], Loss: 0.3046
Epoch [27/100], Loss: 0.3009
Epoch [28/100], Loss: 0.3003
Epoch [29/100], Loss: 0.2979
Epoch [30/100], Loss: 0.2964
Epoch [31/100], Loss: 0.2958
Epoch [32/100], Loss: 0.2919
Epoch [33/100], Loss: 0.2910
Epoch [34/100], Loss: 0.2878
Epoch [35/100], Loss: 0

## Notes: How Loss is Calculated Per Epoch in PyTorch

### 1) Loss Function Output

Most PyTorch loss functions (like `MSELoss`, `BCELoss`, `CrossEntropyLoss`) return the **mean loss over the batch** by default.

If a batch contains $B$ samples and the per-sample losses are $L_1, L_2, \dots, L_B$:

$$
\text{Batch Loss} = \frac{1}{B} \sum_{i=1}^{B} L_i
$$

Or, for batch $k$ with size $B_k$:

$$
L_k = \frac{1}{B_k} \sum_{i=1}^{B_k} L_{k,i}
$$

In code:

```python
loss_value = loss(outputs, labels)
````

---

### 2) Why Multiply Loss by Batch Size?

Since `loss_value` is **already averaged**, to accumulate the correct total loss across the epoch, multiply by the batch size:

$$
L_\text{batch}^{\text{total}} = L_\text{batch}^{\text{mean}} \times B
$$

In PyTorch:

```python
running_loss += loss_value.item() * features.size(0)
```

Here, `features.size(0)` is the number of samples in the current batch.

---

### 3) Final Epoch Loss Calculation

After all batches are processed:

* Total dataset size = $N$
* Total accumulated loss:

$$
L_\text{total} = \sum_{i=1}^{N} L_i
$$

* Epoch average loss:

$$
L_\text{epoch} = \frac{L_\text{total}}{N}
$$

In code:

```python
epoch_loss = running_loss / len(train_loader.dataset)
```

---

### 4) Full Mathematical Justification

If the dataset is divided into $K$ batches of sizes $B_1, B_2, \dots, B_K$:

1. Loss returned by PyTorch for batch $k$:

$$
L_k = \frac{1}{B_k} \sum_{i=1}^{B_k} L_{k,i}
$$

2. Compute **total loss across all batches**:

$$
L_\text{total} = \sum_{k=1}^{K} L_k \times B_k
$$

3. Final **epoch average loss**:

$$
L_\text{epoch} = \frac{L_\text{total}}{\sum_{k=1}^{K} B_k}
$$

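Since:

$$
\sum_{k=1}^{K} B_k = N \quad \text{(total dataset size)}
$$

the epoch loss is exactly the mean per-sample loss over the whole dataset, even when the last batch is smaller than the others:

$$
L_\text{epoch} = \frac{1}{N} \sum_{k=1}^{K} \sum_{i=1}^{B_k} L_{k,i}
$$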



## Testing


In [38]:
x_test

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
6252,596,1,32,3,96709.07,2,0,0,41788.37,0.0,1.0,0.0
4684,623,1,43,1,0.00,2,1,1,146379.30,1.0,0.0,0.0
1731,601,0,44,4,0.00,2,1,0,58561.31,0.0,0.0,1.0
4742,506,1,59,8,119152.10,2,1,1,170679.74,0.0,1.0,0.0
4521,560,0,27,7,124995.98,1,1,1,114669.79,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6412,602,0,53,5,98268.84,1,0,1,45038.29,0.0,1.0,0.0
8285,609,1,25,10,0.00,1,0,1,109895.16,1.0,0.0,0.0
7853,730,0,47,7,0.00,1,1,0,33373.26,1.0,0.0,0.0
1095,692,1,29,4,0.00,1,1,0,76755.99,1.0,0.0,0.0


In [72]:
# Evaluate the model: accuracy on the test loader with outputs thresholded at 0.5
model.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for features, labels in test_loader:
        outputs = model(features)
        # Outputs above 0.5 count as class 1, everything else as class 0.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        # Drop the trailing column dimension so predictions line up with the (batch,) labels.
        matches = predicted.squeeze(1).eq(labels.float())
        correct += matches.sum().item()
accuracy = 100 * correct / total
print(f'Accuracy of the model on the test set: {accuracy:.2f}%')

Accuracy of the model on the test set: 84.10%


In [None]:
# Scratch example: comparing a tensor with a scalar gives an element-wise boolean mask,
# the same operation the evaluation cell uses to threshold the model outputs at 0.5.
a =  torch.tensor([[0.5, -1.2, 0.3, 0.8, -0.5, 1.0, 0.0, 0.2, -0.1, 0.4]], dtype=torch.float32)
a>0.5

In [70]:
a.squeeze()

tensor([ 0.5000, -1.2000,  0.3000,  0.8000, -0.5000,  1.0000,  0.0000,  0.2000,
        -0.1000,  0.4000])

In [73]:
# Save only the learned weights and biases (the state_dict), not the Model class itself.
# To reload, build Model with the same num_features and then load this file into it
# with load_state_dict.
torch.save(model.state_dict(), "model.pth")


In [77]:
onehot_encoder_geo

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [80]:
onehot_encoder_geo.categories_[0]

array(['France', 'Germany', 'Spain'], dtype=object)

In [81]:
label_encoder_gender

In [82]:
label_encoder_gender.classes_

array(['Female', 'Male'], dtype=object)

In [87]:
# Scratch example: a one-row DataFrame built from a dict that maps each column name
# to a single-element list.
pd.DataFrame({
    "A":[1],
    "b":[2],
    "c":[3],
    "d":[4]
})

Unnamed: 0,A,b,c,d
0,1,2,3,4
