## IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import torch
from torch import nn
from torch.optim import Adam
from torch import tensor
from ignite.contrib.metrics import *
from sklearn.model_selection import train_test_split
from torchmetrics import *


## DATA PREPROCESSING

In [2]:
raw_df = pd.read_csv('train.csv')

In [3]:
test_df = pd.read_csv('test.csv')

In [4]:
sample_sub_df = pd.read_csv('sample_submission.csv')

In [5]:
raw_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [6]:
test_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [7]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


NO IMPUTATION REQUIRED

In [8]:
raw_df.nunique()

id                 165034
CustomerId          23221
Surname              2797
CreditScore           457
Geography               3
Gender                  2
Age                    71
Tenure                 11
Balance             30075
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55298
Exited                  2
dtype: int64

In [9]:
dropped_cols = ['id' , 'CustomerId' , 'Surname']

In [10]:
raw_df.drop(columns=dropped_cols, inplace=True)
test_df.drop(columns=dropped_cols, inplace=True)

In [11]:
raw_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### ONE HOT ENCODING

In [12]:
categorical_cols = [x for x in raw_df.columns if raw_df[x].dtype == 'object']

In [13]:
categorical_cols

['Geography', 'Gender']

In [14]:
encoder = OneHotEncoder(sparse=False)

In [15]:
encoder.fit(raw_df[categorical_cols])



In [16]:
encoded_cols = encoder.get_feature_names_out()
encoded_cols

array(['Geography_France', 'Geography_Germany', 'Geography_Spain',
       'Gender_Female', 'Gender_Male'], dtype=object)

In [17]:
raw_df[encoded_cols] = encoder.transform(raw_df[categorical_cols])
test_df[encoded_cols] = encoder.transform(test_df[categorical_cols])

In [18]:
raw_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0,1.0,0.0,0.0,0.0,1.0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0,1.0,0.0,0.0,0.0,1.0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0,1.0,0.0,0.0,0.0,1.0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1.0,0.0,0.0,0.0,1.0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0,0.0,0.0,1.0,0.0,1.0


In [19]:
test_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75,1.0,0.0,0.0,1.0,0.0
1,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27,1.0,0.0,0.0,1.0,0.0
2,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09,1.0,0.0,0.0,1.0,0.0
3,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57,1.0,0.0,0.0,0.0,1.0
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0,0.0,1.0,0.0,0.0,1.0


In [20]:
raw_df.drop(columns=categorical_cols, inplace=True)
test_df.drop(columns=categorical_cols, inplace=True)

In [21]:
# checking for imbalanced dataset
raw_df['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

NOT IMBALANCED

In [22]:
input_df = raw_df.drop(columns='Exited')
target_df = raw_df['Exited']

In [23]:
input_df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,668,33.0,3,0.00,2,1.0,0.0,181449.97,1.0,0.0,0.0,0.0,1.0
1,627,33.0,1,0.00,2,1.0,1.0,49503.50,1.0,0.0,0.0,0.0,1.0
2,678,40.0,10,0.00,2,1.0,0.0,184866.69,1.0,0.0,0.0,0.0,1.0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,1.0,0.0,0.0,0.0,1.0
4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,667,33.0,2,0.00,1,1.0,1.0,131834.75,0.0,0.0,1.0,1.0,0.0
165030,792,35.0,3,0.00,1,0.0,0.0,131834.45,1.0,0.0,0.0,0.0,1.0
165031,565,31.0,5,0.00,1,1.0,1.0,127429.56,1.0,0.0,0.0,0.0,1.0
165032,554,30.0,7,161533.00,1,0.0,1.0,71173.03,0.0,0.0,1.0,1.0,0.0


### MIN MAX SCALER

In [24]:
scaler = MinMaxScaler()

In [25]:
input_df = scaler.fit_transform(input_df)



In [26]:
input_df

array([[0.636     , 0.2027027 , 0.3       , ..., 0.        , 0.        ,
        1.        ],
       [0.554     , 0.2027027 , 0.1       , ..., 0.        , 0.        ,
        1.        ],
       [0.656     , 0.2972973 , 1.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.43      , 0.17567568, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       [0.408     , 0.16216216, 0.7       , ..., 1.        , 1.        ,
        0.        ],
       [1.        , 0.17567568, 0.1       , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
ones_input_df = raw_df[raw_df['Exited'] == 1]
zeroes_input_df = raw_df[raw_df['Exited'] == 0]
ones_input_df.drop(columns=['Exited'], inplace=True)
zeroes_input_df.drop(columns=['Exited'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ones_input_df.drop(columns=['Exited'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zeroes_input_df.drop(columns=['Exited'], inplace=True)


In [28]:
target_df

0         0
1         0
2         0
3         0
4         0
         ..
165029    0
165030    0
165031    0
165032    0
165033    1
Name: Exited, Length: 165034, dtype: int64

## RANDOM FOREST CLASSIFIER

In [29]:
rfc = RandomForestClassifier(n_estimators=1000, max_depth=15, n_jobs=-1, random_state=42, class_weight='balanced')

In [30]:
rfc.fit(input_df, target_df)

In [31]:
rfc_ones_preds = rfc.predict(ones_input_df)
rfc_zeroes_preds = rfc.predict(zeroes_input_df)
rfc_preds = rfc.predict(input_df)



In [32]:
print(accuracy_score(rfc_ones_preds ,[1]*rfc_ones_preds.shape[0]))
print(accuracy_score(rfc_zeroes_preds ,[0]*rfc_zeroes_preds.shape[0]))
print(accuracy_score(rfc_preds ,target_df))

0.6477191374817445
0.5013180850491495
0.8981119042136771


In [33]:

print(roc_auc_score(rfc_preds ,target_df))

0.8381486984554902


In [34]:
rfc_preds = rfc.predict(test_df)



In [35]:
sample_sub_df['Exited'] = rfc_preds

In [36]:
sample_sub_df

Unnamed: 0,id,Exited
0,165034,1
1,165035,1
2,165036,1
3,165037,1
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,1
110021,275055,1


In [37]:
rfc_fp = sample_sub_df.to_csv('rfc.csv',index=False)

## NEURAL NETWORK

In [38]:
X_tensor = torch.tensor(input_df.astype('float32'),requires_grad=True)
Y_tensor = torch.tensor(target_df.astype('float32'), requires_grad=True)

In [39]:
X_tensor

tensor([[0.6360, 0.2027, 0.3000,  ..., 0.0000, 0.0000, 1.0000],
        [0.5540, 0.2027, 0.1000,  ..., 0.0000, 0.0000, 1.0000],
        [0.6560, 0.2973, 1.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.4300, 0.1757, 0.5000,  ..., 0.0000, 0.0000, 1.0000],
        [0.4080, 0.1622, 0.7000,  ..., 1.0000, 1.0000, 0.0000],
        [1.0000, 0.1757, 0.1000,  ..., 0.0000, 0.0000, 1.0000]],
       requires_grad=True)

In [40]:
Y_tensor

tensor([0., 0., 0.,  ..., 0., 0., 1.], requires_grad=True)

In [41]:
print(type(X_tensor))
print(type(Y_tensor))


<class 'torch.Tensor'>
<class 'torch.Tensor'>


### Train Test Split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_tensor, Y_tensor, test_size=0.15, random_state=42)
len(X_train), len(y_train), len(X_test), len(y_test)

(140278, 140278, 24756, 24756)

### NN Architecture

In [43]:
class Model(nn.Module):

    def __init__(self, inp, h1, h2, h3, out):
        super(Model, self).__init__()

        self.linear1 = nn.Linear(inp, h1)
        self.Relu1 = nn.ReLU()
        self.linear2 = nn.Linear(h1, h2)
        self.Relu2 = nn.ReLU()
        self.linear3 = nn.Linear(h2, h3)
        self.Relu3 = nn.ReLU()
        self.linear4 = nn.Linear(h3,out)

    def forward(self, x):

        x = self.linear1(x)
        x = self.Relu1(x)
        x = self.linear2(x)
        x = self.Relu2(x)    
        x = self.linear3(x)
        x = self.Relu3(x)    
        x = self.linear4(x)
        
        return x

In [44]:
X_train.shape

torch.Size([140278, 13])

In [45]:
model = Model(13,32,64,20,1)

In [46]:
model.named_parameters

<bound method Module.named_parameters of Model(
  (linear1): Linear(in_features=13, out_features=32, bias=True)
  (Relu1): ReLU()
  (linear2): Linear(in_features=32, out_features=64, bias=True)
  (Relu2): ReLU()
  (linear3): Linear(in_features=64, out_features=20, bias=True)
  (Relu3): ReLU()
  (linear4): Linear(in_features=20, out_features=1, bias=True)
)>

In [47]:
loss_fxn = nn.BCEWithLogitsLoss()
optimizer = Adam(params=model.parameters(), lr=0.002)

In [48]:
loss_fxn

BCEWithLogitsLoss()

In [49]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.002
    maximize: False
    weight_decay: 0
)

### Accuracy Function

In [50]:
auc = AUROC(task="binary")

In [51]:
def accuracy_fxn(y_pred, y_true):
    s = auc(y_true, y_pred)*100
    return s


### Training and Testing NN

In [52]:
y_train = y_train.reshape((len(X_train),1))
y_test = y_test.reshape((len(X_test),1))

In [53]:
y_train.shape

torch.Size([140278, 1])

In [54]:
epochs = 101

In [68]:
for epoch in range(epochs):

    # TRAINING PHASE
    model.train()

    # forward pass
    yout = model(X_train)
    y_preds = torch.round(torch.sigmoid(yout))
    
    # Calculate loss
    tloss = loss_fxn(yout, y_train)
    acc = accuracy_fxn(y_preds, y_train)

    #optimizer zero grad
    optimizer.zero_grad()

    # differentiation
    tloss.backward(retain_graph=True)

    #optimize
    optimizer.step()

    
    # TESTING
    model.eval()

    with torch.inference_mode():

        # forward pass
        test_out = model(X_test)
        test_preds = torch.round(torch.sigmoid(test_out))

        # Calculate loss
        test_loss = loss_fxn(test_preds, y_test)
        test_acc = accuracy_fxn(test_preds, y_test)

    if epoch % 10 == 0:
        print(f'Epoch: {epoch} | Loss: {tloss:.5f}, Acc: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%')    


Epoch: 0 | Loss: 0.40357, Acc: 78.37% | Test loss: 0.68248, Test acc: 78.68%
Epoch: 10 | Loss: 0.39465, Acc: 77.96% | Test loss: 0.68346, Test acc: 77.81%
Epoch: 20 | Loss: 0.39036, Acc: 77.91% | Test loss: 0.68318, Test acc: 77.94%
Epoch: 30 | Loss: 0.38698, Acc: 78.12% | Test loss: 0.68238, Test acc: 78.30%
Epoch: 40 | Loss: 0.38369, Acc: 78.29% | Test loss: 0.68236, Test acc: 78.31%
Epoch: 50 | Loss: 0.37962, Acc: 78.54% | Test loss: 0.68164, Test acc: 78.62%
Epoch: 60 | Loss: 0.37428, Acc: 78.86% | Test loss: 0.68099, Test acc: 78.89%
Epoch: 70 | Loss: 0.36817, Acc: 78.88% | Test loss: 0.68011, Test acc: 79.32%
Epoch: 80 | Loss: 0.36261, Acc: 78.91% | Test loss: 0.68042, Test acc: 79.10%
Epoch: 90 | Loss: 0.35779, Acc: 79.58% | Test loss: 0.68044, Test acc: 79.07%
Epoch: 100 | Loss: 0.35353, Acc: 79.66% | Test loss: 0.68002, Test acc: 79.24%


In [56]:
y_test

tensor([[0.],
        [0.],
        [1.],
        ...,
        [0.],
        [1.],
        [0.]], grad_fn=<ViewBackward0>)

In [57]:
for name, param in model.named_parameters():
    print("param {}, grad {}".format(name, param.grad))

param linear1.weight, grad tensor([[-1.5739e-04,  1.1613e-03, -3.8501e-05, -1.3129e-04, -3.2124e-05,
          2.3146e-04,  6.0457e-04, -9.1587e-04, -6.5405e-04, -1.1288e-04,
          2.6670e-04, -3.4871e-04, -1.5151e-04],
        [-2.2953e-04, -3.4695e-04, -2.1077e-04,  3.4326e-05,  1.5299e-04,
         -4.8910e-04, -2.2472e-04, -1.3664e-04, -1.6523e-04, -8.0546e-05,
         -1.4214e-04,  2.4578e-05, -4.1250e-04],
        [-4.4174e-04,  1.1190e-03, -2.0528e-04, -2.8062e-04, -1.1066e-04,
          1.2798e-04,  6.4152e-04, -1.2897e-03, -9.9198e-04, -2.6442e-04,
          2.4959e-04, -6.8796e-04, -3.1885e-04],
        [-6.0969e-04,  8.1350e-04, -5.0541e-04, -7.2916e-05,  2.0044e-04,
         -6.5689e-04,  1.4810e-04, -1.2892e-03, -1.1228e-03, -9.8725e-05,
         -1.0239e-06, -6.4960e-04, -5.7298e-04],
        [-1.7969e-03,  2.3948e-03, -9.9559e-04, -1.3803e-03, -8.4366e-04,
         -8.0386e-04,  1.1609e-03, -3.8216e-03, -1.9497e-03, -2.2044e-03,
          6.3419e-04, -1.8774e-03, -1

### Test Predictions

In [58]:
test_df = scaler.fit_transform(test_df)

In [59]:
test_df

array([[0.472     , 0.06756757, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       [0.666     , 0.37837838, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       [0.612     , 0.21621622, 0.7       , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.724     , 0.17567568, 0.2       , ..., 0.        , 0.        ,
        1.        ],
       [0.718     , 0.18918919, 0.3       , ..., 0.        , 1.        ,
        0.        ],
       [0.542     , 0.25675676, 0.7       , ..., 0.        , 1.        ,
        0.        ]])

In [60]:
test_tensor = torch.tensor(test_df.astype('float32'), requires_grad=True)

In [61]:
test_tensor

tensor([[0.4720, 0.0676, 0.2000,  ..., 0.0000, 1.0000, 0.0000],
        [0.6660, 0.3784, 0.2000,  ..., 0.0000, 1.0000, 0.0000],
        [0.6120, 0.2162, 0.7000,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [0.7240, 0.1757, 0.2000,  ..., 0.0000, 0.0000, 1.0000],
        [0.7180, 0.1892, 0.3000,  ..., 0.0000, 1.0000, 0.0000],
        [0.5420, 0.2568, 0.7000,  ..., 0.0000, 1.0000, 0.0000]],
       requires_grad=True)

In [62]:
# PREDICTIONS
model.eval()

with torch.inference_mode():
    test_logits = model(test_tensor)
    test_p = torch.round(torch.sigmoid(test_logits))

In [63]:
test_logits

tensor([[-2.7277],
        [ 0.1842],
        [-1.5249],
        ...,
        [-2.5361],
        [-1.6572],
        [-0.8052]])

In [64]:
test_p

tensor([[0.],
        [1.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [65]:
sample_sub_df['Exited'] = test_p

In [66]:
sample_sub_df

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,1.0
2,165036,0.0
3,165037,0.0
4,165038,0.0
...,...,...
110018,275052,0.0
110019,275053,0.0
110020,275054,0.0
110021,275055,0.0


In [67]:
nn_csv = sample_sub_df.to_csv('nn.csv', index=False)