In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("raw_data.csv")

In [3]:
df = data.copy()

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


**Quick insights**<br>

1. Data do not contain any missing value.
2. Data do not contain any duplicate rows.
3. Data contains a unnecessary feature that is RowNumber.
4. Data contains few feaures that has not any significance on Prediction model that are CustomerName, Surname.
5. In our data, Gender and Geography are of object datatype that must be changed for further modelling.
6. CreditScore, Age contains outliers or noise.
7. EstimatedSalary contain unusual or unrealistic information.
8. There is no feature that can show direct relation with dependent feature.

In [5]:
## Remove outliers
noised_features = ["CreditScore","Age"]

In [6]:
def remove_outliers(df, columns, threshold=1.5):


    for col in columns:
        Q1 = df[col].quantile(0.25)  # First quartile
        Q3 = df[col].quantile(0.75)  # Third quartile
        IQR = Q3 - Q1  # Interquartile range

        # Define lower and upper bounds
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Filter the dataset to remove outliers
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    return df

In [7]:
df = remove_outliers(df, noised_features)

In [8]:
## Dropping unnecessary and unusual features
df.drop(["RowNumber","CustomerId","Surname","EstimatedSalary"],axis = 1,inplace= True)

In [9]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,619,France,Female,42,2,0.0,1,1,1,1
1,608,Spain,Female,41,1,83807.86,1,0,1,0
2,502,France,Female,42,8,159660.8,3,1,0,1
3,699,France,Female,39,1,0.0,2,0,0,0
4,850,Spain,Female,43,2,125510.82,1,1,1,0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [11]:
X = df.drop("Exited",axis = 1)
y = df["Exited"]

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state=42)

In [16]:
onehot_encoder = OneHotEncoder(drop = "first",sparse_output=False)

In [17]:
# Reset indexes before encoding
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Encoding
encoded_X_train = onehot_encoder.fit_transform(X_train[["Geography", "Gender"]])
encoded_X_test = onehot_encoder.transform(X_test[["Geography", "Gender"]])

encoded_X_train = pd.DataFrame(encoded_X_train, columns=onehot_encoder.get_feature_names_out(["Geography", "Gender"]))
encoded_X_test = pd.DataFrame(encoded_X_test, columns=onehot_encoder.get_feature_names_out(["Geography", "Gender"]))

# Reset indexes for encoded data
encoded_X_train = encoded_X_train.reset_index(drop=True)
encoded_X_test = encoded_X_test.reset_index(drop=True)

# Concatenation
x_train = pd.concat([X_train.drop(["Geography", "Gender"], axis=1).reset_index(drop=True), encoded_X_train], axis=1)
x_test = pd.concat([X_test.drop(["Geography", "Gender"], axis=1).reset_index(drop=True), encoded_X_test], axis=1)

In [20]:
x_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Geography_Germany,Geography_Spain,Gender_Male
0,642,41,7,115171.71,1,1,1,0.0,0.0,0.0
1,794,22,4,114440.24,1,1,1,0.0,1.0,0.0
2,657,37,0,163607.18,1,0,1,0.0,1.0,0.0
3,584,29,7,105204.01,1,0,1,1.0,0.0,0.0
4,803,26,4,0.0,2,1,1,0.0,0.0,0.0


In [21]:
## Applying standardization
scaler = StandardScaler()

In [29]:
x_train = scaler.fit_transform(x_train)

In [31]:
x_train=

array([[-0.08740826,  0.36617931,  0.68809371, ..., -0.5840152 ,
        -0.5605516 , -1.09158619],
       [ 1.49519409, -1.78859013, -0.35400737, ..., -0.5840152 ,
         1.78395709, -1.09158619],
       [ 0.06876961, -0.08745636, -1.74347549, ..., -0.5840152 ,
         1.78395709, -1.09158619],
       ...,
       [ 0.55812691,  0.70640607,  1.38282777, ..., -0.5840152 ,
         1.78395709, -1.09158619],
       [ 1.43272295, -0.08745636,  1.03546074, ..., -0.5840152 ,
         1.78395709,  0.91609807],
       [-0.99323987, -0.9947277 , -0.35400737, ..., -0.5840152 ,
        -0.5605516 ,  0.91609807]])

In [32]:
x_test = scaler.transform(x_test)

In [27]:
import joblib

# Save the OneHotEncoder
joblib.dump(onehot_encoder, "onehot_encoder.pkl")

['onehot_encoder.pkl']

In [33]:
## Save the StandardScaler
joblib.dump(scaler,"scaler.pkl")

['scaler.pkl']

**Model Training**

In [35]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout

from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

In [42]:
  model = Sequential([
      Dense(64, activation='relu', input_dim=10),
      Dropout(0.3),
      Dense(32, activation='relu'),
      Dropout(0.3),
      Dense(1, activation='sigmoid')
      ])

In [43]:
model.summary()

In [44]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [51]:
 ann_model = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test), verbose=1)


Epoch 1/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8649 - loss: 0.3286 - val_accuracy: 0.8504 - val_loss: 0.3478
Epoch 2/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8674 - loss: 0.3313 - val_accuracy: 0.8550 - val_loss: 0.3470
Epoch 3/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8630 - loss: 0.3317 - val_accuracy: 0.8513 - val_loss: 0.3466
Epoch 4/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8637 - loss: 0.3361 - val_accuracy: 0.8521 - val_loss: 0.3468
Epoch 5/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8663 - loss: 0.3351 - val_accuracy: 0.8542 - val_loss: 0.3441
Epoch 6/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8710 - loss: 0.3232 - val_accuracy: 0.8533 - val_loss: 0.3455
Epoch 7/100
[1m226/22

In [52]:
y_pred = model.predict(x_test).astype(int)
acc = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1888
           1       1.00      0.02      0.05       519

    accuracy                           0.79      2407
   macro avg       0.89      0.51      0.46      2407
weighted avg       0.83      0.79      0.70      2407



In [53]:
print(accuracy_score(y_test,y_pred))

0.7893643539675945
