Import some of needed libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from collections import Counter

Now we need to import the dataset (The link of the dataset: https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction/data)

In [2]:
original_dataset = pd.read_csv("Churn_Modelling.csv")
original_dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


Drop unnecessary features

In [10]:
features = original_dataset.drop(columns=['RowNumber','CustomerId','Surname','Gender','Exited'])
label = original_dataset['Exited']

Convert the Geography feature into numerical feature

In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
features['Geography'] = label_encoder.fit_transform(features['Geography'])

Check if the dataset is imbalanced

In [12]:
# get the number of unique values in the label column ("Exited")
unique_labels = original_dataset['Exited'].value_counts()

# get the percentages of them
percentages = unique_labels / len(original_dataset) * 100

# print the result out
print(percentages)

Exited
0    79.63
1    20.37
Name: count, dtype: float64


Split the features and labels into training and testing

In [13]:
from sklearn.model_selection import train_test_split

features_train , features_test , label_train , label_test = train_test_split(features,label,test_size=0.2,random_state=42,stratify=label)

Using SMOTE to oversample the minor class

In [23]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
features_train_smote, label_train_smote = smote.fit_resample(features_train, label_train)

Import and train the model 

In [27]:
from sklearn.linear_model import LogisticRegression

# Initialize the model with class weighting
logistic_model = LogisticRegression(max_iter=250,C=0.5, random_state=42)
logistic_model.fit(features_train_smote, label_train_smote)

Make the predictions 

In [28]:
logistic_predictions = logistic_model.predict(features_test)

Evaluate the model

In [29]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(label_test, logistic_predictions))

              precision    recall  f1-score   support

           0       0.89      0.63      0.74      1593
           1       0.32      0.68      0.44       407

    accuracy                           0.64      2000
   macro avg       0.60      0.66      0.59      2000
weighted avg       0.77      0.64      0.68      2000



Trying to fit another model 

In [81]:
from sklearn.ensemble import RandomForestClassifier


RF_model = RandomForestClassifier(500,min_samples_leaf=4,class_weight='balanced',random_state=42, n_jobs= -1,)

# Train the model
RF_model.fit(features_train, label_train)

Predicitons of the Random Forest model

In [82]:
RF_predictions = RF_model.predict(features_test)

Evaluate the Ranodm Forest model while using SMOTE without the hyperparameter (class_weight = 'balance')

In [77]:
print(classification_report(label_test,RF_predictions))

              precision    recall  f1-score   support

           0       0.91      0.84      0.87      1593
           1       0.52      0.66      0.58       407

    accuracy                           0.81      2000
   macro avg       0.71      0.75      0.73      2000
weighted avg       0.83      0.81      0.81      2000



The results of the Random forest model without using SMOTE and with the hyperparamter (class_weight = 'balance')

In [83]:
print(classification_report(label_test,RF_predictions))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1593
           1       0.65      0.59      0.62       407

    accuracy                           0.85      2000
   macro avg       0.77      0.75      0.76      2000
weighted avg       0.85      0.85      0.85      2000

