dataset = https://www.kaggle.com/datasets/shubhammeshram579/bank-customer-churn-prediction

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
df = pd.read_csv('Churn_Modelling.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.00,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.80,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.00,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997,9998,15584532,Liu,709,France,Female,36.0,7,0.00,1,0.0,1.0,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
9999,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
10000,10000,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [54]:
df.dropna(inplace=True)

In [55]:
for i in df.columns:
  if df[i].dtypes == object:
    print(i,df[i].unique())

Surname ['Hargrave' 'Hill' 'Onio' ... 'Kashiwagi' 'Aldridge' 'Burbidge']
Geography ['France' 'Spain' 'Germany']
Gender ['Female' 'Male']


In [56]:
df['Gender'] = df['Gender'].map({'Male':1,'Female':0})
df['Geography'] = df['Geography'].map({'France':0,'Spain':1,'Germany':2})

In [57]:
df.drop(['RowNumber','CustomerId','Surname'],axis=1,inplace=True)

In [58]:
df['Exited'].value_counts()

Exited
0    7960
1    2038
Name: count, dtype: int64

In [59]:
from imblearn.over_sampling import SMOTE

x = df.drop('Exited', axis = 1)
y = df['Exited']

smote = SMOTE(random_state = 42)

x_resampled, y_resampled = smote.fit_resample(x,y)

df = pd.concat([x_resampled , y_resampled], axis = 1)

df['Exited'].value_counts()

Exited
1    7960
0    7960
Name: count, dtype: int64

In [60]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = df.drop('Exited',axis=1)
y = df['Exited']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_features=5, max_depth=5, random_state=42)

model.fit(train_X, train_y)

pred_y = model.predict(test_X)

accuracy = accuracy_score(test_y, pred_y)
print("Gradient Boosting Classifier accuracy is : {:.4f}".format(accuracy))
print("\nClassification Report:")
print(classification_report(test_y, pred_y))

Gradient Boosting Classifier accuracy is : 0.8675

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1623
           1       0.89      0.83      0.86      1561

    accuracy                           0.87      3184
   macro avg       0.87      0.87      0.87      3184
weighted avg       0.87      0.87      0.87      3184



In [61]:
import pickle as pkl
pkl.dump(model, open('model.pkl','wb'))

In [62]:
X.to_csv('X.csv',index=False)

In [63]:
x.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'],
      dtype='object')