In [None]:
# **BANK CUSTOMER CHURN MODEL**

: 

## **Import Libraries**

In [None]:
import pandas as pd

: 

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

### **Import dataset**

In [5]:
df = pd.read_csv('https://github.com/YBI-Foundation/Dataset/raw/refs/heads/main/Bank%20Churn%20Modelling.csv')

### **Description of data**

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.columns

## **Data processing**

In [None]:
df.duplicated('CustomerId').sum()

In [10]:
df = df.set_index('CustomerId')

In [None]:
df['Geography'].value_counts()

In [None]:
df.replace({'Geography': {'France':2,'Germany':1,'Spain':0}}, inplace = True)

In [None]:
df['Gender'].value_counts()

In [None]:
df.replace({'Gender': {'Male':0,'Female':1}}, inplace = True)

In [None]:
df['Num Of Products'].value_counts()

In [16]:
df.replace({'Num Of Products':{1:0,2:1,3:1,4:1}},inplace = True)

In [None]:
df.head()

In [None]:
df['Has Credit Card'].value_counts()

In [None]:
df['Is Active Member'].value_counts()

In [None]:
df.loc[(df['Balance']==0), 'Churn'].value_counts()

In [21]:
df['Zero Balance'] = np.where(df['Balance']>0, 1, 0)

In [None]:
df['Zero Balance'].hist()

In [None]:
df.groupby(['Churn', 'Geography']).count()

In [None]:
df.columns

## **Define Target Variable (y) and Feature Variables (X)**

### Normal Modeling

In [25]:
x = df.drop(['Surname','Churn'],axis = 1)

In [26]:
y = df['Churn']

In [None]:
x.shape, y.shape

In [None]:
df['Churn'].value_counts()

In [None]:
sns.countplot(x='Churn', data = df)

In [None]:
x.shape, y.shape

### Random Under Sampling

In [31]:
from imblearn.under_sampling import RandomUnderSampler

In [32]:
rus = RandomUnderSampler(random_state=2529)

In [None]:
x_rus,y_rus = rus.fit_resample(x,y)

In [None]:
x_rus.shape, y_rus.shape,x.shape,y.shape

In [None]:
y.value_counts()

In [None]:
y_rus.value_counts()

In [None]:
y_rus.plot(kind = 'hist')

### Random Over Sampling

In [38]:
from imblearn.over_sampling import RandomOverSampler

In [39]:
ros = RandomOverSampler(random_state=2529)

In [None]:
x_ros,y_ros = ros.fit_resample(x,y)

In [None]:
x_ros.shape, y_ros.shape,x.shape,y.shape

In [None]:
y.value_counts()

In [None]:
y_ros.value_counts()

In [None]:
y_ros.plot(kind = 'hist')

## **Train Test Split**

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 2529)

In [47]:
x_train_rus,x_test_rus,y_train_rus,y_test_rus = train_test_split(x_rus,y_rus,test_size = 0.3, random_state = 2529)

In [48]:
x_train_ros,x_test_ros,y_train_ros,y_test_ros = train_test_split(x_ros,y_ros,test_size = 0.3, random_state = 2529)

In [None]:
x_train.columns

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
sc = StandardScaler()

In [52]:
x_train[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_train[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

In [53]:
x_test[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_test[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

In [54]:
x_train_rus[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_train_rus[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

In [55]:
x_test_rus[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_test_rus[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

In [56]:
x_train_ros[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_train_ros[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

In [57]:
x_test_ros[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']] = sc.fit_transform(x_test_ros[['CreditScore', 'Age', 'Tenure', 'Balance', 'Estimated Salary']])

## **Model Evaluation and Prediction**

In [58]:
from sklearn.svm import SVC

### **Normal model**

In [59]:
svc = SVC()

In [None]:
svc.fit(x_train,y_train)

In [61]:
y_pred = svc.predict(x_test)

In [62]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))

### Hyperparameter tuning(Normal)

In [65]:
from sklearn.model_selection import GridSearchCV

In [66]:
param_grid = {'C' : [0.1,1,10],
              'gamma' : [1,0.1,0.01],
              'kernel' : ['rbf'],
              'class_weight' : ['balanced']}

In [None]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2,cv=2)
grid.fit(x_train,y_train)

In [None]:
print(grid.best_estimator_)

In [69]:
grid_predictions = grid.predict(x_test)

In [None]:
confusion_matrix(y_test,grid_predictions)

In [None]:
print(classification_report(y_test,grid_predictions))

### **Random Under Sampling**

In [None]:
svc.fit(x_train_rus,y_train_rus)

In [73]:
y_pred_rus = svc.predict(x_test_rus)

In [None]:
confusion_matrix(y_test_rus,y_pred_rus)

In [None]:
print(classification_report(y_test_rus,y_pred_rus))

### Hyperparameter Tuning(RUS)

In [None]:
grid_rus = GridSearchCV(SVC(),param_grid,refit=True,verbose=2,cv=2)
grid_rus.fit(x_train_rus,y_train_rus)

In [None]:
print(grid_rus.best_estimator_)

In [78]:
grid_pred_rus = grid_rus.predict(x_test_rus)

In [None]:
confusion_matrix(y_test_rus,grid_pred_rus)

In [None]:
print(classification_report(y_test_rus,grid_pred_rus))

### **Random Over Sampling**

In [None]:
svc.fit(x_train_ros,y_train_ros)

In [82]:
y_pred_ros = svc.predict(x_test_ros)

In [None]:
confusion_matrix(y_test_ros,y_pred_ros)

In [None]:
print(classification_report(y_test_ros,y_pred_ros))

### Hyperparameter Tuning(ROS)

In [None]:
grid_ros = GridSearchCV(SVC(),param_grid,refit=True,verbose=2,cv=2)
grid_ros.fit(x_train_ros,y_train_ros)

In [None]:
print(grid_ros.best_estimator_)

In [87]:
grid_pred_ros = grid_ros.predict(x_test_ros)

In [None]:
confusion_matrix(y_test_ros,grid_pred_ros)

In [None]:
print(classification_report(y_test_ros,grid_pred_ros))

***Summary***


  We can cconclude that for our problem the Random Over Sampling is is giving us the most accurate result here after hyperparameter tuning.It gives us an accuracy of 92% with precision 0.88 and a recall value of 0.97 which is better than the Normal model and Random Under Sampling model.

