# Content


This data set contains details of a bank's customers and the target variable is a binary variable reflecting the fact whether the customer left the bank (closed his account) or he continues to be a customer.



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Churn_Modelling.csv')

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
df.nunique()

RowNumber          10000
CustomerId         10000
Surname             2932
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

In [6]:
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,1.0,0.004202,0.00584,0.000783,-0.006495,-0.009067,0.007246,0.000599,0.012044,-0.005988,-0.016571
CustomerId,0.004202,1.0,0.005308,0.009497,-0.014883,-0.012419,0.016972,-0.014025,0.001665,0.015271,-0.006248
CreditScore,0.00584,0.005308,1.0,-0.003965,0.000842,0.006268,0.012238,-0.005458,0.025651,-0.001384,-0.027094
Age,0.000783,0.009497,-0.003965,1.0,-0.009997,0.028308,-0.03068,-0.011721,0.085472,-0.007201,0.285323
Tenure,-0.006495,-0.014883,0.000842,-0.009997,1.0,-0.012254,0.013444,0.022583,-0.028362,0.007784,-0.014001
Balance,-0.009067,-0.012419,0.006268,0.028308,-0.012254,1.0,-0.30418,-0.014858,-0.010084,0.012797,0.118533
NumOfProducts,0.007246,0.016972,0.012238,-0.03068,0.013444,-0.30418,1.0,0.003183,0.009612,0.014204,-0.04782
HasCrCard,0.000599,-0.014025,-0.005458,-0.011721,0.022583,-0.014858,0.003183,1.0,-0.011866,-0.009933,-0.007138
IsActiveMember,0.012044,0.001665,0.025651,0.085472,-0.028362,-0.010084,0.009612,-0.011866,1.0,-0.011421,-0.156128
EstimatedSalary,-0.005988,0.015271,-0.001384,-0.007201,0.007784,0.012797,0.014204,-0.009933,-0.011421,1.0,0.012097


In [7]:
len(df['Surname'].unique())

2932

In [8]:
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [9]:
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [10]:
Geography = pd.get_dummies(df['Geography'], drop_first = True)

In [11]:
Geography

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,1,0


In [12]:
Gender = pd.get_dummies(df['Gender'], drop_first = True)

In [13]:
Gender

Unnamed: 0,Male
0,0
1,0
2,0
3,0
4,0
...,...
9995,1
9996,1
9997,0
9998,1


In [14]:
df = pd.concat([df,Geography,Gender], axis=1)

In [15]:
df = df.drop(columns = ['RowNumber','CustomerId','Surname','Gender','Geography'], axis = 1)

In [16]:
df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain,Male
0,619,42,2,0.00,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.80,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.00,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,1,0,0,0
9998,772,42,3,75075.31,2,1,0,92888.52,1,1,0,1


# EDA

# Model

In [17]:
X = df.drop(columns = ['Exited'])
y = df['Exited']

In [18]:
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [39]:
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
svc_clf = SVC()
xgb = XGBClassifier()

In [40]:
models = {'LR_CLF' : lr_clf,
          'DT_CLF' : dt_clf,
          'RF_CLF' : rf_clf,
          'SVC_CLF' : svc_clf,
          'XGB' : xgb
    
}

In [42]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    
    print('{} : \nAccuracy of train : {} Accuracy of test : {}'.format(name, accuracy_score(y_train, y_train_pred),
                                                               accuracy_score(y_test,y_test_pred )))
    print('Precision of train : {} Precision of test : {}'.format(precision_score(y_train, y_train_pred),
                                                               precision_score(y_test,y_test_pred )))
    print('Recall of train : {} Recall of test : {}'.format(recall_score(y_train, y_train_pred),
                                                               recall_score(y_test,y_test_pred )))
    print('\n')

LR_CLF : 
Accuracy of train : 0.787375 Accuracy of test : 0.8005
Precision of train : 0.3817427385892116 Precision of test : 0.45161290322580644
Recall of train : 0.05596107055961071 Recall of test : 0.07124681933842239


DT_CLF : 
Accuracy of train : 1.0 Accuracy of test : 0.773
Precision of train : 1.0 Precision of test : 0.4317673378076063
Recall of train : 1.0 Recall of test : 0.4910941475826972


RF_CLF : 
Accuracy of train : 1.0 Accuracy of test : 0.868
Precision of train : 1.0 Precision of test : 0.7698744769874477
Recall of train : 1.0 Recall of test : 0.4681933842239186


SVC_CLF : 
Accuracy of train : 0.7945 Accuracy of test : 0.8035
Precision of train : 0.0 Precision of test : 0.0
Recall of train : 0.0 Recall of test : 0.0




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XGB : 
Accuracy of train : 0.9485 Accuracy of test : 0.858
Precision of train : 0.963855421686747 Precision of test : 0.6953405017921147
Recall of train : 0.7785888077858881 Recall of test : 0.49363867684478374




In [None]:
rf_clf_tune = RandomForestClassifier()