# Customer Churn Prediction

## Importing Required Libraries

In [24]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

## Reading the dataset

In [2]:
# Load the raw churn dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Removing the Columns not required for the prediction

In [3]:
# RowNumber and CustomerId are row identifiers, not features, so drop them.
# NOTE: this rebinds `df`, so re-running the cell alone raises KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId'])
df.head(n=5)

Unnamed: 0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Surname is free text and is not used for the prediction, so drop it as well.
df = df.drop(columns=['Surname'])
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Checking for NULL Values

In [5]:
# Count missing values per column (isna is the canonical name for isnull).
null_counts = df.isna().sum()
null_counts

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

## Handling Categorical Data

In [6]:
# List the distinct country labels before encoding the column.
geography_levels = df['Geography'].unique()
geography_levels

array(['France', 'Spain', 'Germany'], dtype=object)

### Encoding categorical data

In [7]:
# One-hot encode the two categorical columns. drop_first=True removes the first
# level of each, leaving Geography_Germany, Geography_Spain and Gender_Male.
# dtype=int pins the dummies to 0/1 integers: pandas >= 2.0 would otherwise
# emit True/False booleans, so the preview would depend on the pandas version.
df = pd.get_dummies(df, columns=['Geography','Gender'], drop_first=True, dtype=int)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


## Separating dependent and independent variables

In [8]:
# The dependent variable is the `Exited` column; every other column is a feature.
y = df['Exited']
X = df.drop(columns=['Exited'])

## Splitting the data into train and test set

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Scaling and Normalizing the data

In [10]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both splits so the test set does not influence the scaling.
scaler = MinMaxScaler().fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Model Building and Prediction

# Logistic Regression

In [11]:
# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the scaled training features.
logreg = LogisticRegression()
logreg.fit(X=scaled_X_train, y=y_train)

In [12]:
# Predict class labels for the held-out test rows and show them.
y_pred_log = logreg.predict(X=scaled_X_test)
y_pred_log

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
# Confusion matrix for logistic regression (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_log)

array([[1550,   57],
       [ 318,   75]], dtype=int64)

In [14]:
# Per-class precision / recall / F1 for logistic regression.
print(classification_report(y_true=y_test, y_pred=y_pred_log))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.57      0.19      0.29       393

    accuracy                           0.81      2000
   macro avg       0.70      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



In [15]:
# Overall fraction of test rows that logistic regression classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_log)

0.8125

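## 81% accuracy is barely above the ~80% majority-class baseline (1607 of the 2000 test customers did not churn), and recall on churners is only 0.19, so let's try a different algorithm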

# Decision Tree

In [16]:
# Shallow decision tree: Gini impurity splits, depth capped at 3, fixed seed.
dc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=0,
)
dc.fit(X=scaled_X_train, y=y_train)

In [17]:
# Decision-tree predictions for the held-out test rows.
y_pred_dect = dc.predict(X=scaled_X_test)

In [18]:
# Confusion matrix for the decision tree (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_dect)

array([[1527,   80],
       [ 223,  170]], dtype=int64)

In [19]:
# Per-class precision / recall / F1 for the decision tree.
print(classification_report(y_true=y_test, y_pred=y_pred_dect))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1607
           1       0.68      0.43      0.53       393

    accuracy                           0.85      2000
   macro avg       0.78      0.69      0.72      2000
weighted avg       0.83      0.85      0.83      2000



## 85% is better, but we can try a different algorithm

# Random Forest

In [20]:
# Random forest of 10 trees with a fixed seed, trained on the scaled training features.
randf = RandomForestClassifier(n_estimators=10, random_state=0)
randf.fit(X=scaled_X_train, y=y_train)

In [21]:
# Random-forest predictions for the held-out test rows.
y_pred_randf = randf.predict(X=scaled_X_test)

In [22]:
# Confusion matrix for the random forest (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_randf)

array([[1549,   58],
       [ 226,  167]], dtype=int64)

In [23]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_true=y_test, y_pred=y_pred_randf))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92      1607
           1       0.74      0.42      0.54       393

    accuracy                           0.86      2000
   macro avg       0.81      0.69      0.73      2000
weighted avg       0.85      0.86      0.84      2000



## 86% is almost the same, so let's try some boosting methods

# AdaBoost Classifier

In [25]:
# AdaBoost ensemble of 50 estimators with a fixed seed.
ada_boost = AdaBoostClassifier(n_estimators=50, random_state=0)
ada_boost.fit(X=scaled_X_train, y=y_train)

In [26]:
# AdaBoost predictions for the held-out test rows.
y_pred_adab = ada_boost.predict(X=scaled_X_test)

In [27]:
# Confusion matrix for AdaBoost (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_adab)

array([[1523,   84],
       [ 201,  192]], dtype=int64)

In [28]:
# Per-class precision / recall / F1 for AdaBoost.
print(classification_report(y_true=y_test, y_pred=y_pred_adab))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1607
           1       0.70      0.49      0.57       393

    accuracy                           0.86      2000
   macro avg       0.79      0.72      0.74      2000
weighted avg       0.85      0.86      0.85      2000



# Gradient Boosting Classifier

In [29]:
# Gradient boosting with 50 boosting stages and a fixed seed.
grad_boost = GradientBoostingClassifier(n_estimators=50, random_state=0)
grad_boost.fit(X=scaled_X_train, y=y_train)

In [30]:
# Gradient-boosting predictions for the held-out test rows.
y_pred_gradb = grad_boost.predict(X=scaled_X_test)

In [31]:
# Confusion matrix for gradient boosting (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_gradb)

array([[1553,   54],
       [ 215,  178]], dtype=int64)

In [32]:
# Per-class precision / recall / F1 for gradient boosting.
print(classification_report(y_true=y_test, y_pred=y_pred_gradb))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.45      0.57       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.86      0.87      0.85      2000



## So, about 87% (86.55%, from Gradient Boosting) is the best accuracy we obtained on this dataset with these machine learning algorithms