# Chrun prediction using Machine Learning




Churn prediction is the process of using data and machine learning to anticipate which customers are likely to stop using a product or service. It helps businesses identify at-risk customers early, enabling targeted retention efforts. The goal is to reduce customer churn and enhance overall customer satisfaction.









# Importing Libraries

In [52]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [53]:
data=pd.read_csv("Churn_Modelling.csv")

In [54]:
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# 1-EDA

In [55]:
# Printing the chrun_modelling data in head and info
print(data.head())
print(data.info())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

# identifying missing values

In [56]:
# To identify the missing values
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


# data preprocessing

In [57]:
print(data.columns)

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [58]:
X=data.drop('Exited', axis=1)
y=data['Exited']

In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
ct=ColumnTransformer(
    [
        ('ohe',OneHotEncoder(drop='first'),['Geography','Gender']),
        ('sc',StandardScaler(),['CreditScore','Age','Balance','EstimatedSalary']),
    ]
)
X_new = ct.fit_transform(X)
X_new

array([[ 0.        ,  0.        ,  0.        , ...,  0.29351742,
        -1.22584767,  0.02188649],
       [ 0.        ,  1.        ,  0.        , ...,  0.19816383,
         0.11735002,  0.21653375],
       [ 0.        ,  0.        ,  0.        , ...,  0.29351742,
         1.33305335,  0.2406869 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.27860412,
        -1.22584767, -1.00864308],
       [ 1.        ,  0.        ,  1.        , ...,  0.29351742,
        -0.02260751, -0.12523071],
       [ 0.        ,  0.        ,  0.        , ..., -1.04143285,
         0.85996499, -1.07636976]])

# Data Splitting

In [60]:

# Train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

In [61]:
# testing out different models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [62]:
# Logistic Regression
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.7935

In [63]:
data.shape

(10000, 14)

In [64]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature scaling

In [65]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training

In [66]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)


# Model Prediction

In [67]:
y_pred = model.predict(X_test_scaled)


# Model Evaluation

In [68]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

Accuracy: 0.812
Confusion Matrix:
[[1505   99]
 [ 277  119]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      1604
           1       0.55      0.30      0.39       396

    accuracy                           0.81      2000
   macro avg       0.70      0.62      0.64      2000
weighted avg       0.79      0.81      0.79      2000



# model Intepretation

In [71]:
# Print lengths of feature_names and feature_importance
print("Length of feature_names:", len(feature_names))
print("Length of feature_importance:", len(feature_importance))

# Print details if there is a mismatch
if len(feature_names) != len(feature_importance):
    print("Mismatched lengths! Details:")
    print("Feature Names:", feature_names)
    print("Feature Importance:", feature_importance)
    


Length of feature_names: 13
Length of feature_importance: 7
Mismatched lengths! Details:
Feature Names: Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')
Feature Importance: [0.0263547  0.01105769 0.01647853 0.23360221 0.27685922 0.19055162
 0.24509602]
