# Customer Churn Prediction

- Problem statement and dataset: https://www.kaggle.com/shrutimechlearn/churn-modelling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the churn dataset (CSV expected in the current working directory).
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [3]:
df.shape

(10000, 14)

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Peek at five random rows; the fixed seed makes the displayed sample
# reproducible across Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2627,2628,15707144,Onyeorulu,571,Germany,Male,25,6,82506.72,2,1,0,167705.07,0
3677,3678,15790442,Wright,631,Spain,Male,33,2,0.0,2,1,1,158268.84,0
2886,2887,15675328,Knight,449,France,Female,37,6,0.0,2,1,0,82176.48,0
295,296,15723654,Tsao,773,France,Male,25,2,135903.33,1,1,0,73656.38,0
6628,6629,15584967,Chiganu,596,Spain,Male,57,6,0.0,2,1,1,72402.0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [9]:
 # Per-column count of missing values.
 df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# Data Preprocessing

In [11]:
# Remove the row number, customer id and surname columns before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['RowNumber','CustomerId','Surname'], errors='ignore')

In [12]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [13]:
# Replace each Geography label with its integer category code.
df['Geography'] = pd.Categorical(df['Geography']).codes

In [14]:
# Encode Gender as 1 for 'Male' and 0 for anything else.
# The dtype guard makes the cell idempotent: once the column is numeric,
# re-running the comparison against 'Male' would match nothing and silently
# overwrite every value with 0.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = np.where(df['Gender']=='Male',1,0)

In [15]:
# Target vector is the 'Exited' column; the feature matrix is everything else.
y = df['Exited'].to_numpy()
x = df.drop(columns='Exited').to_numpy()

In [16]:
x.shape

(10000, 10)

In [17]:
y.shape

(10000,)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Model Building

In [20]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score

In [21]:
from xgboost import XGBClassifier

In [22]:
# Baseline: XGBoost classifier with library-default hyperparameters,
# fitted on the training split.
basic_model = XGBClassifier()
basic_model.fit(X=x_train, y=y_train)

In [24]:
def evaluate_model(model, train=None, test=None):
    """Print accuracy, confusion matrix, classification report and F1 score.

    Parameters
    ----------
    model
        A fitted classifier exposing ``score(X, y)`` and ``predict(X)``.
    train, test
        Optional ``(features, labels)`` pairs. When omitted, the notebook-level
        ``x_train``/``y_train`` and ``x_test``/``y_test`` are used, which is
        the original behaviour of this helper.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    features_train, labels_train = (x_train, y_train) if train is None else train
    features_test, labels_test = (x_test, y_test) if test is None else test

    print("Training Accuracy : ",model.score(features_train,labels_train))
    print("Testing Accuracy : ",model.score(features_test,labels_test))

    y_pred = model.predict(features_test)

    print(confusion_matrix(labels_test,y_pred))
    print(classification_report(labels_test,y_pred))
    print("F1 Score : ",f1_score(labels_test,y_pred))

In [25]:
evaluate_model(basic_model)

Training Accuracy :  0.9614925373134329
Testing Accuracy :  0.8612121212121212
[[2529  128]
 [ 330  313]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      2657
           1       0.71      0.49      0.58       643

    accuracy                           0.86      3300
   macro avg       0.80      0.72      0.75      3300
weighted avg       0.85      0.86      0.85      3300

F1 Score :  0.577490774907749


# Hyperparameter Tuning

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Search space for the grid search: number of boosting rounds x tree depth.
xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[4, 5],
)

In [28]:
# Exhaustive search over xgb_params with 10-fold cross-validation,
# ranking candidates by accuracy.
grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=xgb_params,
    scoring="accuracy",
    cv=10,
)
grid.fit(x_train, y_train)

In [29]:
print("Best score ",grid.best_score_)

Best score  0.8532835820895522


In [30]:
print("Best parameters ",grid.best_params_)

Best parameters  {'max_depth': 4, 'n_estimators': 100}


In [31]:
# Refit using the hyperparameters GridSearchCV selected rather than re-typing
# them by hand, so this cell cannot drift from the search result if the grid
# or the data changes.
model_1 = XGBClassifier(**grid.best_params_)
model_1.fit(x_train,y_train)

In [32]:
evaluate_model(model_1)

Training Accuracy :  0.9095522388059701
Testing Accuracy :  0.866969696969697
[[2539  118]
 [ 321  322]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      2657
           1       0.73      0.50      0.59       643

    accuracy                           0.87      3300
   macro avg       0.81      0.73      0.76      3300
weighted avg       0.86      0.87      0.86      3300

F1 Score :  0.5946445060018468


In [36]:
import pickle

In [40]:
# File name the tuned model is pickled to (relative path, so it resolves
# against the current working directory).
ml_model = "XGB_model.pkl"

In [41]:
# Serialise the tuned classifier to disk; the context manager closes the
# file handle even if pickling fails.
with open(ml_model, mode='wb') as model_file:
    pickle.dump(model_1, model_file)

In [42]:
# Usage note kept as a bare string literal; as the last expression in the
# cell, its repr is echoed as the cell output.
# NOTE(review): pickle.load can execute arbitrary code — only unpickle files
# you created yourself or otherwise trust.
'''
To load the model back into your program, you can use the pickle.load() function. Here's an example:

with open(ml_model, 'rb') as file:
    loaded_model = pickle.load(file)

'''

"\nTo load the model back into your program, you can use the pickle.load() function. Here's an example:\n\nwith open(ml_model, 'rb') as file:\n    loaded_model = pickle.load(file)\n\n"