# Customer Churn Prediction Using Machine Learning

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Read the churn table from disk, then separate predictors from the label.
dataset = pd.read_csv('Churn.csv')

# Predictors: every column from position 3 up to (not including) the last one,
# i.e. the first three columns (RowNumber, CustomerId, Surname) are left out.
X = dataset.iloc[:, 3:-1].to_numpy()

# Label: the last column of the table.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Show the feature matrix built from dataset columns 3 up to (not including) the last.
print(X)

In [None]:
# Show the target vector taken from the dataset's last column.
print(y)

## Check for any Missing Values


In [None]:
# Count the missing entries in each column of the full dataset (one total per column).
missing_data = dataset.isnull().sum(axis=0)
print(missing_data)

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature columns 1 and 2 (Geography and Gender, given the
# dataset's column order); the remaining columns are passed through unchanged
# and placed after the encoded ones.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2])], remainder='passthrough')

# Encode from the dataset slice instead of from X itself. With `X = f(X)`,
# executing this cell a second time would one-hot encode columns 1 and 2 of the
# already-encoded matrix and silently corrupt the features.
X = np.array(ct.fit_transform(dataset.iloc[:, 3:-1].values))

In [None]:
# Show X after the categorical columns were one-hot encoded.
print(X)

[[1.0 0.0 0.0 ... 1 1 101348.88]
 [0.0 0.0 1.0 ... 0 1 112542.58]
 [1.0 0.0 0.0 ... 1 0 113931.57]
 ...
 [1.0 0.0 0.0 ... 0 1 42085.58]
 [0.0 1.0 0.0 ... 1 0 92888.52]
 [1.0 0.0 0.0 ... 1 0 38190.78]]


## Check Number of columns after One-Hot Encoding

In [None]:
# Width of the encoded feature matrix: the size of X's last (column) axis.
num_columns = X.shape[-1]
print("Number of columns after transformation:", num_columns)

Number of columns after transformation: 13


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so Restart & Run All reproduces the same split
# (and therefore the same metrics in the cells below).
# stratify=y keeps the churn / non-churn ratio the same in both splits, which
# matters because the positive class is the minority.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

## Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Single scaling step. This section previously held three alternative scaling
# cells (StandardScaler on four columns, StandardScaler on every column, and
# MinMaxScaler on four columns); executed top to bottom they stacked on one
# another. Only the MinMax variant is kept, which is the one the printed
# training row in the next cell reflects (one-hot columns still 0/1).
#
# Columns rescaled: 5 = CreditScore, 6 = Age, 8 = Balance, 12 = EstimatedSalary
# (positions after one-hot encoding puts the 3 Geography + 2 Gender columns first).
# NOTE(review): Tenure (7) and NumOfProducts (9) stay on their raw scale, which
# affects distance/gradient-based models (K-NN, SVM, saga) - confirm this is intended.
scaled_columns = [5, 6, 8, 12]

sc = MinMaxScaler()
# Fit the scaler on the training rows only, then reuse it for the test rows so
# no information from the test set leaks into the scaling parameters.
X_train[:, scaled_columns] = sc.fit_transform(X_train[:, scaled_columns])
X_test[:, scaled_columns] = sc.transform(X_test[:, scaled_columns])

In [None]:
# Sanity check: show the first training row after scaling.
print(X_train[0])

[1.0 0.0 0.0 0.0 1.0 0.9999999999999999 0.6081081081081081 8
 0.6768986164860802 1 0 0 0.9205663140829949]


## Check Number of rows after split

In [None]:
# Number of rows that ended up in the training split.
print(len(X_train))

8000


## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression fitted with the 'saga' solver.
classifier1 = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier1.fit(X_train, y_train)

## Making the Confusion Matrix for Logistic Regression

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the logistic regression model on the held-out test split.
y_pred = classifier1.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1560   58]
 [ 317   65]]


0.8125

## Applying k-Fold Cross Validation to Logistic Regression

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the logistic regression model on the training split;
# report the mean and the spread of the per-fold scores as percentages.
accuracies = cross_val_score(classifier1, X_train, y_train, cv=10)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 81.16 %
Standard Deviation: 1.35 %


## Applying Grid Search to find the best parameters for Logistic Regression


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: both penalties, with and without class re-weighting,
# at a fixed C and solver.
parameters = [
    {
        'C': [0.1],
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
        'class_weight': [None, 'balanced'],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier1, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 81.31 %
Best Parameters: {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'}


## Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 19-nearest-neighbours with distance-weighted votes; Minkowski metric with p = 1.
classifier2 = KNeighborsClassifier(
    n_neighbors=19,
    weights='distance',
    metric='minkowski',
    p=1,
    leaf_size=10,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier2.fit(X_train, y_train)

## Making the Confusion Matrix for K-NN

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the K-NN model on the held-out test split.
y_pred = classifier2.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1557   61]
 [ 313   69]]


0.813

## Applying Grid Search to find the best parameters for K-NN


In [None]:
from sklearn.model_selection import GridSearchCV

# Only the neighbourhood size varies; the other K-NN settings are held fixed.
parameters = [
    {
        'n_neighbors': [19, 21, 25],
        'weights': ['distance'],
        'p': [1],
        'leaf_size': [10],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier2, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 80.97 %
Best Parameters: {'leaf_size': 10, 'n_neighbors': 19, 'p': 1, 'weights': 'distance'}


## Training the Kernel SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel.
classifier3 = SVC(
    kernel='rbf',
    C=10,
    gamma='auto',
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier3.fit(X_train, y_train)

## Making the Confusion Matrix for Kernel SVM

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the kernel SVM model on the held-out test split.
y_pred = classifier3.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1571   47]
 [ 264  118]]


0.8445

## Applying Grid Search to find the best parameters for SVM

In [None]:
from sklearn.model_selection import GridSearchCV

# Compare two gamma settings for the RBF kernel at a fixed C.
parameters = [
    {
        'C': [10],
        'kernel': ['rbf'],
        'gamma': [1, 'auto'],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier3, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 85.72 %
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}


## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with the library's default settings, fitted on the training split.
classifier4 = GaussianNB()
# Last expression of the cell, so the fitted estimator is displayed.
classifier4.fit(X_train, y_train)

## Making the Confusion Matrix for Naive Bayes


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the Naive Bayes model on the held-out test split.
y_pred = classifier4.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1476  142]
 [ 240  142]]


0.809

## Applying Grid Search to find the best parameters for Naive Bayes


In [None]:
from sklearn.model_selection import GridSearchCV

# Two candidate values for the variance-smoothing term.
parameters = [
    {
        'var_smoothing': [1e-9, 1e-11],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier4, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 81.49 %
Best Parameters: {'var_smoothing': 1e-09}


## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, capped in depth and in number of leaves.
classifier5 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_leaf_nodes=40,
    min_samples_split=5,
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier5.fit(X_train, y_train)

## Making the Confusion Matrix for Decision Tree

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the decision tree model on the held-out test split.
y_pred = classifier5.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1535   83]
 [ 208  174]]


0.8545

## Applying Grid Search to find the best parameters for Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV

# The grid holds a single combination, so this run amounts to a 10-fold
# cross-validated accuracy estimate for that one configuration.
parameters = [
    {
        'criterion': ['entropy'],
        'max_depth': [10],
        'min_samples_split': [5],
        'max_leaf_nodes': [40],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier5, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 85.88 %
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': 40, 'min_samples_split': 5}


## Training the Random Forest Classification model on the Training set


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 300 entropy-based trees with depth and leaf-size limits.
classifier6 = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier6.fit(X_train, y_train)

## Making the Confusion Matrix for Random Forest

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the random forest model on the held-out test split.
y_pred = classifier6.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Bare last expression: the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1557   61]
 [ 224  158]]


0.8575

## Applying Grid Search to find the best parameters for Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Compare two depth limits. Note the grid evaluates forests of 100 trees,
# whereas classifier6 itself was constructed with 300.
parameters = [
    {
        'n_estimators': [100],
        'criterion': ['entropy'],
        'max_depth': [10, 5],
        'min_samples_split': [10],
        'min_samples_leaf': [2],
    }
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier6, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 86.42 %
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
