In [1]:
#import important libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, accuracy_score
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the churn data set from the CSV file into a DataFrame.
DATA_FILE = "Churn_Modelling.csv"
df = pd.read_csv(DATA_FILE)

In [3]:
# Show the data set's dimensions as (rows, columns); the column count
# includes the target column as well as the features.
df.shape

(10000, 14)

In [4]:
# List the dtype of every column (the features and the target variable).
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [5]:
# Preview the first rows of the data as loaded from the CSV file.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Names of the columns that are deleted from `df` before modelling.
dropped = ["RowNumber", "CustomerId", "Surname"]

In [7]:
# Drop the columns listed in `dropped`. errors="ignore" keeps the cell
# re-runnable: on a second run the columns are already gone and a plain
# `del df[...]` would raise KeyError.
df = df.drop(columns=dropped, errors="ignore")

# Round the two monetary columns up to whole units and store them as integers.
df["Balance"] = np.ceil(df["Balance"]).astype(int)
df["EstimatedSalary"] = np.ceil(df["EstimatedSalary"]).astype(int)

# One-hot encode the remaining object columns. dtype=int pins the indicator
# columns to 0/1 integers on every pandas version (recent versions default to
# bool, which would turn a later np.array(X) into an object array).
df = pd.get_dummies(df, dtype=int)

In [8]:
# Preview the frame after dropping columns, rounding and one-hot encoding.
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0,1,1,1,101349,1,1,0,0,1,0
1,608,41,1,83808,1,0,1,112543,0,0,0,1,1,0
2,502,42,8,159661,3,1,0,113932,1,1,0,0,1,0
3,699,39,1,0,2,0,0,93827,0,1,0,0,1,0
4,850,43,2,125511,1,1,1,79085,0,0,0,1,1,0


In [9]:
# Remove the outliers
def remove_outlier(feature, data=None):
    """Replace the IQR outliers of one column with that column's median, in place.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the 25th percentile or above the 75th percentile of the column. The
    median is computed before any replacement takes place.

    Parameters
    ----------
    feature : str
        Name of the column to clean.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used
        (looked up at call time), which is the original behaviour.

    Returns
    -------
    None
        The frame is modified in place. If at least one value is replaced,
        the column is converted to float so that a fractional median can be
        stored; otherwise the column is left untouched.
    """
    if data is None:
        data = df

    column = data[feature]
    first_q, third_q = np.percentile(column, [25, 75])
    fence = 1.5 * (third_q - first_q)
    minimum = first_q - fence
    maximum = third_q + fence

    # The replacement value is the median (the original code stored it in a
    # variable misleadingly called `mean`).
    median = column.median()

    is_outlier = (column < minimum) | (column > maximum)
    if not is_outlier.any():
        return

    # Cast explicitly instead of relying on pandas silently upcasting an
    # integer column when a float is written into it.
    data[feature] = column.astype(float)
    data.loc[is_outlier, feature] = median


outliers = ["CreditScore", "Age", "NumOfProducts"]

In [10]:
# Clean every column named in `outliers`, then separate the target column
# "Exited" (y) from the remaining feature columns (X).
for column_name in outliers:
    remove_outlier(column_name)
y = df["Exited"]
X = df.loc[:, df.columns != "Exited"]

In [11]:
# Preview the frame after the outlier replacement step.
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619.0,42.0,2,0,1.0,1,1,101349,1,1,0,0,1,0
1,608.0,41.0,1,83808,1.0,0,1,112543,0,0,0,1,1,0
2,502.0,42.0,8,159661,3.0,1,0,113932,1,1,0,0,1,0
3,699.0,39.0,1,0,2.0,0,0,93827,0,1,0,0,1,0
4,850.0,43.0,2,125511,1.0,1,1,79085,0,0,0,1,1,0


In [None]:
def knn_predict(train_x, train_y, query_x, k):
    """Classify each row of ``query_x`` by majority vote of its k nearest training rows.

    Distances are Euclidean. A tie between the two classes is resolved in
    favour of class 1, as in the original voting rule.

    Parameters
    ----------
    train_x : numpy.ndarray, shape (n_train, n_features)
        Training features.
    train_y : array-like of 0/1 labels, shape (n_train,) or (n_train, 1)
        Training labels.
    query_x : numpy.ndarray, shape (n_query, n_features)
        Rows to classify.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    list of int
        One predicted label (0 or 1) per row of ``query_x``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no training rows.
    """
    labels = np.asarray(train_y).ravel()
    if k < 1 or len(train_x) == 0:
        raise ValueError("k must be >= 1 and the training set must not be empty")
    k = min(k, len(train_x))

    predictions = []
    for row in query_x:
        # The distance is computed from scratch for every training row; the
        # original accumulated a single running sum across all of them.
        dist = np.sqrt(((train_x - row) ** 2).sum(axis=1))
        nearest = np.argsort(dist, kind="stable")[:k]
        votes_for_1 = int((labels[nearest] == 1).sum())
        votes_for_0 = int((labels[nearest] == 0).sum())
        predictions.append(1 if votes_for_1 >= votes_for_0 else 0)
    return predictions


# Convert data-set from data-frame to numpy.
# dtype=float guarantees a numeric array even if some columns are bool.
x_data = np.array(X, dtype=float)
y_data = np.array(y)
y_data = y_data.reshape(len(y_data), 1)  # column vector, shape (n_samples, 1)

# Split the data set into train and test data set.
# A fixed seed makes the split (and every score below) reproducible.
SEED = 42
x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    x_data, y_data, test_size=0.40, shuffle=True, random_state=SEED)

# Normalization of the data-set: standardise each feature (column) on its own,
# using statistics from the training rows only so the test rows do not
# influence the scaling. The original used one global mean/std for the whole
# matrix, which leaves the large-valued columns dominating every distance.
feature_mean = x_data_train.mean(axis=0)
feature_std = x_data_train.std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant column: avoid division by zero
x_data = (x_data - feature_mean) / feature_std
x_data_train = (x_data_train - feature_mean) / feature_std
x_data_test = (x_data_test - feature_mean) / feature_std

# Define the KNN value (it can be changed to improve the accuracy of the model)
Knn = 3

# KNN has no separate training step: predict directly from the training rows.
# Note that when scoring the training set, each row is its own nearest
# neighbour, so the train accuracy is optimistic.
y_pred_train = knn_predict(x_data_train, y_data_train, x_data_train, Knn)
y_pred = knn_predict(x_data_train, y_data_train, x_data_test, Knn)

# Print the accuracy score using sklearn function.
print("Train accuracy", accuracy_score(y_data_train.ravel(), y_pred_train) * 100)
print("Test accuracy", accuracy_score(y_data_test.ravel(), y_pred) * 100)

In [None]:
# Using sklearn's k-nearest-neighbours classifier as a reference for the
# hand-written implementation; both use the same number of neighbours (Knn).
model = KNeighborsClassifier(n_neighbors=Knn)
# fit() returns the estimator itself; `reg` is kept as an alias of `model`.
# ravel() turns the (n, 1) label column into the 1-D vector sklearn expects.
reg = model.fit(x_data_train, y_data_train.ravel())

# Score real predictions on both splits (the original compared the training
# labels with themselves, which always reports 100).
y_pred_train = reg.predict(x_data_train)
y_pred = reg.predict(x_data_test)
print("Train accuracy", accuracy_score(y_data_train.ravel(), y_pred_train) * 100)
print("Test accuracy", accuracy_score(y_data_test.ravel(), y_pred) * 100)