In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
dataset = pd.read_csv("Churn_Modelling.csv", index_col = "RowNumber")
dataset.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
X_columns = dataset.columns.tolist()[2:12]
Y_columns = dataset.columns.tolist()[-1:]
print(X_columns)
print(Y_columns)

['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
['Exited']


In [4]:
X = dataset[X_columns].values
Y = dataset[Y_columns].values

In [5]:
from sklearn.preprocessing import LabelEncoder
X_column_transformer = LabelEncoder()
X[:, 1] = X_column_transformer.fit_transform(X[:, 1])

In [6]:
X[:, 2] = X_column_transformer.fit_transform(X[:, 2])

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [8]:
pipeline = Pipeline(
    [
        ('Categorizer', ColumnTransformer(
            [
                ("Gender Label Encoder", OneHotEncoder(categories = 'auto', drop = 'first'), [2]),
                ("Geography Label Encoder", OneHotEncoder(categories = 'auto', drop = 'first'), [1])
            ],
            remainder = 'passthrough', n_jobs = 1
        )),
        ("Normalizer", StandardScaler())
    ]
)

In [9]:
X = pipeline.fit_transform(X)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

In [24]:
classifier = Sequential()

In [26]:
# classifier.add(Dense(6, activation = 'relu', input_shape = (X_train.shape[1], )))
# classifier.add(Dropout(rate = 0.1))
classifier.add(Input(shape = (X_train.shape[1],)))
classifier.add(Dense(6, activation = 'relu'))
classifier.add(Dropout(rate = 0.1))

In [28]:
classifier.add(Dense(1, activation = 'sigmoid'))

In [30]:
classifier.summary()

In [32]:
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [34]:
history = classifier.fit(X_train, y_train, batch_size = 32, epochs = 200, validation_split = 0.1, verbose = 2)

Epoch 1/200
225/225 - 3s - 15ms/step - accuracy: 0.4872 - loss: 0.7757 - val_accuracy: 0.6475 - val_loss: 0.6322
Epoch 2/200
225/225 - 1s - 2ms/step - accuracy: 0.7404 - loss: 0.5676 - val_accuracy: 0.7800 - val_loss: 0.5302
Epoch 3/200
225/225 - 0s - 2ms/step - accuracy: 0.7881 - loss: 0.5085 - val_accuracy: 0.7962 - val_loss: 0.4810
Epoch 4/200
225/225 - 1s - 2ms/step - accuracy: 0.8032 - loss: 0.4760 - val_accuracy: 0.8100 - val_loss: 0.4504
Epoch 5/200
225/225 - 1s - 2ms/step - accuracy: 0.8044 - loss: 0.4577 - val_accuracy: 0.8163 - val_loss: 0.4298
Epoch 6/200
225/225 - 1s - 2ms/step - accuracy: 0.8126 - loss: 0.4442 - val_accuracy: 0.8338 - val_loss: 0.4141
Epoch 7/200
225/225 - 1s - 2ms/step - accuracy: 0.8169 - loss: 0.4322 - val_accuracy: 0.8425 - val_loss: 0.4030
Epoch 8/200
225/225 - 1s - 2ms/step - accuracy: 0.8204 - loss: 0.4254 - val_accuracy: 0.8413 - val_loss: 0.3936
Epoch 9/200
225/225 - 0s - 2ms/step - accuracy: 0.8235 - loss: 0.4175 - val_accuracy: 0.8438 - val_loss

In [35]:
y_pred = classifier.predict(X_test)
print(y_pred[:5])

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[[0.30269325]
 [0.2955531 ]
 [0.19186662]
 [0.08212812]
 [0.06964915]]


In [36]:
y_pred = (y_pred > 0.5).astype(int)
print(y_pred[:5])

[[0]
 [0]
 [0]
 [0]
 [0]]


In [37]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1550   45]
 [ 222  183]]


In [38]:
print(((cm[0][0] + cm[1][1])* 100) / len(y_test), '% of data was classified correctly')


86.65 % of data was classified correctly
