Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Import data

In [None]:
# Read the churn records CSV; the path is relative to the working directory.
dataset = pd.read_csv(filepath_or_buffer='Customer-Churn-Records.csv')

Remove Missing Data

In [None]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how='any')

Split data into inputs/outputs

In [None]:
# Features: every column from position 3 onward, minus the target column.
# NOTE(review): the three skipped leading columns are presumably row/customer
# identifiers — confirm against the CSV header.
# drop() returns a new frame, so the slice taken from `dataset` is not mutated.
X = dataset.iloc[:, 3:].drop(columns=['Exited'])
print(X)

# Target: selected by name instead of by hard-coded position (previously
# column index 13), so a change in column order cannot silently select a
# different column as the label.
y = dataset['Exited']
print(y)

      CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619    France  Female   42       2       0.00              1   
1             608     Spain  Female   41       1   83807.86              1   
2             502    France  Female   42       8  159660.80              3   
3             699    France  Female   39       1       0.00              2   
4             850     Spain  Female   43       2  125510.82              1   
...           ...       ...     ...  ...     ...        ...            ...   
9995          771    France    Male   39       5       0.00              2   
9996          516    France    Male   35      10   57369.61              1   
9997          709    France  Female   36       7       0.00              1   
9998          772   Germany    Male   42       3   75075.31              2   
9999          792    France  Female   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  Complain  \
0

Encode categorical features (one-hot encoding & ordinal label mapping)

In [None]:
# One-hot encode the nominal columns in a single call. With `columns=` given,
# get_dummies removes each listed column and appends its indicator columns
# (prefixed with the column name) after the remaining columns, in the order
# listed — the same layout as encoding Geography and then Gender separately.
X = pd.get_dummies(X, columns=['Geography', 'Gender'])

# Card Type is encoded as an integer rank.
# NOTE(review): the SILVER < GOLD < PLATINUM < DIAMOND ordering is taken from
# this mapping as written — confirm it reflects the intended tier order.
label_order = {'SILVER': 0, 'GOLD': 1, 'PLATINUM': 2, 'DIAMOND': 3}
card_type_codes = X['Card Type'].map(label_order)

# Series.map() turns any value missing from the dict into NaN. Fail loudly here
# instead of letting NaNs flow silently into scaling and model training.
unmapped_labels = X.loc[card_type_codes.isna(), 'Card Type'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped 'Card Type' values: {list(unmapped_labels)}")

X['Card Type'] = card_type_codes
print(X)

      CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0             619   42       2       0.00              1          1   
1             608   41       1   83807.86              1          0   
2             502   42       8  159660.80              3          1   
3             699   39       1       0.00              2          0   
4             850   43       2  125510.82              1          1   
...           ...  ...     ...        ...            ...        ...   
9995          771   39       5       0.00              2          1   
9996          516   35      10   57369.61              1          1   
9997          709   36       7       0.00              1          0   
9998          772   42       3   75075.31              2          1   
9999          792   28       4  130142.79              1          1   

      IsActiveMember  EstimatedSalary  Complain  Satisfaction Score  \
0                  1        101348.88         1                   2   
1    

Split data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

# Learn the per-feature mean and standard deviation from the training set only.
X_train = sc_X.fit_transform(X_train)

# Apply the training statistics to the test set. Calling fit_transform here
# would re-fit the scaler on the test data, so the two sets would be scaled
# differently and test-set information would leak into preprocessing.
X_test = sc_X.transform(X_test)

print(X_train)
print(X_test)

[[ 0.16958176 -0.46460796  0.00666099 ...  1.74309049  1.09168714
  -1.09168714]
 [-2.30455945  0.30102557 -1.37744033 ... -0.57369368 -0.91601335
   0.91601335]
 [-1.19119591 -0.94312892 -1.031415   ... -0.57369368  1.09168714
  -1.09168714]
 ...
 [ 0.9015152  -0.36890377  0.00666099 ... -0.57369368 -0.91601335
   0.91601335]
 [-0.62420521 -0.08179119  1.39076231 ...  1.74309049  1.09168714
  -1.09168714]
 [-0.28401079  0.87525072 -1.37744033 ... -0.57369368  1.09168714
  -1.09168714]]
[[-0.56129438 -0.39401698  0.9869706  ... -0.57427105  1.11339196
  -1.11339196]
 [-1.33847768  0.07611425 -1.08432132 ... -0.57427105  1.11339196
  -1.11339196]
 [ 0.58347561  0.26416674  0.9869706  ...  1.74133801  1.11339196
  -1.11339196]
 ...
 [-0.76084144 -0.29999074 -1.42953664 ...  1.74133801 -0.8981563
   0.8981563 ]
 [-0.0046631  -0.48804323 -0.39389068 ... -0.57427105 -0.8981563
   0.8981563 ]
 [-0.81335383 -0.86414821  0.9869706  ... -0.57427105 -0.8981563
   0.8981563 ]]


Train model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training data.
# NOTE(review): the following cell rebinds `classifier` to a random forest, so
# on a top-to-bottom run this fitted model is not the one used for predictions.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier; this rebinds `classifier`, replacing whatever
# model the name referred to before. The fixed seed keeps the forest repeatable.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X=X_train, y=y_train)

Make predictions

In [None]:
# Predict labels for the held-out rows.
predictions = classifier.predict(X_test)

# Show actual (left column) and predicted (right column) labels side by side.
actual_vs_predicted = np.column_stack((y_test.values, predictions))
print(actual_vs_predicted)

[[0 0]
 [1 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


Evaluate classification model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, predictions)

# Fraction of test rows whose predicted label equals the true label.
# NOTE(review): the recorded run reports ~0.9995 accuracy, which is unusually
# high — check the feature set for target leakage (e.g. a column that is
# effectively a proxy for the label) — TODO confirm.
print(accuracy_score(y_test, predictions))
print(cm)

0.9995
[[1595    0]
 [   1  404]]
