# Importing all relevant libraries

In [1]:
import pandas as pd
import os
import numpy as np
import random
import matplotlib.pyplot as plt
import sklearn
import scipy

# Get The DATA
#### 1. Loading Data into our Notebook

In [2]:
def load_data(csv_path="C:/Users/AA/Documents/Python-Projects/AIC-Task/Churn_Modelling.csv"):
    return pd.read_csv(csv_path)

churn_model = load_data()
churn_model.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
churn_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 976.6+ KB


In [4]:
churn_model.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
churn_model.apply(lambda x: sum(x.isnull()), axis=0)# checking for null values in data


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
churn_model["Gender"].value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

### 2. Splitting the data

In [7]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [8]:
train_set, test_set = split_train_test(churn_model, 0.2)
print(len(train_set), "train +", len(test_set), "test")

8000 train + 2000 test


### 2. Copying the labels(predictions)

In [9]:
labels_train = train_set['Exited']
labels_test = test_set['Exited']

# Prepare the Data for Machine Learning Algorithms
### 1. Data Cleaning

In [10]:
train_set_copy = train_set.drop(["RowNumber","CustomerId","Surname","Exited"],axis = 1)
test_set_copy = test_set.drop(["RowNumber","CustomerId","Surname","Exited"],axis = 1)

train_set_copy

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
3033,787,France,Female,85,10,0.00,2,1,1,116537.96
175,484,Spain,Female,35,7,133868.21,1,1,1,27286.10
2644,736,France,Male,27,5,51522.75,1,0,1,192131.77
4869,669,France,Male,50,4,149713.61,3,1,1,124872.42
4110,558,France,Male,40,6,0.00,2,1,0,173844.89
...,...,...,...,...,...,...,...,...,...,...
4178,510,France,Female,32,1,0.00,2,0,1,28515.17
1138,468,Germany,Male,42,9,181627.14,2,1,0,172668.39
2741,535,France,Female,38,8,0.00,2,1,0,136620.64
9628,662,France,Female,22,9,0.00,2,1,1,44377.65


In [11]:
train_set_copy["Gender"]= train_set_copy["Gender"].replace("Male",1)
train_set_copy["Gender"]= train_set_copy["Gender"].replace("Female",0)

test_set_copy["Gender"]= test_set_copy["Gender"].replace("Male",1)
test_set_copy["Gender"]= test_set_copy["Gender"].replace("Female",0)

### 2. Appliying Pipeline

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

num_attribs = ["CreditScore","Age","Tenure","Balance","NumOfProducts","EstimatedSalary"]
cat_attribs = ["Geography"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

train_set_prepared = full_pipeline.fit_transform(train_set_copy)
test_set_prepared = full_pipeline.fit_transform(test_set_copy)
pd.DataFrame(train_set_prepared)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.405724,4.419055,1.718573,-1.223542,0.805057,0.281060,1.0,0.0,0.0
1,-1.716602,-0.374613,0.683054,0.916236,-0.912397,-1.267964,0.0,0.0,1.0
2,0.880184,-1.141600,-0.007292,-0.399991,-0.912397,1.593040,1.0,0.0,0.0
3,0.189769,1.063487,-0.352465,1.169512,2.522511,0.425710,1.0,0.0,0.0
4,-0.954053,0.104754,0.337881,-1.223542,0.805057,1.275659,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
7995,-1.448679,-0.662233,-1.387984,-1.223542,0.805057,-1.246633,1.0,0.0,0.0
7996,-1.881477,0.296500,1.373400,1.679624,0.805057,1.255240,0.0,1.0,0.0
7997,-1.191062,-0.086993,1.028227,-1.223542,0.805057,0.629608,1.0,0.0,0.0
7998,0.117636,-1.620967,1.373400,-1.223542,0.805057,-0.971329,1.0,0.0,0.0


In [13]:
train_set_prepared = pd.DataFrame(train_set_prepared)
test_set_prepared = pd.DataFrame(test_set_prepared)

In [14]:
train_set_copy["Gender"] = train_set_copy["Gender"].astype(float)
train_set_copy["HasCrCard"] = train_set_copy["HasCrCard"].astype(float)
train_set_copy["IsActiveMember"] = train_set_copy["IsActiveMember"].astype(float)

test_set_copy["Gender"] = test_set_copy["Gender"].astype(float)
test_set_copy["HasCrCard"] = test_set_copy["HasCrCard"].astype(float)
test_set_copy["IsActiveMember"] = test_set_copy["IsActiveMember"].astype(float)


train_set_prepared[9] = train_set_copy["Gender"].values
train_set_prepared[10] = train_set_copy["HasCrCard"].values
train_set_prepared[11] = train_set_copy["IsActiveMember"].values

test_set_prepared[9] = test_set_copy["Gender"].values
test_set_prepared[10] = test_set_copy["HasCrCard"].values
test_set_prepared[11] = test_set_copy["IsActiveMember"].values

train_set_prepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.405724,4.419055,1.718573,-1.223542,0.805057,0.281060,1.0,0.0,0.0,0.0,1.0,1.0
1,-1.716602,-0.374613,0.683054,0.916236,-0.912397,-1.267964,0.0,0.0,1.0,0.0,1.0,1.0
2,0.880184,-1.141600,-0.007292,-0.399991,-0.912397,1.593040,1.0,0.0,0.0,1.0,0.0,1.0
3,0.189769,1.063487,-0.352465,1.169512,2.522511,0.425710,1.0,0.0,0.0,1.0,1.0,1.0
4,-0.954053,0.104754,0.337881,-1.223542,0.805057,1.275659,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-1.448679,-0.662233,-1.387984,-1.223542,0.805057,-1.246633,1.0,0.0,0.0,0.0,0.0,1.0
7996,-1.881477,0.296500,1.373400,1.679624,0.805057,1.255240,0.0,1.0,0.0,1.0,1.0,0.0
7997,-1.191062,-0.086993,1.028227,-1.223542,0.805057,0.629608,1.0,0.0,0.0,0.0,1.0,0.0
7998,0.117636,-1.620967,1.373400,-1.223542,0.805057,-0.971329,1.0,0.0,0.0,0.0,1.0,1.0


In [15]:
train_set_prepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.405724,4.419055,1.718573,-1.223542,0.805057,0.281060,1.0,0.0,0.0,0.0,1.0,1.0
1,-1.716602,-0.374613,0.683054,0.916236,-0.912397,-1.267964,0.0,0.0,1.0,0.0,1.0,1.0
2,0.880184,-1.141600,-0.007292,-0.399991,-0.912397,1.593040,1.0,0.0,0.0,1.0,0.0,1.0
3,0.189769,1.063487,-0.352465,1.169512,2.522511,0.425710,1.0,0.0,0.0,1.0,1.0,1.0
4,-0.954053,0.104754,0.337881,-1.223542,0.805057,1.275659,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-1.448679,-0.662233,-1.387984,-1.223542,0.805057,-1.246633,1.0,0.0,0.0,0.0,0.0,1.0
7996,-1.881477,0.296500,1.373400,1.679624,0.805057,1.255240,0.0,1.0,0.0,1.0,1.0,0.0
7997,-1.191062,-0.086993,1.028227,-1.223542,0.805057,0.629608,1.0,0.0,0.0,0.0,1.0,0.0
7998,0.117636,-1.620967,1.373400,-1.223542,0.805057,-0.971329,1.0,0.0,0.0,0.0,1.0,1.0


In [16]:
train_set_copy["HasCrCard"].isnull().values.any()

False

# Select a model and train it
### 1. Selecting a model and training it

In [17]:
train_boolean_exited = (labels_train == 1)
test_boolean_exited = (labels_test == 1)

In [18]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score

# Instantiate
rf = RandomForestClassifier()   
# Fit
rf_model = rf.fit(train_set_prepared, labels_train)
# training accuracy 99.74%
rf_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(rf_model.predict(test_set_prepared))
probs = pd.DataFrame(rf_model.predict_proba(test_set_prepared))

# Store metrics
rf_accuracy = metrics.accuracy_score(labels_test, predicted)     
rf_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
rf_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
rf_classification_report = metrics.classification_report(labels_test, predicted)
rf_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
rf_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
rf_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

# Evaluate the model using 10-fold cross-validation
rf_cv_scores = cross_val_score(RandomForestClassifier(), test_set_prepared, labels_test, scoring='precision', cv=10)
rf_cv_mean = np.mean(rf_cv_scores)

In [19]:
rf_accuracy

0.8765

In [20]:
rf_confus_matrix

array([[1545,   63],
       [ 184,  208]], dtype=int64)

In [21]:
rf_cv_scores

array([0.83333333, 0.7       , 0.66666667, 0.875     , 0.8       ,
       0.7       , 0.85714286, 0.95      , 0.82758621, 0.8       ])

In [22]:
rf_cv_mean

0.8009729064039408

In [23]:
rf_precision

0.7675276752767528

In [24]:
rf_recall

0.5306122448979592

In [25]:
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression()
# Fit
logit_model = logit_model.fit(train_set_prepared, labels_train)
# How accurate?
logit_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(logit_model.predict(test_set_prepared))
probs = pd.DataFrame(logit_model.predict_proba(test_set_prepared))

# Store metrics
logit_accuracy = metrics.accuracy_score(labels_test, predicted)     
logit_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
logit_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
logit_classification_report = metrics.classification_report(labels_test, predicted)
logit_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
logit_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
logit_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

logit_cv_scores = cross_val_score(LogisticRegression(), test_set_prepared, labels_test, scoring='precision', cv=10)
logit_cv_mean = np.mean(logit_cv_scores)

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate with a max depth of 3
tree_model = DecisionTreeClassifier(max_depth=3) 
# Fit a decision tree
tree_model = tree_model.fit(train_set_prepared, labels_train)
# Training accuracy
tree_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(tree_model.predict(test_set_prepared))
probs = pd.DataFrame(tree_model.predict_proba(test_set_prepared))

# Store metrics
tree_accuracy = metrics.accuracy_score(labels_test, predicted)     
tree_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
tree_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
tree_classification_report = metrics.classification_report(labels_test, predicted)
tree_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
tree_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
tree_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

tree_cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=3), test_set_prepared, labels_test, scoring='precision', cv=10)
tree_cv_mean = np.mean(tree_cv_scores)

In [27]:
from sklearn.svm import SVC# Instantiate
svm_model = SVC(probability=True)
# Fit
svm_model = svm_model.fit(train_set_prepared, labels_train)
# Accuracy
svm_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(svm_model.predict(test_set_prepared))
probs = pd.DataFrame(svm_model.predict_proba(test_set_prepared))

# Store metrics
svm_accuracy = metrics.accuracy_score(labels_test, predicted)     
svm_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
svm_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
svm_classification_report = metrics.classification_report(labels_test, predicted)
svm_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
svm_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
svm_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

svm_cv_scores = cross_val_score(SVC(probability=True), test_set_prepared, labels_test, scoring='precision', cv=10)
svm_cv_mean = np.mean(svm_cv_scores)

In [28]:
from sklearn.neighbors import KNeighborsClassifier
# instantiate learning model (k = 3)
knn_model = KNeighborsClassifier(n_neighbors=3)
# fit the model
knn_model.fit(train_set_prepared, labels_train)
# Accuracy
knn_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(knn_model.predict(test_set_prepared))
probs = pd.DataFrame(knn_model.predict_proba(test_set_prepared))

# Store metrics
knn_accuracy = metrics.accuracy_score(labels_test, predicted)     
knn_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
knn_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
knn_classification_report = metrics.classification_report(labels_test, predicted)
knn_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
knn_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
knn_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

knn_cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=3), test_set_prepared, labels_test, scoring='precision', cv=10)
knn_cv_mean = np.mean(knn_cv_scores)

In [29]:
from sklearn.naive_bayes import GaussianNB
# Instantiate
bayes_model = GaussianNB()
# Fit the model
bayes_model.fit(train_set_prepared, labels_train)
# Accuracy
bayes_model.score(train_set_prepared, labels_train)

# Predictions/probs on the test dataset
predicted = pd.DataFrame(bayes_model.predict(test_set_prepared))
probs = pd.DataFrame(bayes_model.predict_proba(test_set_prepared))

# Store metrics
bayes_accuracy = metrics.accuracy_score(labels_test, predicted)     
bayes_roc_auc = metrics.roc_auc_score(labels_test, probs[1])       
bayes_confus_matrix = metrics.confusion_matrix(labels_test, predicted) 
bayes_classification_report = metrics.classification_report(labels_test, predicted)
bayes_precision = metrics.precision_score(labels_test, predicted, pos_label=1)
bayes_recall = metrics.recall_score(labels_test, predicted, pos_label=1)
bayes_f1 = metrics.f1_score(labels_test, predicted, pos_label=1)

bayes_cv_scores = cross_val_score(GaussianNB(), test_set_prepared, labels_test, scoring='precision', cv=10)
bayes_cv_mean = np.mean(bayes_cv_scores)

In [30]:
# Model comparison
models = pd.DataFrame({
  'Model': ['Logistic', 'd.Tree', 'r.f.', 'SVM', 'kNN',  'Bayes'],
  'Accuracy' : [logit_accuracy, tree_accuracy, rf_accuracy, svm_accuracy, knn_accuracy, bayes_accuracy],
  'Precision': [logit_precision, tree_precision, rf_precision, svm_precision, knn_precision, bayes_precision],
  'recall' : [logit_recall, tree_recall, rf_recall, svm_recall, knn_recall, bayes_recall],
  'F1' : [logit_f1, tree_f1, rf_f1, svm_f1, knn_f1, bayes_f1],
  'cv_precision' : [logit_cv_mean, tree_cv_mean, rf_cv_mean, svm_cv_mean, knn_cv_mean, bayes_cv_mean]
})
# Print table and sort by test precision
models.sort_values(by='Precision', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,recall,F1,cv_precision
1,d.Tree,0.856,0.90625,0.295918,0.446154,0.796881
3,SVM,0.8795,0.868293,0.454082,0.596315,0.818375
2,r.f.,0.8765,0.767528,0.530612,0.627451,0.800973
0,Logistic,0.823,0.633803,0.229592,0.337079,0.615885
4,kNN,0.8385,0.603604,0.512755,0.554483,0.583024
5,Bayes,0.8255,0.577617,0.408163,0.478326,0.569988
