### Import Necessary Libraries

In [1]:
## Data Analysis
import numpy as np
import pandas as pd

## Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

### Load the Dataset

In [2]:
df = pd.read_csv('onehot.csv')
df.head()

Unnamed: 0,Gender,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,...,Card_Category_Silver,Customer_Age,Months_on_book,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio,Attrition_Flag
0,1,3,5,1,3,0,0,1,0,0,...,0,0.5,0.604651,0.34019,0.308701,0.345116,0.035273,0.248062,0.061061,1
1,0,5,6,1,2,0,1,0,0,0,...,0,0.605263,0.72093,0.206112,0.343266,0.214093,0.043452,0.178295,0.105105,1
2,1,3,4,1,0,0,1,0,0,0,...,0,0.657895,0.534884,0.05985,0.0,0.098948,0.076611,0.077519,0.0,1
3,0,4,3,4,1,0,0,1,0,0,...,0,0.368421,0.488372,0.056676,1.0,0.022977,0.036775,0.077519,0.760761,1
4,1,3,5,1,0,0,0,0,0,1,...,0,0.368421,0.186047,0.099091,0.0,0.136557,0.017025,0.139535,0.0,1


In [3]:
print(f"The Data has {df.shape[0]} rows and {df.shape[1]} columns")

The Data has 10016 rows and 28 columns


### Model Building

In [4]:
X = df.drop(['Attrition_Flag'], axis=1)
y = df['Attrition_Flag']

#### Split the Dataset

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [6]:
print(len(X_train), len(X_test))

8012 2004


### Handling Imbalanced Data

In [7]:
# !pip install imblearn

In [8]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [9]:
os= SMOTETomek(0.75)
Xtrain_bal, ytrain_bal = os.fit_resample(X, y)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(ytrain_bal)))

The number of classes before fit Counter({1: 6724, 0: 1288})
The number of classes after fit Counter({1: 8365, 0: 6265})


In [10]:
### CHECK % FOR TRAIN ANS TEST CLASSES
print(ytrain_bal.value_counts())
print()
print(y_test.value_counts())

1    8365
0    6265
Name: Attrition_Flag, dtype: int64

1    1676
0     328
Name: Attrition_Flag, dtype: int64


#### Algorithms
From here, we will be running the following algorithms.

- Logistic Regression
- KNN
- Naive Bayes
- Stochastic Gradient Decent
- Linear SVC
- Decision Tree
- Gradient Boosted Trees
- Random Forest

In any model building, we mainly focus on 3 main steps:

1. Fitting the model and finding the accuracy (accuracy score) of the fitted model.
2. Perform K-Fold Cross Validation (K needs to be specified).
3. Find the accuracy of the Cross Validation.

In [11]:
# Python Imports
import math,time,random,datetime

# Machine Learning
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix

In [12]:
# For our simplicity let us create a function that we can call for each model.
def ML_algo(algo, X_train, y_train, cv):
    
    ## Step 1:
    model = algo.fit(X_train, y_train)                          # Creating the model. We will fit the algorithm to the training data.
    accuracy = round(model.score(X_train, y_train)*100, 2)

    ## Step 2:  --> This code performs Cross Validation automatically.
    train_pred = model_selection.cross_val_predict(algo, X_train, y_train, cv= cv, n_jobs= -1)

    
    ## Step 3:  --> Cross Validation accuracy metric.
    accuracy_cv = round(metrics.accuracy_score(y_train, train_pred)*100, 2)

    ## Step 4:  --> Confusion Matrix
    cm = confusion_matrix(y_train, train_pred)
    
    return train_pred, accuracy, accuracy_cv, cm

##### Model 1: Logistic Regression

In [13]:
# Logistic Regression
start_time = time.time()
log_train_pred, log_acc, log_acc_cv, log_cm= ML_algo(LogisticRegression(), Xtrain_bal, ytrain_bal, 10)

log_time = (time.time()- start_time)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
print()
print('Accuracy of the model is: ', log_acc)
print('Accuracy of 10-Fold CV is: ', log_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= log_time))
print()
print('Confusion Matrix: ')
print(log_cm)


Accuracy of the model is:  85.78
Accuracy of 10-Fold CV is:  81.77
Running time is:  0:00:11.102158

Confusion Matrix: 
[[5200 1065]
 [1602 6763]]


##### Model 2: K-Nearest Neighbours

In [15]:
# K-Nearest Neighbours
start_time = time.time()
knn_train_pred, knn_acc, knn_acc_cv, knn_cm = ML_algo(KNeighborsClassifier(), Xtrain_bal, ytrain_bal, 10)

knn_time = (time.time()- start_time)

print('Accuracy of the model is: ', knn_acc)
print('Accuracy of 10-Fold CV is: ', knn_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= knn_time))
print()
print('Confusion Matrix: ')
print(knn_cm)

Accuracy of the model is:  89.64
Accuracy of 10-Fold CV is:  81.33
Running time is:  0:00:19.620282

Confusion Matrix: 
[[5874  391]
 [2341 6024]]


##### Model 3: Gaussian Naive Bayes

In [16]:
# Gaussian Naive Bayes
start_time = time.time()
gnb_train_pred, gnb_acc, gnb_acc_cv, gnb_cm = ML_algo(GaussianNB(), Xtrain_bal, ytrain_bal, 10)

gnb_time = (time.time()- start_time)

print('Accuracy of the model is: ', gnb_acc)
print('Accuracy of 10-Fold CV is: ', gnb_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= gnb_time))
print()
print('Confusion Matrix: ')
print(gnb_cm)

Accuracy of the model is:  73.49
Accuracy of 10-Fold CV is:  70.2
Running time is:  0:00:00.365621

Confusion Matrix: 
[[5350  915]
 [3445 4920]]


##### Model 4: Linear Support Vector Machines (SVC)

In [17]:
# Support Vector Machines
start_time = time.time()
svm_train_pred, svm_acc, svm_acc_cv, svm_cm = ML_algo(LinearSVC(), Xtrain_bal, ytrain_bal, 10)

svm_time = (time.time()- start_time)



In [18]:
print()
print('Accuracy of the model is: ', svm_acc)
print('Accuracy of 10-Fold CV is: ', svm_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= svm_time))
print()
print('Confusion Matrix: ')
print(svm_cm)


Accuracy of the model is:  85.76
Accuracy of 10-Fold CV is:  80.96
Running time is:  0:00:13.538316

Confusion Matrix: 
[[5239 1026]
 [1760 6605]]


##### Model 5: Stochastic Gradient Descent

In [19]:
# Stochastic Gradient Descent
start_time = time.time()
sgd_train_pred, sgd_acc, sgd_acc_cv, sgd_cm = ML_algo(SGDClassifier(), Xtrain_bal, ytrain_bal, 10)

sgd_time = (time.time()- start_time)

print('Accuracy of the model is: ', sgd_acc)
print('Accuracy of 10-Fold CV is: ', sgd_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= sgd_time))
print()
print('Confusion Matrix: ')
print(sgd_cm)

Accuracy of the model is:  85.95
Accuracy of 10-Fold CV is:  81.59
Running time is:  0:00:01.233890

Confusion Matrix: 
[[5052 1213]
 [1480 6885]]


##### Model 6: Decision Tree Classifier

In [20]:
# Decision Tree Classifier
start_time = time.time()
dtc_train_pred, dtc_acc, dtc_acc_cv, dtc_cm = ML_algo(DecisionTreeClassifier(), Xtrain_bal, ytrain_bal, 10)

dtc_time = (time.time()- start_time)

print('Accuracy of the model is: ', dtc_acc)
print('Accuracy of 10-Fold CV is: ', dtc_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= dtc_time))
print()
print('Confusion Matrix: ')
print(dtc_cm)

Accuracy of the model is:  100.0
Accuracy of 10-Fold CV is:  89.38
Running time is:  0:00:01.095675

Confusion Matrix: 
[[5735  530]
 [1024 7341]]


##### Model 7: Gradient Boost Trees

In [21]:
# Gradient Boost Trees
start_time = time.time()
gbt_train_pred, gbt_acc, gbt_acc_cv, gbt_cm = ML_algo(GradientBoostingClassifier(), Xtrain_bal, ytrain_bal, 10)

gbt_time = (time.time()- start_time)

print('Accuracy of the model is: ', gbt_acc)
print('Accuracy of 10-Fold CV is: ', gbt_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= gbt_time))
print()
print('Confusion Matrix: ')
print(gbt_cm)

Accuracy of the model is:  94.88
Accuracy of 10-Fold CV is:  91.78
Running time is:  0:00:20.455931

Confusion Matrix: 
[[5832  433]
 [ 769 7596]]


##### Model 8: Random Forest

In [22]:
# Random Forest
start_time = time.time()
algorithm1 = RandomForestClassifier()

## Step 1:
modelRF = algorithm1.fit(Xtrain_bal, ytrain_bal)      # Creating the model. We will fit the algorithm to the training data.
rf_acc = round(modelRF.score(Xtrain_bal, ytrain_bal)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
rf_train_pred = model_selection.cross_val_predict(algorithm1, Xtrain_bal, ytrain_bal, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
rf_acc_cv = round(metrics.accuracy_score(ytrain_bal, rf_train_pred)*100, 2)

rf_cm = confusion_matrix(ytrain_bal, rf_train_pred)

rf_time = (time.time()- start_time)

In [23]:
print('Accuracy of the model is: ', rf_acc)
print('Accuracy of 10-Fold CV is: ', rf_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= rf_time))
print()
print('Confusion Matrix: ')
print(rf_cm)

Accuracy of the model is:  100.0
Accuracy of 10-Fold CV is:  93.42
Running time is:  0:00:12.756637

Confusion Matrix: 
[[6002  263]
 [ 700 7665]]


##### Model 9: XGBoost Classifier

In [24]:
# conda install py-xgboost

In [25]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [26]:
start_time = time.time()
algorithm2 = XGBClassifier()

## Step 1:
modelXGB = algorithm2.fit(Xtrain_bal, ytrain_bal)      # Creating the model. We will fit the algorithm to the training data.
xgb_acc = round(modelXGB.score(Xtrain_bal, ytrain_bal)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
xgb_train_pred = model_selection.cross_val_predict(algorithm2, Xtrain_bal, ytrain_bal, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
xgb_acc_cv = round(metrics.accuracy_score(ytrain_bal, xgb_train_pred)*100, 2)

xgb_cm = confusion_matrix(ytrain_bal, xgb_train_pred)

xgb_time = (time.time()- start_time)















In [27]:
print('Accuracy of the model is: ', xgb_acc)
print('Accuracy of 10-Fold CV is: ', xgb_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= xgb_time))
print()
print('Confusion Matrix: ')
print(xgb_cm)

Accuracy of the model is:  99.82
Accuracy of 10-Fold CV is:  94.33
Running time is:  0:00:26.423745

Confusion Matrix: 
[[5981  284]
 [ 545 7820]]


### Model Results
Now let's see which model has the best cross-validation accuracy.

- **NOTE: We care more about the accuracy of cross validation, as the metrics we get from the model can randomly score higher than usual.**

In [28]:
cv_models = pd.DataFrame({'Model':[' Logistic Regression', 'K-Nearest Neighbours', 'Gaussian Naive Bayes', 
                                'Linear Support Vector Machines (SVC)', 'Stochastic Gradient Descent', 
                                'Decision Tree Classifier', 'Gradient Boost Trees', 'Random Forest', 'XGBoost'],
                      'Score':[log_acc_cv, knn_acc_cv, gnb_acc_cv, svm_acc_cv, sgd_acc_cv, dtc_acc_cv, gbt_acc_cv, rf_acc_cv, xgb_acc_cv]})

print('-----Cross-Validation Accuracy Scores-----')
cv_models.nlargest(9,'Score')

-----Cross-Validation Accuracy Scores-----


Unnamed: 0,Model,Score
8,XGBoost,94.33
7,Random Forest,93.42
6,Gradient Boost Trees,91.78
5,Decision Tree Classifier,89.38
0,Logistic Regression,81.77
4,Stochastic Gradient Descent,81.59
1,K-Nearest Neighbours,81.33
3,Linear Support Vector Machines (SVC),80.96
2,Gaussian Naive Bayes,70.2


### Prediction on the Test dataset

Let's use the model with the highest cross-validation accuracy score to make a prediction on the test dataset.

We want to make predictions on the same columnns our model is trained on.

So we have to select the subset of right columns of the test dateframe, encode them and make a prediciton with our model.

In [29]:
modelXGB

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
## Prediction on Validation Data
y_pred = modelXGB.predict(X_test)

print('Accuracy of the model is: ', round(metrics.accuracy_score(y_test, y_pred)*100,2))
print()
print('Precision of the model is: ', round(metrics.precision_score(y_test, y_pred)*100,2))
print('Recall of the model is: ', round(metrics.recall_score(y_test, y_pred)*100,2))
print('F1 Score of the model is: ', round(metrics.f1_score(y_test, y_pred)*100,2))

Accuracy of the model is:  99.6

Precision of the model is:  99.88
Recall of the model is:  99.64
F1 Score of the model is:  99.76


In [31]:
confusion_matrix(y_test, y_pred)

array([[ 326,    2],
       [   6, 1670]])