# Capstone Project 2 - Part 2 - Machine Learning

In [1]:
import pandas as pd
import datetime
import numpy as np
import gzip
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from xgboost import XGBClassifier


  from numpy.core.umath_tests import inner1d


In [2]:
#Cleaned Machine Learning ready data from part 1
MS_Data = {}
MS_Data = pd.read_csv(r'''C:\Users\Charles\Desktop\Data Science\Capstone Project 2\MS_Data.csv''')

In [3]:
r, c = MS_Data.shape
print(r, c)

351775 14


In [4]:
MS_Data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Loan_Amount,Debt_To_Income_Ratio,Employment_Length,Normalized_Risk_Score,Approved_Reject
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.306,6,-0.005376,0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,0.0,1,-0.848291,0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,500.0,0.0575,2,-0.723834,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.0431,1,-0.83132,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,0.0904,2,0.384967,0


In [5]:
# Split the data into a training and test set.
X = MS_Data.drop('Approved_Reject',axis=1).values
y = (MS_Data['Approved_Reject']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

# Baseline
Assumes anything over 40% debt to income ratio or Employment_Length < 1year to be rejected and anything else to be approved

In [6]:
#print out data in easy to read format
np.set_printoptions(suppress=True)
X_test[0]

array([   0.       ,    0.       ,    0.       ,    0.       ,
          0.       ,    0.       ,    0.       ,    1.       ,
          0.       , 4000.       ,    0.1747   ,    0.       ,
         -0.0665831])

In [7]:
#implement baseline formula
baseline = []
baseline = np.where(X_test[:,10] > 0.4, 0, np.where(X_test[:,11] == 0, 0, 1))

In [8]:
print(baseline.shape)
print(X_test.shape)


(70355,)
(70355, 13)


In [9]:
print(classification_report(y_test, baseline))

             precision    recall  f1-score   support

          0       0.96      0.90      0.93     49228
          1       0.80      0.91      0.85     21127

avg / total       0.91      0.90      0.90     70355



In [10]:
print(accuracy_score(baseline, y_test))
#very high accuracy

0.9028924738824533


In [None]:
#The results are actually very strong using the baseline method. Lets see if Machine Learning methods can imporve on this 
#existing strong prediction ability. 

In [11]:
print("Confusion Matrix")
print(confusion_matrix(y_test, baseline,labels=[1,0]))

Confusion Matrix
[[19150  1977]
 [ 4855 44373]]


# Logistic Regression

In [12]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
LR_predictions = logmodel.predict(X_test)

In [13]:
print(classification_report(y_test,LR_predictions))
#Recall = TP/(TP+FN)
#Precision = TP/(TP+FP)
#low score across the board. lower than baseline

             precision    recall  f1-score   support

          0       0.85      0.93      0.89     49228
          1       0.80      0.62      0.70     21127

avg / total       0.83      0.84      0.83     70355



In [14]:
print(accuracy_score(LR_predictions, y_test))
#accuracy = (TP+FN)/Total Num of Samples

0.8382630943074408


In [None]:
#the performance is actually significantly worse than baseline where no machine learning was used. 
#the only thing LR performed better in is Negative Recall

In [15]:
roc_auc_score(y_test, logmodel.predict_proba(X_test)[:, 1])
#second method to get AUC score. without crossvalidation

0.9108803248709033

In [16]:
#cross_val_score by default does not shuffle when applying stratification sampling. Raw data is sorted by time series and labeling
#so random shuffle is important to generate consistent result
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
LR_cv_score = cross_val_score(logmodel,X, y, cv=cv,scoring='roc_auc')

In [35]:
print("All AUC Scores")
print(LR_cv_score)
print('\n')
print("AUC Score")
print(LR_cv_score.mean())

All AUC Scores
[0.91160825 0.91364171 0.91200162]


AUC Score
0.9124171935405014


In [None]:
#the AUC score is actually very strong for Logistic Regression but we can do better with other models.

In [18]:
print("Confusion Matrix")
print(confusion_matrix(y_test, LR_predictions,labels=[1,0]))
#left label: predicted class
#top label: actual class

Confusion Matrix
[[13083  8044]
 [ 3335 45893]]


# Random Forest

In [19]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)
RF_predictions = rf.predict(X_test)

In [20]:
print(classification_report(y_test,RF_predictions))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97     49228
          1       0.92      0.92      0.92     21127

avg / total       0.95      0.95      0.95     70355



In [21]:
print(accuracy_score(RF_predictions, y_test))
#much higher accuracy than baseline

0.952270627531803


In [22]:

roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

0.9897547431341185

In [23]:
#AUC curve the is prefered choice over accuracy for binary classification

cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rfc_cv_score = cross_val_score(rf,X, y, cv=cv,scoring='roc_auc')



In [36]:
print("All AUC Scores")
print(rfc_cv_score)
print('\n')
print("AUC Score")
print(rfc_cv_score.mean())

All AUC Scores
[0.98956646 0.98970648 0.98935538]


AUC Score
0.9895427738524619


In [25]:
print("Confusion Matrix")
print(confusion_matrix(y_test, RF_predictions,labels=[1,0]))
#left label: predicted class
#top label: actual class

Confusion Matrix
[[19483  1644]
 [ 1714 47514]]


In [None]:
# This is noticably better in all four quandrants compared to logistic regression.
#True positive and True Negative are both much higher while False Positive and False Negative are lower

In [26]:
#create list of features
feature_list= list(MS_Data.drop('Approved_Reject',axis=1).columns)
print(feature_list)

['0', '1', '2', '3', '4', '5', '6', '7', '8', 'Loan_Amount', 'Debt_To_Income_Ratio', 'Employment_Length', 'Normalized_Risk_Score']


In [27]:
#create importance metric for the features
importances = list(rf.feature_importances_)
round_importance = np.round(importances,5)
print(round_importance)

[0.00086 0.00077 0.00076 0.00037 0.0006  0.00059 0.00055 0.00118 0.0008
 0.08303 0.12298 0.55828 0.22923]


In [28]:
feature_importances = [(feature, round_importance) for feature, round_importance in zip(feature_list, round_importance)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

#as expected employment length takes priority but the next 3 features are also quite important.
#the regions are shown to be not important at all for prediction purpose.

Variable: Employment_Length    Importance: 0.55828
Variable: Normalized_Risk_Score Importance: 0.22923
Variable: Debt_To_Income_Ratio Importance: 0.12298
Variable: Loan_Amount          Importance: 0.08303
Variable: 7                    Importance: 0.00118
Variable: 0                    Importance: 0.00086
Variable: 8                    Importance: 0.0008
Variable: 1                    Importance: 0.00077
Variable: 2                    Importance: 0.00076
Variable: 4                    Importance: 0.0006
Variable: 5                    Importance: 0.00059
Variable: 6                    Importance: 0.00055
Variable: 3                    Importance: 0.00037


# XGboost

In [29]:
XG_model = XGBClassifier()
XG_model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [30]:
XG_predictions = XG_model.predict(X_test)

  if diff:


In [31]:
print(classification_report(y_test,XG_predictions))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97     49228
          1       0.93      0.92      0.92     21127

avg / total       0.95      0.95      0.95     70355



In [None]:
# Results are so close to Random Forest that at two decimal places, Precision, Recall, F1-Score, and Support are identical
#between XGBoost and Random Forest.

In [38]:
print("Confusion Matrix")
print(confusion_matrix(y_test, XG_predictions,labels=[1,0]))
#left label: predicted class
#top label: actual class

Confusion Matrix
[[19405  1722]
 [ 1449 47779]]


In [None]:
#True Positive is actually slightly lower for XGBoost relative to Random Forest but True Negative is better for ]
#XGBoost which is why it has slightly higher AUC score seen below

In [37]:
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
XG_cv_score = cross_val_score(XG_model,X, y, cv=cv,scoring='roc_auc')
print("All AUC Scores")
print(XG_cv_score)
print('\n')
print("AUC Score")
print(XG_cv_score.mean())

All AUC Scores
[0.98907817 0.99026786 0.98940638]


AUC Score
0.9895841352306004


In [None]:
# The AUC score for XGBoost is essentially the same as Random Forest but is significantly better than logistic regression. 

In [34]:
XG_importances = list(XG_model.feature_importances_)
XG_round_importance = np.round(XG_importances,5)
XG_feature_importances = [(feature, XG_round_importance) for feature, XG_round_importance in zip(feature_list, XG_round_importance)]
XG_feature_importances = sorted(XG_feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in XG_feature_importances];

#compared to random forest, the focus on Employment_Length is significantly higher. 
#All other features have been reduced in importance
#The Regions are completely removed under XGBoost. In random forest there is still some importance in prediction. 

Variable: Employment_Length    Importance: 0.817799985408783
Variable: Normalized_Risk_Score Importance: 0.094030000269413
Variable: Debt_To_Income_Ratio Importance: 0.05299000069499016
Variable: Loan_Amount          Importance: 0.035179998725652695
Variable: 0                    Importance: 0.0
Variable: 1                    Importance: 0.0
Variable: 2                    Importance: 0.0
Variable: 3                    Importance: 0.0
Variable: 4                    Importance: 0.0
Variable: 5                    Importance: 0.0
Variable: 6                    Importance: 0.0
Variable: 7                    Importance: 0.0
Variable: 8                    Importance: 0.0


In [None]:
#Given how strong XGBoost and Random Forest performance scores are, there is no need to try out other types of model.
#They are noticably better than the baseline method and suffice for the purpose of this project. 