In [1]:
import pandas as pd
import datetime
import numpy as np
import gzip
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from xgboost import XGBClassifier


  from numpy.core.umath_tests import inner1d


In [2]:
#Cleaned Machine Learning ready data
MS_Data = {}
MS_Data = pd.read_csv(r'''C:\Users\Charles\Desktop\Data Science\Capstone Project 2\MS_Data.csv''')

In [3]:
r, c = MS_Data.shape
print(r, c)

351775 14


In [4]:
MS_Data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Loan_Amount,Debt_To_Income_Ratio,Employment_Length,Normalized_Risk_Score,Approved_Reject
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.0,1,-3.33862,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.0,11,-0.272743,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.0231,2,0.901663,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,500.0,0.0893,7,-0.210635,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,500.0,0.0658,4,-0.865592,0


In [5]:
# Split the data into a training and test set.
X = MS_Data.drop('Approved_Reject',axis=1).values
y = (MS_Data['Approved_Reject']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

# Baseline
Assumes anything over 40% debt to income ratio to be rejected and anything under to be approved

In [6]:
np.set_printoptions(suppress=True)
X_test[0]

array([   1.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        , 4000.        ,    0.1043    ,    0.        ,
         -0.13723424])

In [7]:
baseline = []
baseline = np.where(X_test[:,10] > 0.4, 0, 1)

In [8]:
print(baseline.shape)
print(X_test.shape)


(70355,)
(70355, 13)


In [9]:
print(classification_report(y_test, baseline))

             precision    recall  f1-score   support

          0       0.98      0.09      0.17     49228
          1       0.32      1.00      0.48     21127

avg / total       0.78      0.36      0.26     70355



In [10]:
print(accuracy_score(baseline, y_test))

0.3624760144979035


# Logistic Regression

In [11]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
LR_predictions = logmodel.predict(X_test)

In [12]:
print(classification_report(y_test,LR_predictions))
#Recall = TP/(TP+FN)
#Precision = TP/(TP+FP)

             precision    recall  f1-score   support

          0       0.85      0.94      0.89     49228
          1       0.81      0.60      0.69     21127

avg / total       0.83      0.84      0.83     70355



In [13]:
print(accuracy_score(LR_predictions, y_test))
#accuracy = (TP+FN)/Total Num of Samples

0.8371828583611683


In [14]:
roc_auc_score(y_test, logmodel.predict_proba(X_test)[:, 1])

0.9110959329335613

In [15]:
#cross_val_score by default does not shuffle when applying stratification sampling. Raw data is sorted by time series and labeling
#so random shuffle is important to generate consistent result
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
LR_cv_score = cross_val_score(logmodel,X, y, cv=cv,scoring='roc_auc')

In [16]:
print("All AUC Scores")
print(LR_cv_score)
print('\n')
print("Mean AUC Score")
print(LR_cv_score.mean())

All AUC Scores
[0.91204962 0.91086218 0.91212494]


Mean AUC Score
0.9116789131891609


In [17]:
print("Confusion Matrix")
print(confusion_matrix(y_test, LR_predictions,labels=[1,0]))
#left label: predicted class
#top label: actual class

Confusion Matrix
[[12695  8432]
 [ 3023 46205]]


# Random Forest

In [18]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)
RF_predictions = rf.predict(X_test)

In [19]:
print(classification_report(y_test,RF_predictions))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97     49228
          1       0.92      0.92      0.92     21127

avg / total       0.95      0.95      0.95     70355



In [20]:
print(accuracy_score(RF_predictions, y_test))

0.9530523772297633


In [21]:

roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

0.990192611888461

In [22]:
#AUC curve is prefered choice over accuracy for binary classification

cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rfc_cv_score = cross_val_score(rf,X, y, cv=cv,scoring='roc_auc')



In [23]:
print("All AUC Scores")
print(rfc_cv_score)
print('\n')
print("Mean AUC Score")
print(rfc_cv_score.mean())

All AUC Scores
[0.98921236 0.989307   0.98900168]


Mean AUC Score
0.9891736776783754


In [24]:
print("Confusion Matrix")
print(confusion_matrix(y_test, RF_predictions,labels=[1,0]))
#left label: predicted class
#top label: actual class

Confusion Matrix
[[19449  1678]
 [ 1625 47603]]


In [25]:
feature_list= list(MS_Data.drop('Approved_Reject',axis=1).columns)
print(feature_list)

['0', '1', '2', '3', '4', '5', '6', '7', '8', 'Loan_Amount', 'Debt_To_Income_Ratio', 'Employment_Length', 'Normalized_Risk_Score']


In [26]:
importances = list(rf.feature_importances_)
round_importance = np.round(importances,5)
print(round_importance)

[0.00082 0.00077 0.00083 0.0004  0.00056 0.00058 0.00052 0.00116 0.00081
 0.08708 0.1245  0.54676 0.23519]


In [27]:
feature_importances = [(feature, round_importance) for feature, round_importance in zip(feature_list, round_importance)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Employment_Length    Importance: 0.54676
Variable: Normalized_Risk_Score Importance: 0.23519
Variable: Debt_To_Income_Ratio Importance: 0.1245
Variable: Loan_Amount          Importance: 0.08708
Variable: 7                    Importance: 0.00116
Variable: 2                    Importance: 0.00083
Variable: 0                    Importance: 0.00082
Variable: 8                    Importance: 0.00081
Variable: 1                    Importance: 0.00077
Variable: 5                    Importance: 0.00058
Variable: 4                    Importance: 0.00056
Variable: 6                    Importance: 0.00052
Variable: 3                    Importance: 0.0004


# XGboost

In [28]:
XG_model = XGBClassifier()
XG_model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [29]:
XG_predictions = XG_model.predict(X_test)

  if diff:


In [30]:
print(classification_report(y_test,XG_predictions))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97     49228
          1       0.94      0.92      0.93     21127

avg / total       0.96      0.96      0.96     70355



In [32]:
print("Confusion Matrix")
print(confusion_matrix(y_test, RF_predictions,labels=[1,0]))

Confusion Matrix
[[19449  1678]
 [ 1625 47603]]


In [37]:
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
XG_cv_score = cross_val_score(XG_model,X, y, cv=cv,scoring='roc_auc')
print("All AUC Scores")
print(XG_cv_score)
print('\n')
print("Mean AUC Score")
print(XG_cv_score.mean())

All AUC Scores
[0.98954234 0.98986778 0.98861844]


Mean AUC Score
0.9893428491953179


In [35]:
XG_importances = list(XG_model.feature_importances_)
XG_round_importance = np.round(XG_importances,5)
XG_feature_importances = [(feature, XG_round_importance) for feature, XG_round_importance in zip(feature_list, XG_round_importance)]
XG_feature_importances = sorted(XG_feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in XG_feature_importances];

Variable: Employment_Length    Importance: 0.8150200247764587
Variable: Normalized_Risk_Score Importance: 0.08987999707460403
Variable: Debt_To_Income_Ratio Importance: 0.062199998646974564
Variable: Loan_Amount          Importance: 0.03290000185370445
Variable: 0                    Importance: 0.0
Variable: 1                    Importance: 0.0
Variable: 2                    Importance: 0.0
Variable: 3                    Importance: 0.0
Variable: 4                    Importance: 0.0
Variable: 5                    Importance: 0.0
Variable: 6                    Importance: 0.0
Variable: 7                    Importance: 0.0
Variable: 8                    Importance: 0.0
