# Capstone - Loan Status Prediction Machine Learning Model

Goal: Build machine learning model(s) to predict loan status as target: if a loan will be charged-off (0), or stay current/are paid off (1).

(1827125, 145)

##  Import packages

In [3]:
# Install packages
# import sys
# !{sys.executable} -m pip install kneed

# Get path 
import os
from pathlib import Path # get path

# Data wrangling
import pandas as pd
import numpy as np 

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot # qqplot for data normality test

# Statistical testing
import scipy.stats as stats


# Reporting
from pandas_profiling import ProfileReport


# PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.decomposition import PCA

#ML model building

# Logistic regression

from sklearn.linear_model import LogisticRegression #, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score

from dmba import classificationSummary


# Decision Tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from dmba import plotDecisionTree, textDecisionTree # tree visualization












import pydotplus


no display found. Using non-interactive Agg backend


## Load dataset¶

In [2]:
# load dataset
data_path = Path('.').resolve().parents[1] / 'train_data.csv'
train = pd.read_csv(data_path)

  train = pd.read_csv(data_path)


In [3]:
train.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,8.81,317.12,A,A5,...,,,Cash,N,,,,,,
1,,,10000,10000,10000.0,60 months,27.27,306.97,E,E5,...,,,Cash,N,,,,,,
2,,,4800,4800,4800.0,36 months,16.91,170.92,C,C5,...,,,Cash,N,,,,,,
3,,,35000,35000,35000.0,36 months,14.47,1204.23,C,C2,...,,,Cash,N,,,,,,
4,,,16000,16000,15975.0,60 months,10.08,340.59,B,B1,...,,,Cash,N,,,,,,


In [4]:
train.shape

(1827125, 145)

In [5]:
na_tally = train.isna().sum().sort_values(ascending = False)

In [6]:
na_tally[na_tally>train.shape[0]*0.9]

id                                            1827125
url                                           1827125
member_id                                     1827125
orig_projected_additional_accrued_interest    1822116
hardship_length                               1821000
hardship_reason                               1821000
hardship_status                               1821000
deferral_term                                 1821000
hardship_amount                               1821000
hardship_start_date                           1821000
hardship_end_date                             1821000
payment_plan_start_date                       1821000
hardship_dpd                                  1821000
hardship_loan_status                          1821000
hardship_payoff_balance_amount                1821000
hardship_last_payment_amount                  1821000
hardship_type                                 1821000
debt_settlement_flag_date                     1810175
settlement_status           

In [7]:
trn = train.copy().drop(columns=na_tally[na_tally>train.shape[0]*0.9].index)

In [8]:
print(trn['loan_status'].unique())

trn['loan_status_dv'] = [1 if loan_status=='Fully Paid'or loan_status=='Current' else 0 for loan_status in trn['loan_status']]
print("Paid off event rate is: {}".format(trn['loan_status_dv'].mean()))

['Charged Off' 'Current' 'Fully Paid']
Paid off event rate is: 0.9269201614558391


In [9]:
trn.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       ...
       'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim',
       'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit',
       'hardship_flag', 'disbursement_method', 'debt_settlement_flag',
       'loan_status_dv'],
      dtype='object', length=108)

In [10]:
###### optional to run #########
# sample a 3_pct dataset to investigate 

# trn_sample_3pct = trn.sample(n = int(trn.shape[0]*0.03), replace = False, random_state=2) # 91356
# trn_sample_3pct.to_csv('trn_sample_3pct.csv') 

# Logistic Regression

In [5]:
data_path_sample = Path('.').resolve()/ 'trn_sample_3pct.csv'
#trn_sample = pd.read_csv(data_path_sample) use this
trn = pd.read_csv(data_path_sample) # delete after test 

## Select predictors & target - choose from EDA

In [6]:
# Select predictors
predictors = [i for i in trn.columns.tolist() if i not in ['policy_code','loan_status_dv','loan_status']]
outcome = 'loan_status_dv'

## Get train and test datasets from trn 

### Get dummy variables - edit

In [7]:
# Create two data sets for numeric and non-numeric data
trn_num = trn.select_dtypes(exclude=['object'])
trn_obj = trn.select_dtypes(include=['object'])

# One-hot encode the non-numeric columns
trn_onehot = pd.get_dummies(trn_obj)

# Union the one-hot encoded columns to the numeric ones
trn_prep = pd.concat([trn_num, trn_onehot], axis=1)

# Print the columns in the new data set
print(trn_prep.columns)

Index(['Unnamed: 0', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
       'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       ...
       'last_credit_pull_d_Sep-2017', 'last_credit_pull_d_Sep-2018',
       'application_type_Individual', 'application_type_Joint App',
       'hardship_flag_N', 'hardship_flag_Y', 'disbursement_method_Cash',
       'disbursement_method_DirectPay', 'debt_settlement_flag_N',
       'debt_settlement_flag_Y'],
      dtype='object', length=27367)


In [8]:
##### optional when using real testing data set #######
# Split within trn set
trn_prep, trn_prep_test = train_test_split(trn_prep, test_size=0.2)  # may change to smaller dataset

In [9]:
# Real dataset
data_path_test = Path('.').resolve().parents[1] / 'updated_test_data_20200728.csv'
test = pd.read_csv(data_path_test)

In [11]:
X_train = pd.get_dummies(trn_prep[predictors], prefix='', prefix_sep='', drop_first=True)
X_test = pd.get_dummies(trn_prep_test[predictors], prefix='', prefix_sep='', drop_first=True)  # use `test` for real
y_train = trn_prep[outcome] 
y_test = trn_prep_test[outcome]  # updated_test_data_20200728

KeyError: "['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'application_type', 'hardship_flag', 'disbursement_method', 'debt_settlement_flag'] not in index"

## Fit model 

In [None]:
#C=1e42, regularization term, smaller = more regularization
logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear') #solver='lbfgs'
logit_reg.fit(X_train, y_train)

# Intercept and coeff
print('intercept ', logit_reg.intercept_[0])
print('classes', logit_reg.classes_)
pd.DataFrame({'coeff': logit_reg.coef_[0]}, 
             index=X_train.columns)

# Parameters of the model
print(logit_reg.get_params())

## Predicted Values from Logistic Regression
predict(): predict the actual class 

predict_proba(): predict the class probabilities

In [None]:
# Predicted probability
pred_train = pd.DataFrame(logit_reg.predict_proba(X_train),
                    columns=logit_reg.classes_)
pred_test = pd.DataFrame(logit_reg.predict_proba(X_test),
                    columns=logit_reg.classes_)

print(pred_train.describe())
print(pred_test.describe())

In [None]:
# Predicted results
logit_reg.predict(X_test)[:10]


In [None]:
# modify
preds = clf_logistic.predict_proba(X_test)
# Create dataframes of first five predictions, and first five true labels
preds_df = pd.DataFrame(preds[:,1][0:5], columns = ['prob_default'])
true_df = y_test.head(5)

# Concatenate and print the two data frames for comparison
print(pd.concat([true_df.reset_index(drop = True), preds_df], axis = 1))

## Confusion matrix

In [None]:
pred_y = logit_reg.predict(X_test)

In [None]:
confusion_matrix(y_test, pred_y)

In [None]:
classificationSummary(y_test, pred_y, 
                      class_names=logit_reg.classes_)   

## Summary

### Score

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, logit_reg.predict(X_test)).ravel()

print('Accuracy: {}'.format((tn+tp)/(tn+fp+fn+tp)))
print('Precision:   {}'.format(tp/(fp+tp))) 
print('Recall:     {}'.format(tp/(tp+fn)))
print('Specificity: {}'.format(tn/(tn+fp)))

In [None]:
# precision, recall, fbeta_score and support
precision_recall_fscore_support(y_test, logit_reg2.predict(X_test), 
                                labels=[0, 1])

In [None]:
# precision    recall  f1-score   support
target_names = ['paid off', 'default']
true_y = y_test # updated_test_data_20200728  
print(classification_report(true_y, pred_y, target_names=target_names))

In [None]:
# F1 = 2 * Precision* Recall/(Precision + Recall)
print('F1 SCORE for Default:  {}'.format(2*(tp/(fp+tp))*(tp/(tp+fn))/((tp/(fp+tp))+(tp/(tp+fn)))))

### ROC, AUC

In [None]:
# roc_auc_score
print(roc_auc_score(y_test, (logit_reg.predict_proba(X_test)[:, 1])))

In [None]:
# ROC
fpr, tpr, thresholds = roc_curve(y_test, (logit_reg.predict_proba(X_test)[:, 1]), 
                                 pos_label=1)
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})

ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)
ax.plot((1, 0), (0, 1))
ax.set_xlabel('specificity')
ax.set_ylabel('recall')

plt.tight_layout()

In [None]:
# AUC
fpr, tpr, thresholds = roc_curve(y_test, (logit_reg.predict_proba(X_test)[:,0]), 
                                 pos_label=1)
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})

ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)
# ax.plot((1, 0), (0, 1))
ax.set_xlabel('specificity')
ax.set_ylabel('recall')
ax.fill_between(roc_df.specificity, 0, roc_df.recall, alpha=0.3)

plt.tight_layout()

In [None]:

# Create a dataframe for the probabilities of default
preds_df = pd.DataFrame(preds[:,1], columns = ['prob_default'])

# Reassign the values of loan status based on the new threshold
preds_df['loan_status'] = preds_df['prob_default'].apply(lambda x: 1 if x > 0.4 else 0)

# Store the number of loan defaults from the prediction data
num_defaults = preds_df['loan_status'].value_counts()[1]

# Store the default recall from the classification report
default_recall = precision_recall_fscore_support(y_test,preds_df['loan_status'])[1][1]

# Calculate the estimated impact of the new default recall rate
print(num_defaults * avg_loan_amnt * (1 - default_recall))


# Print the classification report
target_names = ['Non-Default', 'Default']
print(classification_report(y_test, preds_df['loan_status'], target_names=target_names))



# threshold to max score
plt.plot(thresh,def_recalls)
plt.plot(thresh,nondef_recalls)
plt.plot(thresh,accs)
plt.xlabel("Probability Threshold")
plt.xticks(ticks)
plt.legend(["Default Recall","Non-default Recall","Model Accuracy"])
plt.show()

# XGBoost Tree 
use gini

good for interpretation, segmentation, and insight

any purity gain after tree split?

In [None]:
predictors = ['borrower_score', 'payment_inc_ratio']
outcome = 'outcome'

X = loan3000[predictors]
y = loan3000[outcome]

loan_tree = DecisionTreeClassifier(random_state=1, criterion='entropy',
                                   min_impurity_decrease=0.003)
loan_tree.fit(X, y)
# check loan_tree.feature_importances_
rf_all = RandomForestClassifier(n_estimators=200, random_state=1)
rf_all.fit(X, y)
rf_all.feature_importances_

plotDecisionTree(loan_tree, feature_names=predictors, class_names=loan_tree.classes_)


# random forest 
rf = RandomForestClassifier(n_estimators=500, random_state=1, 
                            oob_score=True)
rf.fit(X, y)

xgb = XGBClassifier(objective='binary:logistic', subsample=.63)
print(xgb.fit(X, y))

In [None]:
# results
xgb_df = X.copy()
xgb_df['prediction'] = xgb.predict(X)
xgb_df['prob_default'] = xgb.predict_proba(X)[:, 0]
print(xgb_df.head())