# *Summary*

## This notebook is an extension of the credit risk modelling from credit log modelling.

### In this notebook, the credit risk modelled with gradient boosted trees for predicting loan defaults with probability. This can be used to automate approving and declining loan applcations more accurately.

### A recall of 73% and 93.3% accuracy level was achieved in predicting the loan defaults on 32,576 loans and 12 benchmarks with a probability threshold of 60%. With this model, the default rate would decrease from 21.8% to 6.7%, resulting in minimized risk for both the lender and applicant.

   
### The top 5 most important features of determining a loan default depends on the applicant's income, age, and employment length and the loan's interest rate and amount.


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import model_selection,linear_model, metrics
import xgboost as xgb
from sklearn.metrics import classification_report

In [2]:
cr_data = pd.read_csv("credit_risk_dataset.csv")

emp_len_null = cr_data[cr_data['person_emp_length'].isnull()].index
int_rate_null = cr_data[cr_data['loan_int_rate'].isnull()].index

cr_data['person_emp_length'].fillna((cr_data['person_emp_length'].median()), inplace=True)
cr_data['loan_int_rate'].fillna((cr_data['loan_int_rate'].median()), inplace = True)

cr_data = cr_data.rename(columns = {"cb_person_default_on_file":"default_hist", "cb_person_cred_hist_length": "cr_hist_len"})

cr_clean1 = cr_data[cr_data['person_age']<=100]

# one hot encoding categorical variables
num_col = cr_clean1.select_dtypes(exclude = 'object')
char_col = cr_clean1.select_dtypes(include = 'object')

encoded_char_col = pd.get_dummies(char_col)

cr_clean2 = pd.concat([num_col, encoded_char_col], axis=1)
cr_clean2

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cr_hist_len,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,default_hist_N,default_hist_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,False,False,...,False,False,False,False,True,False,False,False,False,True
1,21,9600,5.0,1000,11.14,0,0.10,2,False,False,...,False,False,True,False,False,False,False,False,True,False
2,25,9600,1.0,5500,12.87,1,0.57,3,True,False,...,False,False,False,True,False,False,False,False,True,False
3,23,65500,4.0,35000,15.23,1,0.53,2,False,False,...,False,False,False,True,False,False,False,False,True,False
4,24,54400,8.0,35000,14.27,1,0.55,4,False,False,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,True,False,...,False,False,False,True,False,False,False,False,True,False
32577,54,120000,4.0,17625,7.49,0,0.15,19,True,False,...,False,True,False,False,False,False,False,False,True,False
32578,65,76000,3.0,35000,10.99,1,0.46,28,False,False,...,False,False,True,False,False,False,False,False,True,False
32579,56,150000,5.0,15000,11.48,0,0.10,26,True,False,...,False,False,True,False,False,False,False,False,True,False


In [3]:
# Split Train and Test Sets
Y = cr_clean2['loan_status']
X = cr_clean2.drop('loan_status',axis=1)
 


x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, random_state=2020, test_size=.30)

#Start of gradient boosted tree

xgb_model = xgb.XGBClassifier() # initialize tree

xgb_model.fit(x_train, np.ravel(y_train)) # train tree

predict_xgb = xgb_model.predict_proba(x_test) # 1st col = pred val, 2nd col = pred prob

predict_xgb_prob = pd.DataFrame(predict_xgb[:,1],columns = ['Default Probability'])

pd.concat([predict_xgb_prob, y_test.reset_index(drop=True)],axis=1)


Unnamed: 0,Default Probability,loan_status
0,0.033276,0
1,0.000701,0
2,0.011722,0
3,0.002813,0
4,0.107398,0
...,...,...
9768,0.045192,0
9769,0.015024,0
9770,0.999846,1
9771,0.044685,0


In [4]:
X

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cr_hist_len,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,default_hist_N,default_hist_Y
0,22,59000,123.0,35000,16.02,0.59,3,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,21,9600,5.0,1000,11.14,0.10,2,False,False,True,...,False,False,True,False,False,False,False,False,True,False
2,25,9600,1.0,5500,12.87,0.57,3,True,False,False,...,False,False,False,True,False,False,False,False,True,False
3,23,65500,4.0,35000,15.23,0.53,2,False,False,False,...,False,False,False,True,False,False,False,False,True,False
4,24,54400,8.0,35000,14.27,0.55,4,False,False,False,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0.11,30,True,False,False,...,False,False,False,True,False,False,False,False,True,False
32577,54,120000,4.0,17625,7.49,0.15,19,True,False,False,...,False,True,False,False,False,False,False,False,True,False
32578,65,76000,3.0,35000,10.99,0.46,28,False,False,False,...,False,False,True,False,False,False,False,False,True,False
32579,56,150000,5.0,15000,11.48,0.10,26,True,False,False,...,False,False,True,False,False,False,False,False,True,False


In [5]:
print("There are {} features in cr_clean2".format(cr_clean2.shape[1]))
round(xgb_model.score(x_test,y_test),3)

There are 27 features in cr_clean2


0.932

## Evaluation Metrics

In [6]:
y_predict_xgb = xgb_model.predict(x_test)

print(classification_report(y_test, y_predict_xgb))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7684
           1       0.94      0.73      0.82      2089

    accuracy                           0.93      9773
   macro avg       0.94      0.86      0.89      9773
weighted avg       0.93      0.93      0.93      9773



SyntaxError: invalid syntax (226626043.py, line 1)

In [None]:
# display feature and their importance
feat_imp = xgb_model.get_booster().get_score(importance_type='weight')

feat_imp

In [None]:
# columns not used
set(x_train.columns) - set(feat_imp)

The only feture not used was whether an applicant had a history of default.

In [None]:
# display top 5 most import features
sorted(feat_imp.items(), key=lambda kv: kv[1],reverse=True)[0:5]

The top 5 most important features of determining a loan default depends on the applicant's income, age, and employment length and the loan's interest rate and amount.

In [None]:
import operator as op
sorted_feat_imp = dict(sorted(feat_imp.items(), key=op.itemgetter(1),reverse=True))
sorted_feat_imp

In [None]:
xgb.plot_importance(xgb_model,importance_type='weight')

In [None]:

remove_feats = []

for key in feat_imp.keys():
    if feat_imp[key] < 40:    # tried 
        remove_feats.append(key)
        
imp_data = X
for key in remove_feats:
    imp_data = imp_data.drop(key,axis=1)

In [None]:
imp_data.shape

The data has been reduced to 15 important features

In [None]:
display_most_imp_feat ={}
for key in list(imp_data.columns):
    if key  in feat_imp.keys():
        display_most_imp_feat[key] = feat_imp[key]

sorted(display_most_imp_feat.items(), key=lambda kv: kv[1],reverse=True)

In [None]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(imp_data, Y, random_state=2020, test_size=.30)

#Start of gradient boosted tree

xgb_model = xgb.XGBClassifier() # initialize tree

xgb_model.fit(x_train, np.ravel(y_train)) # train tree

predict_xgb = xgb_model.predict_proba(x_test) # 1st col = pred val, 2nd col = pred prob

predict_xgb_prob = pd.DataFrame(predict_xgb[:,1],columns = ['Default Probability'])

pd.concat([predict_xgb_prob, y_test.reset_index(drop=True)],axis=1)


In [None]:
from sklearn.metrics import classification_report

In [None]:
y_preds =  xgb_model.predict(x_test)


print(classification_report(y_test, y_preds))


In [None]:
round(xgb_model.score(x_test,y_test),3)

In [None]:
thresh = np.linspace(0,1,21)
thresh

In [None]:
def find_opt_thresh(predict,thr =thresh, y_true = y_test):
    data = predict
    
    def_recalls = []
    nondef_recalls = []
    accs =[]

    
    for threshold in thr:
        # predicted values for each threshold
        data['loan_status'] = data['Default Probability'].apply(lambda x: 1 if x > threshold else 0 )
        
        accs.append(metrics.accuracy_score(y_true, data['loan_status']))
        
        stats = metrics.precision_recall_fscore_support(y_true, data['loan_status'])
        
        def_recalls.append(stats[1][1])
        nondef_recalls.append(stats[1][0])
        
        
    return accs, def_recalls, nondef_recalls

accs, def_recalls, nondef_recalls= find_opt_thresh(predict_xgb_prob)


In [None]:
plt.plot(thresh,def_recalls)
plt.plot(thresh,nondef_recalls)
plt.plot(thresh,accs)
plt.xlabel("Probability Threshold")
plt.xticks(thresh, rotation = 'vertical')
plt.legend(["Default Recall","Non-default Recall","Model Accuracy"])
#plt.axvline(x=0.45, color='pink')
plt.show()

In [None]:
optim_threshold = accs.index(max(accs))

print(round(accs[optim_threshold],3))

thresh[optim_threshold]

In [None]:
cr_clean2.shape[1] - imp_data.shape[1]
# num of total features (including dummies) subtract important features with F score >= 40

In [None]:
# original loan defaults by previous default history
default_hist_status_tab= pd.crosstab(cr_clean1['default_hist'], cr_clean1['loan_status'])
default_hist_status_tab

In [None]:
round(default_hist_status_tab.iloc[:,1].sum() /cr_data.shape[0],3)

**The accuracy of the model was 93.3% and the recall was 0.73 with 15 important features and 12 feature removed once the data had been cleaned and encoded.**

**The gradient boosted tree performed 7.3% better than the logistics regression and had a higher probability threshold for default by 15%.**

**The initial loan default rate was 21.8%. With the `xgb_model`, the loan default rate should decrease to 6.7%. With this 15.1% improvement in default rate the lenders and applicants are even more so protected from risk.**