In [None]:
! pip3 install matplotlib plotly scikit-learn xgboost kaleido

In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import sklearn
from sklearn import model_selection, metrics, neighbors, linear_model
import xgboost
from xgboost import *
import os
from sklearn import calibration
import kaleido

In [None]:
# Create the output directory for exported figures.
# exist_ok=True makes this a no-op when the directory is already there, so the cell can be re-run.
os.makedirs("images", exist_ok=True)

# Load the data from the CSV file into a dataframe.
data = pd.read_csv("credit_risk_dataset.csv")

# Count the null/NaN values in each column.
print(data.isnull().sum())

# Drop every row containing a null value, as they represent a small percentage of the 32,000 borrowers.
data = data.dropna(axis=0)

# Confirm that the null values were removed (every count should now be 0).
print(data.isnull().sum())

# Summary statistics of the numeric columns; implausible min/max values point to outliers.
print(data.describe())

# Outliers could negatively skew the models, so they are inspected and then filtered out.
# Pairwise scatter plots of the numeric columns are one way to spot them.


def _outlier_scatter_matrix(frame):
    """Return a scatter matrix of the numeric borrower/loan columns of ``frame``, coloured by loan_status."""
    readable_labels = {column: column.replace('_', ' ') for column in frame.columns}
    return px.scatter_matrix(
        frame,
        dimensions=["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate"],
        labels=readable_labels,
        height=900,
        color="loan_status",
        color_continuous_scale=px.colors.diverging.Tealrose,
    )


# Scatter matrix of the data before any outlier filtering.
fig = _outlier_scatter_matrix(data)

# fig.show()


# Keep only the rows whose age, employment length and income are all within the chosen upper bounds.
within_bounds = (
    (data["person_age"] <= 100)
    & (data["person_emp_length"] <= 100)
    & (data["person_income"] <= 4000000)
)
data = data[within_bounds]

# Same scatter matrix, rebuilt from the filtered data.
fig = _outlier_scatter_matrix(data)

# fig.show()


"""
Given the nature of our dataset, we’d expect that we’re dealing with an imbalanced classification problem, 
meaning that we have considerably more non-default cases than default cases. 
Using the code below, we confirm that this is indeed the case with 78.4% of our dataset containing non-default cases.
"""

# Fraction of rows whose loan_status is 0 (the non-default cases)
is_non_default = data["loan_status"] == 0
data_0 = data.loc[is_non_default, "loan_status"].count() / data["loan_status"].count()
print(data_0)

"""
With this in mind, we’ll now further explore how loan status is related to other variables in our dataset.
"""
# Box plot of loan_percent_income for each loan grade, split by loan status
box_labels = {column: column.replace('_', ' ') for column in data.columns}
grade_order = {"loan_grade": ["A", "B", "C", "D", "E", "F"]}
fig = px.box(
    data,
    x="loan_grade",
    y="loan_percent_income",
    color="loan_status",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    labels=box_labels,
    category_orders=grade_order,
)
# Horizontal legend, anchored by its bottom-right corner slightly above the plotting area.
fig.update_layout(
    legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
)

# fig.show()

# fig.write_image("images/Box.png")
"""
Two things quickly stand out when we look at this box plot. Across all loan grades, borrowers who don’t default have a lower median loan-to-income ratio, which doesn’t come as a surprise. We can also see that no borrowers with loan grade G were able to repay their loan!
"""

"""
Using a parallel category diagram, we can understand how the different categorical variables
in our dataset are related to each other, and we can map out these relationships
on the basis of loan status.
"""

"""
#Parallel category diagram
fig = px.parallel_categories(data, color_continuous_scale=px.colors.sequential.RdBu, color="Loan_Status",
dimensions=['Home_Status', 'Loan_Intent', "loan_Grade", 'Historical_Default'], labels={col:col.replace('_', ' ') for col in data.columns})
fig.show()
"""
# Parallel category diagram of the categorical columns, coloured by loan status
parallel_dimensions = [
    'person_home_ownership',
    'loan_intent',
    "loan_grade",
    'cb_person_default_on_file',
]
fig = px.parallel_categories(
    data,
    dimensions=parallel_dimensions,
    color="loan_status",
    color_continuous_scale=px.colors.sequential.RdBu,
    labels={name: name.replace('_', ' ') for name in data.columns},
)
# fig.write_image("images/Parallel")
# fig.show()
"""
Main takeaways from the above diagram:
Our dataset is primarily composed of borrowers who have not defaulted on a loan before;
  Loan grades “A” and “B” are the most common grades while “F” and “G” are the least common;
Home renters defaulted more often on their loans than those with a mortgage, whereas homeowners defaulted the least;
Borrowers took out a loan for home improvement the least and for education the most. Also, defaults were more common for loans that were taken up for covering medical expenses and debt consolidation.
"""

"""
Dealing with categorical variables and their labeling with dummy variables
"""
# Replace each categorical column by one indicator (dummy) column per category.
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
df = pd.get_dummies(data=data, columns=categorical_columns)
print(df)

# Separate the target (loan_status) from the features, then hold out 20% of the rows for testing.
Y = df['loan_status']
X = df.drop(columns='loan_status')
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0
)


def model_assess(model, name="Default"):
    """Fit ``model`` on the training split and print its classification report for the test split.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The model is fitted in place, so the caller can keep using it afterwards.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label printed above the report.

    Returns:
        The classification report text, so it can also be written to a file.
    """
    model.fit(x_train, y_train)
    # Predict once and reuse the result; the test set was previously scored twice,
    # plus an unused predict_proba call.
    preds = model.predict(x_test)
    report = metrics.classification_report(y_test, preds)
    print('              ', name, '\n', report)
    return report


# The three classifiers that are compared in the rest of the notebook.
knn = neighbors.KNeighborsClassifier(n_neighbors=151)
lg = linear_model.LogisticRegression(random_state=0)
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)

# Fit each classifier and print its classification report, in the order KNN, LG, XGBoost.
for classifier, report_name in ((knn, "KNN"), (lg, "LG"), (xgb, 'XGBoost')):
    model_assess(classifier, name=report_name)

'''
f = open("classification_report.txt","w")
f.write(model_assess(knn,name="KNN"))
f.write(model_assess(lg,name="LG"))
f.write(model_assess(xgb, 'XGBoost'))
'''

# ROC curves and AUC scores of the three fitted classifiers on the test split
fig = plt.figure(figsize=(14, 10))
# Dashed red diagonal drawn as a reference line.
plt.plot([0, 1], [0, 1], 'r--')

# KNN
preds_proba_knn = knn.predict_proba(x_test)
probsknn = preds_proba_knn[:, 1]  # second probability column, used as the score for ROC and AUC
fpr, tpr, thresh = metrics.roc_curve(y_test, probsknn)
aucknn = metrics.roc_auc_score(y_test, probsknn)
plt.plot(fpr, tpr, label=f'KNN,AUC={str(round(aucknn, 3))}')

# Logistic Regression
preds_proba_lg = lg.predict_proba(x_test)
probslg = preds_proba_lg[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probslg)
auclg = metrics.roc_auc_score(y_test, probslg)
plt.plot(fpr, tpr, label=f'Logistic Regression, AUC = {str(round(auclg, 3))}')

# XGBOOST
preds_proba_xgb = xgb.predict_proba(x_test)
probsxgb = preds_proba_xgb[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsxgb)
# Score XGBoost with its own probabilities; the logistic-regression scores were used here by mistake,
# which made the legend show the wrong AUC.
aucxgb = metrics.roc_auc_score(y_test, probsxgb)
plt.plot(fpr, tpr, label=f'XGBoost, AUC = {str(round(aucxgb, 3))}')

plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
# The title size is read from rcParams when the title is set, so it has to be configured first.
plt.rcParams['axes.titlesize'] = 18
plt.title("ROC curve")
plt.legend()
plt.savefig("images/ROC")
plt.show()

# Reliability plot and Brier Score
# Reuses the test-set probabilities (preds_proba_knn / preds_proba_lg / preds_proba_xgb)
# computed for the ROC curves.
fig = plt.figure(figsize=(14, 10))
# Black diagonal drawn as the reference line (predicted probability == observed frequency).
plt.plot([0, 1], [0, 1], color="black")

# calibration_curve returns (prob_true, prob_pred): the *_y values are the observed fraction of
# positives per bin and the *_x values the mean predicted probability per bin.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.1 and removed in 1.3,
# and predict_proba output already lies in [0, 1].

# KNN
knn_y, knn_x = calibration.calibration_curve(y_test, preds_proba_knn[:, 1], n_bins=10)
lost_knn = metrics.brier_score_loss(y_test, preds_proba_knn[:, 1])
plt.plot(knn_x, knn_y, marker='o', label=f'KNN, Brier Score = {str(round(lost_knn, 3))}')

# Logistic Regression
lg_y, lg_x = calibration.calibration_curve(y_test, preds_proba_lg[:, 1], n_bins=10)
loss_lg = metrics.brier_score_loss(y_test, preds_proba_lg[:, 1])
plt.plot(lg_x, lg_y, marker='o', label=f'LG, Brier Score = {str(round(loss_lg, 3))}')

# XGBOOST
xgb_y, xgb_x = calibration.calibration_curve(y_test, preds_proba_xgb[:, 1], n_bins=10)
loss_xgb = metrics.brier_score_loss(y_test, preds_proba_xgb[:, 1])
plt.plot(xgb_x, xgb_y, marker="o", label=f'XGBoost, Brier Score ={str(round(loss_xgb, 3))}')

plt.ylabel("Actual Probability")
plt.xlabel("Predicted Probability")
# The title size is read from rcParams when the title is set, so it has to be configured first.
plt.rcParams['axes.titlesize'] = 18
plt.title("Reliability Plot")
plt.legend()
# NOTE(review): the file name is misspelled ("Realiability"); kept as-is so existing references
# to the exported image keep working.
plt.savefig("images/RealiabilityPlot")
plt.show()

# Feature importance plots for the fitted XGBoost model, one panel per importance type
fig, (ax1, ax2) = plt.subplots(figsize=(15, 17), ncols=1, nrows=2)
plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9, wspace=0, hspace=0.5)

# Top panel: importance measured by gain.
plot_importance(xgb, importance_type='gain', ax=ax1)
ax1.set_title('Feature Importance by Information Gain', fontsize=18)
ax1.set_xlabel('Gain')

# Bottom panel: importance measured by weight. The two-row layout already reserved this axes,
# but nothing was drawn on it, which left an empty panel in the saved image.
plot_importance(xgb, importance_type='weight', ax=ax2)
ax2.set_title('Feature Importance by Weight', fontsize=18)
ax2.set_xlabel('Weight')

plt.savefig("images/FeatureImportance")
plt.show()
