In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the lending dataset from the CSV under Resources/ into a DataFrame.
file_path = Path("Resources/lending_data.csv")
lending_data = pd.read_csv(file_path)
# NOTE(review): the rendered table below shows an "Unnamed: 0" header. If that is
# a real CSV column (a saved index) rather than a display artifact, it stays in
# the feature matrix when only "loan_status" is dropped later — confirm.
# Bare last expression so the notebook renders the frame.
lending_data

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [3]:
# Separate the frame into model inputs and the label to predict.
# Features: every column except the "loan_status" label.
X = lending_data.drop(columns=["loan_status"])

# Target: the "loan_status" column on its own.
y = lending_data["loan_status"]

In [4]:
# Display the target Series to check its values, length and dtype.
y

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

In [5]:
# Display the feature frame to confirm "loan_status" is no longer among the columns.
X

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


In [6]:
# Hold out a test set for evaluation.
# `stratify=y` keeps the class proportions of the target the same in both
# partitions, and `random_state=1` makes the split reproducible. The test
# fraction is left at scikit-learn's default.
# `train_test_split` is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [7]:
# Fit a logistic-regression classifier on the training partition.
# `LogisticRegression` is imported in the notebook's top import cell.
logistic_regression_model = LogisticRegression(random_state=1)
# `fit` returns the estimator itself, so `lr_model` refers to the same fitted
# object as `logistic_regression_model`.
lr_model = logistic_regression_model.fit(X_train, y_train)
lr_model

In [8]:
# Predict class labels for the held-out test features with the fitted model.
testing_predictions = lr_model.predict(X_test)

In [9]:
# Confusion matrix on the held-out test set. In scikit-learn's convention rows
# are the true classes and columns are the predicted classes, ordered by label.
# `confusion_matrix` is already imported in the first cell, so it is not
# re-imported here.
test_matrix = confusion_matrix(y_test, testing_predictions)
test_matrix

array([[18679,    80],
       [   67,   558]], dtype=int64)

In [13]:
# Overall accuracy on the test set.
# The classes are heavily imbalanced (compare the per-class support in the
# classification report), so read this number together with the per-class
# precision and recall rather than on its own.
# `accuracy_score` is imported in the notebook's top import cell.
accuracy_score(y_test, testing_predictions)

0.9924164259182832

In [10]:
# Per-class precision, recall and F1 on the test set.
# `classification_report` is already imported in the first cell, so it is not
# re-imported here.
# target_names are applied to the labels in sorted order, so label 0 is shown as
# "Healthy Risk" and label 1 as "Unhealthy Risk".
label_names = ["Healthy Risk", "Unhealthy Risk"]
print(classification_report(y_test, testing_predictions, target_names=label_names))


                precision    recall  f1-score   support

  Healthy Risk       1.00      1.00      1.00     18759
Unhealthy Risk       0.87      0.89      0.88       625

      accuracy                           0.99     19384
     macro avg       0.94      0.94      0.94     19384
  weighted avg       0.99      0.99      0.99     19384



In [11]:
# Plain-text interpretation of the classification report.
# Recall for the "Unhealthy Risk" class is TP / (TP + FN), so a recall of 89%
# means 11% of the *actual* unhealthy loans were missed (false negatives). It is
# not a statement about false positives, nor about the share of all loans.
print("While our model overall does a good job at predicting both healthy and unhealthy loan risks, one could argue that a recall of 89% for predicting unhealthy loan risks is not high enough.")
print("\n")
print("A recall of 89% indicates that 11% of the actual unhealthy loans were missed by the model and classified as healthy (false negatives), which means that roughly one in nine high-risk loans would not be flagged.")
print("\n")
print("Depending on the size of the loans, this could lead to a significant loss of funds for the bank.")

While our model overall does a good job at predicting both healthy and unhealthy loan risks, one could argue that a recall of 89% for predicting unhealthy loan risks is not high enough.


A recall of 89% indicates that 11% of the actual unhealthy loans were missed by the model and classified as healthy (false negatives), which means that roughly one in nine high-risk loans would not be flagged.


Depending on the size of the loans, this could lead to a significant loss of funds for the bank.
