In [8]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

## Step 1:

In [9]:
# Load the lending data from the project's Resources folder.
# The path is relative so the notebook is not tied to one user's home directory;
# it assumes the kernel's working directory is the folder that contains Resources/.
df_lending = pd.read_csv(Path("Resources/lending_data.csv"))

# Preview the first rows to confirm the file was parsed as expected.
df_lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


## Step 2 :

In [10]:
# Features: every column except the label.
X = df_lending.drop("loan_status", axis=1)

# Target: the loan_status label column.
y = df_lending.loc[:, "loan_status"]

In [11]:
# Peek at the first five labels.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [13]:
# Preview the feature frame; loan_status should no longer be among the columns.
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


## Step 3 :

In [15]:
# Count the rows per loan_status label to check how balanced the classes are.
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

## Step 4:

In [16]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split is used; random_state=1 makes the shuffle repeatable.
# NOTE(review): the split is not stratified even though loan_status is heavily
# imbalanced — consider stratify=y (this would change all downstream numbers).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [18]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Standardise the features. The scaler is fitted on the training split only,
# then the same fitted scaler is applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Create a Logistic Regression Model with the Original Data

## Step 1 :

In [23]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier, trained on the scaled (not resampled) training split.
lr_model = LogisticRegression(random_state=1)

# Fit the model; as the cell's last expression, the call also displays the estimator.
lr_model.fit(X=X_train_scaled, y=y_train)

LogisticRegression(random_state=1)

# Step 2 :

In [24]:
# Predict loan_status for the scaled test features with the baseline model.
y_pred = lr_model.predict(X_test_scaled)

# Step 3 :

In [38]:
# Balanced accuracy of the baseline model on the test set
# (the average of the per-class recall values).
balanced_accuracy_score(y_test, y_pred)

0.9889115309798473

In [39]:
# Confusion matrix for the baseline model
# (rows are the true labels, columns are the predicted labels).
confusion_matrix(y_test, y_pred)

array([[18652,   113],
       [   10,   609]])

In [40]:
# Classification report for the baseline model (imbalanced-learn variant).
# The report is returned as a string, so it is printed to keep its table layout.
target_names = ["0", "1"]
print(classification_report_imbalanced(y_test, y_pred, target_names=target_names))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.98      1.00      0.99      0.98     18765
          1       0.84      0.98      0.99      0.91      0.99      0.98       619

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



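# Answer: The model does a great job of predicting healthy loans (label 0), with a precision of 1.00 and a recall of 0.99. It is less reliable for high-risk loans (label 1): recall is high at 0.98, but precision is only 0.84, so roughly 16% of the loans it flags as high-risk are actually healthy.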

# Predict a Logistic Regression Model with Resampled Training Data

# Step 1:

In [47]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampler with a fixed seed so the resampling is repeatable.
ros = RandomOverSampler(random_state=1)

In [48]:
# Resample the scaled training split only; the test split is left untouched.
X_oversampled, y_oversampled = ros.fit_resample(X_train_scaled, y_train)

In [49]:
# Count the resampled training labels to confirm both classes are now the same size.
y_oversampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

# Step 2 :

In [52]:
# Use the same estimator settings as the baseline model (random_state=1) so the
# two models differ only in the data they were trained on.
model_oversampled = LogisticRegression(random_state=1)

# Train on the randomly oversampled training set.
model_oversampled.fit(X_oversampled, y_oversampled)

LogisticRegression()

In [53]:
# Predict loan_status for the scaled test features with the model trained on oversampled data.
y_pred_oversampled = model_oversampled.predict(X_test_scaled)

# Step 3: 

In [54]:
# Balanced accuracy of the oversampled-data model on the original test set.
balanced_accuracy_score(y_test, y_pred_oversampled)

0.9934383134311076

In [55]:
# Confusion matrix for the oversampled-data model
# (rows are the true labels, columns are the predicted labels).
confusion_matrix(y_test, y_pred_oversampled)

array([[18640,   125],
       [    4,   615]])

In [56]:
# Classification report for the model trained on oversampled data (imbalanced-learn variant).
# The report is returned as a string, so it is printed to keep its table layout.
target_names = ["0", "1"]
print(classification_report_imbalanced(y_test, y_pred_oversampled, target_names=target_names))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.83      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



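# Answer: The logistic regression model trained on the oversampled data performs very similarly to the model trained on the original data. Both models have near-perfect precision on healthy loans. For high-risk loans, oversampling slightly improves recall (0.98 → 0.99) and balanced accuracy (0.989 → 0.993), while precision dips slightly (0.84 → 0.83): the model misses fewer high-risk loans at the cost of a few more healthy loans being flagged.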