In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [None]:
### SPLIT THE DATA INTO TRAINING AND TESTING SETS

In [None]:
## STEP 1: READ THE DATA FROM RESOURCES INTO A PANDAS DATAFRAME

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# A relative path keeps the notebook runnable on machines other than the
# author's; it assumes the kernel's working directory is the folder that
# contains Resources/ (NOTE(review): confirm against the repository layout).
lending_data = Path("Resources") / "lending_data.csv"
lending_df = pd.read_csv(lending_data)

# Review the first rows of the DataFrame
display(lending_df.head())
# Summary statistics for the numeric columns
lending_df.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.032243
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.176646
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0


In [None]:
## STEP 2: CREATE LABELS FROM THE LOAN_STATUS COLUMN AND CREATE FEATURES DATAFRAME FROM REMAINING COLUMNS

In [3]:
# Split the dataset into the target (labels) and the predictors (features)

# Labels: the loan_status column on its own
y = lending_df.loc[:, 'loan_status']

# Features: every column except loan_status
X = lending_df.drop('loan_status', axis='columns')

In [4]:
# Preview the first five entries of the labels Series
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [5]:
# Preview the first five rows of the features DataFrame
X.iloc[:5]

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [None]:
## STEP 3: CHECK THE BALANCE OF THE LABELS VARIABLE USING THE VALUE_COUNTS FUNCTION.

In [6]:
# Tally how many loans fall into each class to gauge label balance
label_counts = y.value_counts()
label_counts

0    75036
1     2500
Name: loan_status, dtype: int64

In [None]:
## STEP 4: SPLIT THE DATA INTO TRAINING AND TESTING DATASETS USING TRAIN_TEST_SPLIT

In [7]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Split features and labels into training and testing sets.
# random_state=1 makes the split repeatable; stratify=y splits in a
# stratified fashion using the labels as the class reference.
split_options = {"stratify": y, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
### CREATE A LOGISTIC REGRESSION MODEL WITH THE ORIGINAL DATA

In [None]:
## STEP 1: FIT A LOGISTIC REGRESSION MODEL USING THE TRAINING DATA

In [8]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the classifier: lbfgs solver, at most 200 iterations, random_state of 1
lr_settings = dict(solver='lbfgs', max_iter=200, random_state=1)
classifier = LogisticRegression(**lr_settings)

# Train on the original training data; fit() returns the estimator itself,
# so lr_model and classifier refer to the same fitted model
lr_model = classifier.fit(X_train, y_train)

In [None]:
## STEP 2: SAVE THE PREDICTIONS ON THE TESTING DATA BY USING THE TESTING FEATURE DATA AND THE FITTED MODEL

In [9]:
# Predict labels for the testing features with the fitted model
predictions = classifier.predict(X_test)

# Place the predictions next to the actual labels for a side-by-side look
comparison_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison_df

Unnamed: 0,Prediction,Actual
36831,0,0
75818,0,1
36563,0,0
13237,0,0
43292,0,0
...,...,...
38069,0,0
36892,0,0
5035,0,0
40821,0,0


In [None]:
## STEP 3: EVALUATE THE MODEL'S PERFORMANCE BY DOING THE FOLLOWING:

In [None]:
## Calculate the balanced accuracy score of the model.
## Generate a confusion matrix
## Print the classification report

In [10]:
# Balanced accuracy of the model's predictions on the testing labels
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.9442676901753825

In [11]:
# Build the confusion matrix, then label its rows (actual) and columns (predicted)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
confusion_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)
display(confusion_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18679,80
Actual 1,67,558


In [12]:
# Show per-class precision, recall and F1 for the model
target_names = ["healthy loan", "high-risk loan"]
report_text = classification_report(y_test, predictions, target_names=target_names)
print("Classification Report")
print(report_text)

Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     18759
high-risk loan       0.87      0.89      0.88       625

      accuracy                           0.99     19384
     macro avg       0.94      0.94      0.94     19384
  weighted avg       0.99      0.99      0.99     19384



In [None]:
## STEP 4: 
# QUESTION: HOW WELL DOES THE LOGISTIC REGRESSION MODEL PREDICT BOTH THE 0 AND 1 LABELS?
# ANSWER: 
HEALTHY LOAN PRECISION IS 1.00, MEANING THE MODEL PREDICTS HEALTHY LOANS EXCEPTIONALLY WELL. HIGH-RISK LOAN PRECISION IS ALSO HIGH AT 0.87 (WITH RECALL AT 0.89), BUT THERE IS A FAIR DIFFERENCE BETWEEN THE TWO CLASSES.
THE F1 SCORE IS THE HARMONIC MEAN OF PRECISION AND RECALL, AND IT IS VERY HIGH FOR BOTH CLASSES. FOR HEALTHY LOANS, F1 ROUNDS TO A PERFECT 1.00. FOR HIGH-RISK LOANS, THE F1 SCORE OF 0.88 IS SLIGHTLY HIGHER THAN THE 0.87 PRECISION.
OVERALL, THIS MODEL PERFORMS WELL.

In [14]:
### PREDICT A LOGISTIC REGRESSION MODEL WITH RESAMPLED TRAINING DATA

In [None]:
## STEP 1: USE THE RANDOMOVERSAMPLER MODULE FROM THE IMBALANCED-LEARN LIBRARY TO RESAMPLE THE DATA

In [15]:
# Import RandomOverSampler from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Create the oversampler with a random_state of 1
model_ros = RandomOverSampler(random_state=1)

# Resample the original training data (the testing data is left untouched)
resampled = model_ros.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled

In [16]:
# Class counts of the resampled labels, a divider line, then the
# frequency of each distinct row in the resampled features
resampled_label_counts = y_resampled.value_counts()
display(resampled_label_counts)

divider = '-'*115 + '\n'
print(divider)

resampled_row_counts = X_resampled.value_counts()
display(resampled_row_counts)

0    56277
1    56277
Name: loan_status, dtype: int64

-------------------------------------------------------------------------------------------------------------------



loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  derogatory_marks  total_debt
19900.0    11.596         89700            0.665552        13               2                 59700         185
19100.0    11.246         86400            0.652778        12               2                 56400         163
19300.0    11.316         87100            0.655568        12               2                 57100         162
18500.0    10.976         83900            0.642431        12               2                 53900         162
19300.0    11.310         87000            0.655172        12               2                 57000         156
                                                                                                           ... 
17800.0    10.696         81300            0.630996        11               2                 51300           1
12300.0    8.371          59400            0.494949        6                1                 29400           1

In [None]:
## STEP 2: USE THE LOGISTICREGRESSION CLASSIFIER AND THE RESAMPLED DATA TO FIT THE MODEL AND MAKE PREDICTIONS

In [17]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model.
# max_iter=200 matches the model fitted on the original training data, so the
# two models differ only in their training data, not in solver settings.

classifier_ros = LogisticRegression(solver='lbfgs',
                                    max_iter=200,
                                    random_state=1)

# Fit the model using the resampled training data

classifier_ros.fit(X_resampled, y_resampled)

# Make a prediction using the (un-resampled) testing data

predictions_ros = classifier_ros.predict(X_test)
pd.DataFrame({"Prediction": predictions_ros, "Actual": y_test})

Unnamed: 0,Prediction,Actual
36831,0,0
75818,1,1
36563,0,0
13237,0,0
43292,0,0
...,...,...
38069,0,0
36892,0,0
5035,0,0
40821,0,0


In [None]:
## STEP 3: EVALUATE THE MODEL'S PERFORMANCE BY DOING THE FOLLOWING:
## Calculate the balanced accuracy score of the model
## Generate a confusion matrix

In [18]:
# Balanced accuracy of the resampled-data model on the testing labels

balanced_accuracy_score(y_true=y_test, y_pred=predictions_ros)

0.9959744975744975

In [19]:
# Build the confusion matrix for the resampled-data model and label its axes

actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
matrix = confusion_matrix(y_test, predictions_ros)
matrix_df = pd.DataFrame(matrix, index=actual_labels, columns=predicted_labels)
display(matrix_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18668,91
Actual 1,2,623


In [20]:
# Show per-class precision, recall and F1 for the resampled-data model

target_names = ["healthy loan", "high-risk loan"]
ros_report_text = classification_report(y_test, predictions_ros, target_names=target_names)
print("ROS Classification Report")
print(ros_report_text)

ROS Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     18759
high-risk loan       0.87      1.00      0.93       625

      accuracy                           1.00     19384
     macro avg       0.94      1.00      0.96     19384
  weighted avg       1.00      1.00      1.00     19384



In [None]:
## STEP 4: 
## QUESTION: HOW WELL DOES THE LOGISTIC REGRESSION MODEL PREDICT BOTH THE 0 AND 1 LABELS?
## ANSWER: FOR THE HEALTHY LOAN, WE GET A 1.00 FOR PRECISION, WHICH MEANS IT ROUNDS TO PERFECT. THE HIGH-RISK LOAN PRECISION IS UNCHANGED FROM THE LAST REPORT AT 0.87, BUT ITS RECALL RISES FROM 0.89 TO 1.00, AND THE F1 SCORE IS A VERY HIGH 0.93 (UP FROM 0.88).
THIS MODEL PERFORMS EXTREMELY WELL.