In [64]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [69]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df_lending_data = pd.read_csv(
    "Resources/lending_data.csv")


# Review the DataFrame
df_lending_data.head(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


In [71]:
# Review the DataFrame - Last 10 rows
df_lending_data.tail(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77526,18300.0,10.895,83100,0.638989,11,2,53100,1
77527,20900.0,11.988,93400,0.678801,14,3,63400,1
77528,15100.0,9.557,70500,0.574468,9,2,40500,1
77529,19300.0,11.347,87400,0.656751,12,2,57400,1
77530,19700.0,11.508,88900,0.662542,13,2,58900,1
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [74]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_lending_data["loan_status"]

# Separate the X variable, the features
X = df_lending_data.drop("loan_status", axis=1)

In [76]:
# Review the y variable Series
print(y.head(10))
print(y.info())
print(y.value_counts())

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: loan_status, dtype: int64
<class 'pandas.core.series.Series'>
RangeIndex: 77536 entries, 0 to 77535
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
77536 non-null  int64
dtypes: int64(1)
memory usage: 605.9 KB
None
loan_status
0    75036
1     2500
Name: count, dtype: int64


In [78]:
# Review the X variable DataFrame
print(X.head(10))
print(X.info())
print(X.describe())
print(X.columns)

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   
5    10100.0          7.438            50600        0.407115                4   
6    10300.0          7.490            51100        0.412916                4   
7     8800.0          6.857            45100        0.334812                3   
8     9300.0          7.096            47400        0.367089                3   
9     9700.0          7.248            48800        0.385246                4   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2     

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [81]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [86]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
log_reg = LogisticRegression(solver='lbfgs', random_state=1)


# Fit the model using training data
log_reg.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [89]:
# Make a prediction using the testing data
y_predict = log_reg.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [92]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, y_predict)

array([[18679,    80],
       [   67,   558]], dtype=int64)

In [94]:
# Print the classification report for the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.89      0.88       625

    accuracy                           0.99     19384
   macro avg       0.94      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** This logistic model is a very strong predictor of healthy loan data points. The precision and recall is 100% meaning this Logistic Regression function detects ALL or nearly all the borrowers with the given variables and correctly predicts the outcome of a healthy loan extremely accurately. The model correctly identifies all healthy loans (true positives) and doesn't make any mistakes in either direction (false positives or false negatives).

However, when looking at the high risk loan labels, the logistic model is a moderatly strong indicator of the loan labels. Since the recall is 89% and it is only precise for only 87% of the data points, this logistic model can be used to make predictors for high risk loan status, but should be used with caution since there is room for error. Given the 11% false negative rate, there is a risk of underestimating the number of high-risk loans, which could lead to inadequate risk assessment and potential financial losses. Similarly, the 13% false positive rate may result in unnecessary scrutiny or denial of loans to borrowers who are not actually high-risk.

---

## Create a Linear Regression Model with the Original Data

###  Step 1: Fit a linear regression model by using the training data (`X_train` and `y_train`).

In [99]:
# Import the LinearRegression module from SKLearn
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
# Assign a random_state parameter of 1 to the model (not necessary, but for reproducibility)
lin_reg = LinearRegression(n_jobs=-1)  # n_jobs=-1 uses all available CPU cores

# Fit the model using training data
lin_reg.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [129]:
# Make a prediction using the testing data
y_predict = lin_reg.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the r2 
* Calculate the mean squared error
* Calculate the mean absolute error
* Calculate the Root Mean Squared Error

In [127]:
# Compute metrics for the linear regression model:  r2, mse, rmse, std
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

r2 = r2_score(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The r2 is 0.7816665288715994.
The mean squared error is 0.006812762007514698.
The root mean squared error is 0.08253945727659408.
The standard deviation is 0.1766450407386072.


### Step 4: Answer the following question.

**Question:** How well does the linear regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** This linear model is even less reliable than the logistic model. However, these results are weighted due to the large difference in the number of borrowers who were classified into the healthy and high-risk loan statuses. This data isn't shown as separate accounts displaying the accuracy for borrowers sorted into each loan status individually, so it is difficult to analyze the decision matrix between each status separately. However, when considered together, the linear regression model is a moderately good predictor of loan status.

Looking at the R-squared value, which is essentially an indicator of accuracy ranging from 0 (very low accuracy) to 1 (exactly accurate), we find a value of 0.78. In our case, the R-squared value is roughly aligned with the tested data points. However, there is still a large amount of variance between the data and the predictor, meaning there is a lot of room for false negatives and false positives using the linear model.