In [1]:
#  Importing required Python libraries

# NumPy is useful for numerical operations 
import numpy as np

# Pandas is used to load and work with CSV data like Excel tables
import pandas as pd

# Path from pathlib helps define file paths in a clean way
from pathlib import Path

# Splitting data into training and test sets
from sklearn.model_selection import train_test_split

# Logistic Regression model (used to classify loan risk)
from sklearn.linear_model import LogisticRegression

# Tools to evaluate model performance (confusion matrix + precision/recall report)
from sklearn.metrics import confusion_matrix, classification_report


---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [9]:
# Step 1: read lending_data.csv into a pandas DataFrame.

# Relative location of the CSV inside the Resources folder.
csv_path = Path("../Resources/lending_data.csv")

# Parse the file; the resulting frame is the input for every later cell.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Leave the first five rows as the cell's last expression so the notebook
# renders them as a table -- a quick check that the load worked.
df.head(n=5)


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`) from the "loan_status" column, and then create the features (`X`) DataFrame from the remaining columns.

In [11]:
# Step 2: split the DataFrame into features (X) and labels (y).

# Features: every column of df except loan_status.
X = df.drop(columns=["loan_status"])

# Labels: the loan_status column, which is the value the model will predict
# (0 = healthy loan, 1 = high-risk loan).
y = df["loan_status"]

# Print both shapes; the row counts should match.
print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)


Shape of X (features): (77536, 7)
Shape of y (labels): (77536,)


In [17]:
# Peek at the first five labels (loan_status values).
# sep="\n" puts the heading and the preview on separate lines.
print("y (labels) preview:", y.head(), sep="\n")

# Peek at the first five feature rows (borrower info without loan_status).
# The heading starts with "\n" to leave a blank line after the label preview.
print("\nX (features) preview:", X.head(), sep="\n")


y (labels) preview:
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

X (features) preview:
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [22]:
# Step 3: hold out part of the data for evaluation.

# test_size=0.25 keeps 75% of the rows for training and 25% for testing;
# a fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Report the shape of every piece, one per line, to confirm the split.
# The generator keeps the loop variables out of the notebook's namespace.
print(
    *(
        f"{label} {part.shape}"
        for label, part in (
            ("X_train shape:", X_train),
            ("X_test shape:", X_test),
            ("y_train shape:", y_train),
            ("y_test shape:", y_test),
        )
    ),
    sep="\n",
)


X_train shape: (58152, 7)
X_test shape: (19384, 7)
y_train shape: (58152,)
y_test shape: (19384,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [30]:
# Create and train the logistic regression classifier.

# Instantiate the model with a fixed seed.
# NOTE(review): scikit-learn documents random_state as used only by the
# 'sag', 'saga' and 'liblinear' solvers, so with the default solver this seed
# is probably a no-op -- confirm against the installed version.
logistic_model = LogisticRegression(random_state=1)

# Fit on the training split only; the test split stays unseen until evaluation.
# fit() returns the estimator, so the notebook displays it as the cell output.
logistic_model.fit(X=X_train, y=y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [33]:
# Predict loan risk for the held-out test borrowers.

# One predicted loan_status value (0 or 1) per row of X_test.
y_pred = logistic_model.predict(X=X_test)

# Show only the first ten predictions rather than the whole array.
print("Predicted loan statuses:", y_pred[0:10])


Predicted loan statuses: [0 0 0 0 0 0 0 0 0 0]


# Evaluate model performance, part 1: confusion matrix.

# Compare the true test labels with the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Print the heading and the matrix on separate lines.
print("üî≤ Confusion Matrix:", cm, sep="\n")


In [44]:
# Evaluate model performance, part 2: classification report
# (per-class precision, recall, f1-score and support).
report = classification_report(y_true=y_test, y_pred=y_pred)

# The heading starts with "\n" to leave a blank line before the report.
print("\nüìã Classification Report:", report, sep="\n")



üìã Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.94      0.89       619

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model predicts both classes very well.
It predicts healthy loans (0) with 100% precision and 99% recall, which means it almost never makes a mistake when identifying good loans.
For high-risk loans (1), the model has a precision of 84% and a recall of 94%, which shows that it's very good at catching most risky loans while still being mostly accurate.
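The overall accuracy of the model is 99%, which is very strong. Keep in mind that healthy loans make up about 97% of the test set (18,765 of 19,384), so accuracy is dominated by class `0`; the precision and recall for high-risk loans are the more informative numbers. Taken together, the model performs well for both types of loans, though it is somewhat less precise on the high-risk class.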