In [70]:
# Import pandas to read the csv file:
import pandas as pd

# Read the loan-application records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="loan.csv")

In [71]:
# Preview the first five rows to see what each column holds.
df.head(n=5)

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000,720,Approved
1,45,Female,Teacher,Master's,Single,62000,680,Approved
2,28,Male,Student,High School,Single,25000,590,Denied
3,51,Female,Manager,Bachelor's,Married,105000,780,Approved
4,36,Male,Accountant,Bachelor's,Married,75000,710,Approved


In [72]:
# The data needs cleaning because many of the features are strings; also check for missing values.
df.info()
# No missing values: every column reports 61 non-null entries for the 61 rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              61 non-null     int64 
 1   gender           61 non-null     object
 2   occupation       61 non-null     object
 3   education_level  61 non-null     object
 4   marital_status   61 non-null     object
 5   income           61 non-null     int64 
 6   credit_score     61 non-null     int64 
 7   loan_status      61 non-null     object
dtypes: int64(3), object(5)
memory usage: 3.9+ KB


In [73]:
# One-hot encode the listed categorical features into 0/1 integer indicator columns.
df = pd.get_dummies(
    df,
    columns=['education_level', 'marital_status', 'gender'],
    dtype=int,
)


In [74]:
# Show the distinct target labels so each one can be given a numeric code.
print(df.loc[:, 'loan_status'].unique())

['Approved' 'Denied']


In [75]:
# Encode the target as integers: 1 = approved, 0 = denied.
# The numeric-dtype guard keeps the cell safe to re-run: mapping an already-encoded
# column would match none of the string keys and silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['loan_status']):
    status_codes = df['loan_status'].map({'Approved': 1, 'Denied': 0})
    # Series.map yields NaN for any label missing from the dict; fail loudly instead.
    if status_codes.isna().any():
        unknown = df.loc[status_codes.isna(), 'loan_status'].unique().tolist()
        raise ValueError(f"Unmapped loan_status values: {unknown}")
    df['loan_status'] = status_codes.astype('int64')

In [76]:
# Open question: does occupation correlate with other features such as income or
# credit score? If the correlation is low, drop the column; otherwise keep it.
# NOTE: that check is not performed in this cell; it only previews the encoded frame.
df.head(n=5)

Unnamed: 0,age,occupation,income,credit_score,loan_status,education_level_Associate's,education_level_Bachelor's,education_level_Doctoral,education_level_High School,education_level_Master's,marital_status_Married,marital_status_Single,gender_Female,gender_Male
0,32,Engineer,85000,720,1,0,1,0,0,0,1,0,0,1
1,45,Teacher,62000,680,1,0,0,0,0,1,0,1,1,0
2,28,Student,25000,590,0,0,0,0,1,0,0,1,0,1
3,51,Manager,105000,780,1,0,1,0,0,0,1,0,1,0
4,36,Accountant,75000,710,1,0,1,0,0,0,1,0,0,1


In [77]:
# Random Forest and XGBoost do not need scaled data: tree-based models are not sensitive to feature scale.


In [78]:

# Logistic Regression, KNN, and the Neural Network are sensitive to feature scale,
# so their inputs are standardised with StandardScaler.
# The occupation column is dropped first: it was not one-hot encoded and still holds text.

from sklearn.preprocessing import StandardScaler

# Drop the occupation column, which was not one-hot encoded and still holds text.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps the cell re-runnable:
# a second run finds no 'occupation' column and simply leaves the frame unchanged.
df = df.drop(columns='occupation', errors='ignore')

In [79]:
# Separate the target column (loan_status) from the predictor columns.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

In [80]:
# split the data into test and train!

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [94]:
#Random Forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 100-tree forest with a fixed seed and train it on the unscaled features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for both splits so train and test performance can be compared.
train_pred_rf, test_pred_rf = (rf_model.predict(features) for features in (X_train, X_test))

# Accuracy is the share of rows whose predicted label matches the true label.
train_acc_rf = accuracy_score(y_train, train_pred_rf)
test_acc_rf = accuracy_score(y_test, test_pred_rf)

print("Random Forest Train Accuracy =", train_acc_rf)
print("Random Forest Test Accuracy =", test_acc_rf)

# Per-class breakdown on the test split.
print("\nClassification Report:\n")
print(classification_report(y_test, test_pred_rf))

# How to read the report:
# Precision: of all rows predicted as a class, the share that truly belong to it
#            (1.00 for class 1 means no false positives).
# Recall:    of all rows that truly belong to a class, the share that were identified
#            (1.00 for class 1 means no false negatives).
# F1-score:  combines precision and recall into a single number.
# Support:   the number of actual test examples per class (4 denied, 9 approved).


Random Forest Train Accuracy = 1.0
Random Forest Test Accuracy = 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         9

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



In [93]:

# XGBoost!

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Gradient-boosted trees trained on the unscaled features.
# The variables carry an `xgb` marker (like `rf_model` / `test_acc_rf` for the forest)
# because the neural-network code further down also assigns `model` and `accuracy`;
# sharing those names made the result depend on which cell was executed last.
# The seed is pinned explicitly, matching the Random Forest cell.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    max_depth=3,
    min_child_weight=1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the held-out split only.
test_pred_xgb = xgb_model.predict(X_test)
test_acc_xgb = accuracy_score(y_test, test_pred_xgb)
print("Test Accuracy:", test_acc_xgb)

Test Accuracy: 1.0


In [82]:
# Fit the scaler on the training split only, then apply the same transformation to
# both splits so that no statistics from the test rows leak into training.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [95]:
# Logistic Regression !
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression with scikit-learn's default settings, trained on the scaled features.
logReg = LogisticRegression().fit(X_train_scaled, y_train)

# Predict labels for both splits.
train_pred, test_pred = (logReg.predict(features) for features in (X_train_scaled, X_test_scaled))

# Share of correctly predicted labels on each split.
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print("Logistic Regression Train Accuracy =", train_acc)
print("Logistic Regression Test Accuracy =", test_acc)

# Per-class breakdown on the test split.
print("\nClassification Report:\n")
print(classification_report(y_test, test_pred))


# Reading the results (0 = loan denied, 1 = loan approved):
# Accuracy:  the model reached 100% on both the train and the test set.
# Precision: every predicted approval was correct.
# Recall:    every actual approval was caught.
# F1-score:  a perfect balance of precision and recall.

Logistic Regression Train Accuracy = 1.0
Logistic Regression Test Accuracy = 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         9

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



In [84]:
# KNN !

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# k-nearest-neighbours classifier with k = 5, fitted on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict labels for both splits.
train_pred_knn, test_pred_knn = (knn.predict(features) for features in (X_train_scaled, X_test_scaled))

# Share of correctly predicted labels on each split.
train_acc_knn = accuracy_score(y_train, train_pred_knn)
test_acc_knn = accuracy_score(y_test, test_pred_knn)

print("KNN Train Accuracy=", train_acc_knn)
print("Knn Test accuracy", test_acc_knn)



# A test accuracy of 0.923 means KNN predicted the correct approved/denied outcome
# for about 92% of the held-out samples (12 of the 13 test rows).

KNN Train Accuracy= 0.9583333333333334
Knn Test accuracy 0.9230769230769231


In [85]:
# Neural Network!

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Convert the scaled feature matrices to float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_scaled, X_test_scaled)
)
# Convert the labels to int64 (long) tensors.
# NOTE(review): `y_test_sensor` is presumably a typo for `y_test_tensor`; the name is
# kept because the evaluation code further down reads it.
y_train_tensor, y_test_sensor = (
    torch.tensor(labels.to_numpy(), dtype=torch.long)
    for labels in (y_train, y_test)
)

#Defining the Neural Network
class NeuralNetwork(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input passes through a linear layer, a ReLU activation, and a second
    linear layer; the output is returned as-is, with no activation applied.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
            output_size: number of values produced per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return fc2(relu(fc1(x))) for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Network dimensions: one input per scaled feature column, a 10-unit hidden layer,
# and two output logits (class 0 = denied, class 1 = approved).
input_size = X_train_scaled.shape[1]
hidden_size = 10
output_size = 2

# Seed PyTorch before the layers are created so that the random weight initialisation,
# and therefore the reported loss and accuracy, are the same on every run.
torch.manual_seed(42)

model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: each epoch is a single gradient step on the whole training set.
epochs = 100

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

# Evaluate on the held-out split with gradient tracking disabled.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # The predicted class is the index of the larger logit in each row.
    predicted = test_outputs.argmax(dim=1)

accuracy = (predicted == y_test_sensor).sum().item() / y_test_sensor.size(0)
print(f'Test Accuracy: {accuracy:.4f}')

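# Test accuracy is the fraction of test samples whose loan approval the model predicts correctly.
# The run recorded below scores 0.8462 (about 85%, i.e. 11 of the 13 test samples); the figure
# changes between runs when the random weight initialisation is not seeded.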

Epoch [10/100], Loss: 0.4551
Epoch [20/100], Loss: 0.4188
Epoch [30/100], Loss: 0.3836
Epoch [40/100], Loss: 0.3487
Epoch [50/100], Loss: 0.3157
Epoch [60/100], Loss: 0.2854
Epoch [70/100], Loss: 0.2583
Epoch [80/100], Loss: 0.2348
Epoch [90/100], Loss: 0.2146
Epoch [100/100], Loss: 0.1976
Test Accuracy: 0.8462
