### Applying the Best Model (Random Forest) to the Bank Loan Test Data

In [1]:
import pandas as pd

# read the bank loan test set (path is relative to the notebook's working directory)
test_data_path = 'bank_loan_testdata.csv'
test_df = pd.read_csv(test_data_path)


In [2]:
# number of missing values in each column
test_df.isna().sum()

loan_id              0
age                  0
education           92
proof_submitted      0
loan_amount          0
asset_cost           0
no_of_loans          0
no_of_curr_loans     0
last_delinq_none     0
dtype: int64

In [3]:
# number of rows that exactly repeat an earlier row
test_df.duplicated(keep='first').sum()

np.int64(0)

In [4]:
# cleaning

# drop id column
# errors='ignore' keeps this cell safe to re-run once the column is already gone
test_df = test_df.drop(columns=['loan_id'], errors='ignore')

# drop rows where education is missing (rows are removed, not imputed)
# NOTE(review): this comment used to read "fill na with median", which the code never did;
# confirm that dropping matches how the training data was prepared
test_df = test_df.dropna(subset=['education'])

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# most values are small, but a few are extremely large, which gives a skewed distribution;
# a log transformation compresses the large values while keeping small values meaningful
skewed_columns = ['loan_amount', 'asset_cost']

print("Skewness before transformation:")
print(test_df[skewed_columns].skew())

# log(x + 1): the added constant avoids log(0) if a column contains zeros
log_columns = []
for column in skewed_columns:
    log_column = f'{column}_log'
    test_df[log_column] = np.log(test_df[column] + 1)
    log_columns.append(log_column)

print("\nSkewness after transformation:")
print(test_df[log_columns].skew())


Skewness before transformation:
loan_amount    0.64386
asset_cost     2.29723
dtype: float64

Skewness after transformation:
loan_amount_log   -0.694090
asset_cost_log     1.188239
dtype: float64


In [6]:
# create interaction features

# loan-to-value style feature: log-transformed loan amount divided by log-transformed asset cost
# (a ratio of logs, not the log of the raw ratio); a value close to 1 means the loan
# covers most of the asset cost, which might indicate higher risk
loan_amount_log = test_df['loan_amount_log']
asset_cost_log = test_df['asset_cost_log']
test_df['loan_to_value_ratio_log'] = loan_amount_log.div(asset_cost_log)

# share of a borrower's loans that are currently open; a high value may indicate
# financial stress and a higher likelihood of default
# NOTE: despite the `_log` suffix this is computed from the raw counts;
# the +1 keeps the denominator non-zero when no_of_loans is 0
total_loans_plus_one = test_df['no_of_loans'] + 1
test_df['debt_load_ratio_log'] = test_df['no_of_curr_loans'].div(total_loans_plus_one)

In [7]:
import joblib

# Load the persisted model and scaler
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source
model_filename = 'random_forest.pkl'
scaler_filename = 'scaler.pkl'
model, scaler = (joblib.load(path) for path in (model_filename, scaler_filename))

# Columns fed to the model, and the subset of them that is passed through the scaler first
# NOTE(review): presumably these match the names and order used at training time -- confirm
model_features = ['age', 'education', 'last_delinq_none', 'loan_to_value_ratio_log', 'debt_load_ratio_log']
scaler_features = ['age', 'loan_to_value_ratio_log', 'debt_load_ratio_log']

# Scale the subset of columns the scaler handles
X_scaler = test_df[scaler_features].copy()
X_scaled = scaler.transform(X_scaler)

# Build the model input from a copy of the feature columns,
# then overwrite the scaler columns with their scaled values
X_model = test_df[model_features].copy()
X_model[scaler_features] = X_scaled

# Predict and keep the result alongside the test rows
predictions = model.predict(X_model)
test_df['predictions'] = predictions

In [8]:
# how many test rows were assigned to each predicted class
prediction_counts = test_df['predictions'].value_counts()
prediction_counts

predictions
0    1932
1     976
Name: count, dtype: int64