Import basic libraries.

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Load the Dataset.

In [127]:
loan_data = pd.read_csv('/content/loan_data.csv')

Analyse the data

In [128]:
loan_data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [129]:
loan_data.columns

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file', 'loan_status'],
      dtype='object')

In [130]:
loan_data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [131]:
loan_data = loan_data[loan_data['previous_loan_defaults_on_file'] != 1]
loan_data = loan_data.reset_index(drop=True)

In [132]:
loan_data.shape

(45000, 14)

In [133]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [134]:
loan_data.isnull().sum()

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


Data preprocessing

In [135]:
# impoprt libraries for preprocessing.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV


Splitting the data

In [136]:
# Assuming one-hot encoding has been performed and categorical columns have been dropped
# The updated loan_data now contains only numerical and one-hot encoded features

X = loan_data.drop(['loan_status','loan_int_rate'], axis=1) # Drop the target and loan_int_rate as before
y = loan_data['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Using a more standard 80/20 split

Encoding and Scaling the columns

In [137]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

categorical_cols = ['person_gender', 'person_education',
                    'person_home_ownership', 'loan_intent',
                    'previous_loan_defaults_on_file']

numeric_cols = ['person_age', 'person_income', 'person_emp_exp',
                'loan_amnt', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ])


In [138]:
X_train_preprossed = preprocessor.fit_transform(X_train)
X_test_preprossed = preprocessor.transform(X_test)

Model Training.

In [139]:
from sklearn.ensemble import RandomForestClassifier

In [140]:
rf_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
    min_samples_split = 2,
    min_samples_leaf = 1,
    class_weight = 'balanced'
)

In [141]:
rf_model.fit(X_train_preprossed, y_train)


In [142]:
y_pred = rf_model.predict(X_test_preprossed)

Check accuracy score.

In [143]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix


In [144]:
print("Random Forest Results:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1-Score :", f1_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Random Forest Results:
Accuracy : 0.8744444444444445
Precision: 0.6775625504439063
Recall   : 0.8353233830845771
F1-Score : 0.7482174688057041
ROC-AUC  : 0.860508615719685

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.89      0.92      6990
           1       0.68      0.84      0.75      2010

    accuracy                           0.87      9000
   macro avg       0.81      0.86      0.83      9000
weighted avg       0.89      0.87      0.88      9000


Confusion Matrix:
 [[6191  799]
 [ 331 1679]]


In [145]:
import joblib

# Save
joblib.dump(rf_model, "loan_model.pkl")
joblib.dump(preprocessor, "encoder.pkl")

# Load
model = joblib.load("loan_model.pkl")
preprocessor = joblib.load("encoder.pkl")
