In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from math import sqrt
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
from torch import nn
import torch

Due to the strong class imbalance, the dataset was augmented with sythetic data using the SMOTE method.

In [2]:
df = pd.read_csv('Equifax_pdl_US_2023.csv')
df.drop_duplicates()
df.drop(['client_id', 'application_date'], axis=1 , inplace=True)
df = pd.get_dummies(df, columns=['branch_category', 'gender', 'education_level'])
Y = df['default_status']
X = df.drop(['default_status'], axis=1)
smote = SMOTE(sampling_strategy='auto', random_state=None, k_neighbors=5, n_jobs=-1)
X, Y = smote.fit_resample(X, Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [6]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_predicted = log_reg.predict(x_test)
print(classification_report(y_predicted, y_test))
print(f'roc_auc_score: {roc_auc_score(y_predicted, y_test)}')

              precision    recall  f1-score   support

           0       0.79      0.72      0.75      6990
           1       0.70      0.77      0.73      5827

    accuracy                           0.74     12817
   macro avg       0.74      0.74      0.74     12817
weighted avg       0.75      0.74      0.74     12817

roc_auc_score: 0.7446896851590924


In [8]:
depth = round(sqrt(len(list(X))))
rf = RandomForestClassifier(n_estimators=100, max_depth=depth)
rf.fit(x_train, y_train)
y_predicted = rf.predict(x_test)
print(classification_report(y_predicted, y_test))
print(f'roc_auc_score: {roc_auc_score(y_predicted, y_test)}')

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      6412
           1       0.88      0.88      0.88      6405

    accuracy                           0.88     12817
   macro avg       0.88      0.88      0.88     12817
weighted avg       0.88      0.88      0.88     12817

roc_auc_score: 0.8822653221930192


In [7]:
knn = KNeighborsClassifier(n_neighbors=5)
log_reg.fit(x_train, y_train)
y_predicted = log_reg.predict(x_test)
print(classification_report(y_predicted, y_test))
print(f'roc_auc_score: {roc_auc_score(y_predicted, y_test)}')

              precision    recall  f1-score   support

           0       0.79      0.72      0.75      6990
           1       0.70      0.77      0.73      5827

    accuracy                           0.74     12817
   macro avg       0.74      0.74      0.74     12817
weighted avg       0.75      0.74      0.74     12817

roc_auc_score: 0.7446896851590924


According to the classification metrics, the Random Forest algorithm showed the best result