In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Load Preprocessed Data
df = pd.read_csv("data/preprocessed_loan_data.csv")
display(df.head())

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,128027,0.83399,0.089693,-1.086833,-0.341492,0.590533,1.341937,0.261771,-0.001526,-0.260753,0,0,0,1,1,4,1,-0.362472
1,125442,1.701221,-0.823021,-0.044309,-0.731666,-1.285731,-1.343791,-1.30835,1.412793,0.778585,2,0,1,0,0,4,1,-0.362472
2,85333,0.166888,0.043854,0.022715,-0.775718,-0.968209,0.446694,1.156831,-0.708685,-0.823728,2,3,0,1,1,0,0,2.758835
3,220129,-0.767053,-1.303452,-1.168538,1.061875,-1.718715,0.446694,-0.967805,-0.708685,-1.170174,1,0,1,0,0,1,0,-0.362472
4,105746,1.10083,-1.592855,-1.671921,0.369631,-1.48779,1.341937,-1.052188,0.705634,0.995114,0,3,0,0,1,0,0,-0.362472


In [2]:
# Drop LoanID (Non-Predictive Feature)
df.drop(columns=['LoanID'], inplace=True)

# Ensure Default is Binary (0 or 1)
df['Default'] = (df['Default'] > 0).astype(int)

# Define Features and Target
X = df.drop(columns=['Default'])  # 'Default' is the target column
y = df['Default']

In [3]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train and Evaluate Models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    results[name] = {'Accuracy': accuracy, 'ROC AUC': roc_auc}
    print(f"\n{name} Model Performance:")
    print(classification_report(y_test, y_pred))



Logistic Regression Model Performance:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45139
           1       0.61      0.03      0.06      5931

    accuracy                           0.89     51070
   macro avg       0.75      0.51      0.50     51070
weighted avg       0.85      0.89      0.84     51070


Random Forest Model Performance:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45139
           1       0.60      0.05      0.09      5931

    accuracy                           0.89     51070
   macro avg       0.74      0.52      0.51     51070
weighted avg       0.85      0.89      0.84     51070



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Model Performance:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45139
           1       0.53      0.08      0.14      5931

    accuracy                           0.88     51070
   macro avg       0.71      0.54      0.54     51070
weighted avg       0.85      0.88      0.85     51070



In [None]:
# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100],  # Reduce estimators
    'max_depth': [None, 10],    # Reduce depth options
    'min_samples_split': [2, 5] # Reduce splits
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid, cv=3, scoring='roc_auc', n_jobs=-1)  # Reduce CV folds

grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Save Best Model & Scaler
joblib.dump(best_rf, "models/credit_risk_model.pkl")
joblib.dump(StandardScaler(), "models/scaler.pkl")

print("Model training complete. Best model saved.")
