In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Hyperparameter optimization
import optuna

# Utility
import pickle
import warnings

In [2]:
# Set up random seed
random_seed = 42

# Load the dataset
df = pd.read_csv('loan_data.csv')

df

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,LoanAmount,LoanDuration,EducationLevel,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,DebtToIncomeRatio,BankruptcyHistory,PreviousLoanDefaults,PaymentHistory,LengthOfCreditHistory,LoanPurpose,RiskScore,LoanApproved
0,45,39948,617,Employed,13152,48,Master,2,183,0.354418,0.358336,0,0,29,9,Home,49.0,0
1,38,39709,628,Employed,26045,48,Associate,1,496,0.087827,0.330274,0,0,21,9,Debt Consolidation,52.0,0
2,47,40724,570,Employed,17627,36,Bachelor,2,902,0.137414,0.244729,0,0,20,22,Education,52.0,0
3,58,69084,545,Employed,37898,96,High School,1,755,0.267587,0.436244,0,0,27,10,Home,54.0,0
4,37,103264,594,Employed,9184,36,Associate,1,274,0.320535,0.078884,0,0,26,27,Debt Consolidation,36.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,44,30180,587,Employed,24521,36,High School,3,673,0.101288,0.468077,0,0,21,7,Auto,55.0,0
19996,56,49246,567,Employed,25818,36,Associate,5,414,0.471818,0.317372,0,0,30,28,Debt Consolidation,54.0,0
19997,44,48958,645,Employed,37033,72,Bachelor,3,512,0.216596,0.023014,0,0,28,8,Home,45.0,0
19998,60,41025,560,Employed,14760,72,High School,3,986,0.364758,0.534517,0,0,17,13,Debt Consolidation,59.0,0


In [3]:
# Separate the target columns from the rest of the dataframe
target_cols = ['RiskScore', 'LoanApproved']
df_features = df.drop(columns=target_cols)

# Identify the categorical and numerical columns
categorical_cols = ['EmploymentStatus', 'EducationLevel', 'LoanPurpose']
numerical_cols = df_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),  
        ('num', StandardScaler(), numerical_cols)  
    ]
)

# Apply the preprocessing to the features
df_encoded = preprocessor.fit_transform(df_features)
df_encoded = pd.DataFrame(df_encoded, columns=preprocessor.get_feature_names_out())

# Combine the preprocessed features with the target columns
df_final = pd.concat([df_encoded, df[target_cols]], axis=1)

df_final

Unnamed: 0,cat__EmploymentStatus_Employed,cat__EmploymentStatus_Self-Employed,cat__EmploymentStatus_Unemployed,cat__EducationLevel_Associate,cat__EducationLevel_Bachelor,cat__EducationLevel_Doctorate,cat__EducationLevel_High School,cat__EducationLevel_Master,cat__LoanPurpose_Auto,cat__LoanPurpose_Debt Consolidation,...,num__NumberOfDependents,num__MonthlyDebtPayments,num__CreditCardUtilizationRate,num__DebtToIncomeRatio,num__BankruptcyHistory,num__PreviousLoanDefaults,num__PaymentHistory,num__LengthOfCreditHistory,RiskScore,LoanApproved
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.348195,-1.128029,0.425792,0.453170,-0.235154,-0.333426,1.012343,-0.711630,49.0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.373154,0.173418,-1.242602,0.278010,-0.235154,-0.333426,-0.605351,-0.711630,52.0,0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.348195,1.861556,-0.932273,-0.255954,-0.235154,-0.333426,-0.807563,0.841287,52.0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.373154,1.250334,-0.117617,0.939471,-0.235154,-0.333426,0.607919,-0.592175,54.0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.373154,-0.749653,0.213746,-1.291147,-0.235154,-0.333426,0.405707,1.438563,36.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.069545,0.909380,-1.158361,1.138169,-0.235154,-0.333426,-0.605351,-0.950540,55.0,0
19996,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.512244,-0.167536,1.160510,0.197477,-0.235154,-0.333426,1.214554,1.558018,54.0,0
19997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.069545,0.239946,-0.436731,-1.639886,-0.235154,-0.333426,0.810131,-0.831085,45.0,0
19998,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.069545,2.210826,0.490501,1.552883,-0.235154,-0.333426,-1.414198,-0.233810,59.0,0


In [4]:
# Prepare the feature and target variables for risk prediction
X = df_final.drop(['RiskScore', 'LoanApproved'], axis=1)  
y = df_final['RiskScore']  

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [5]:
# Define the objective function for Optuna 
def objective(trial):
    # Hyperparameter search space for regression
    param = {
        'predictor': 'gpu_predictor',  
        'objective': 'reg:squarederror',  
        'eval_metric': 'rmse',  
        'verbosity': 0,
        'enable_categorical': True,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': random_seed,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 10)
    }

    # Initialize KFold cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    # Store RMSE scores for each fold
    rmse_scores = []

    # KFold cross-validation loop
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Initialize and train the model
        model = xgb.XGBRegressor(**param)
        model.fit(X_train_cv, y_train_cv, 
                  eval_set=[(X_val_cv, y_val_cv)],
                  verbose=False)

        # Predict and calculate RMSE for the fold
        y_pred = model.predict(X_val_cv)
        rmse = np.sqrt(mean_squared_error(y_val_cv, y_pred))  
        rmse_scores.append(rmse)

    # Return the average RMSE from the KFold validation
    mean_rmse = np.mean(rmse_scores)
    return mean_rmse

# Create an Optuna study for hyperparameter optimization
study = optuna.create_study(direction='minimize')  # Minimizing RMSE
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best parameters
best_trial = study.best_trial
print(f"Best trial parameters: {best_trial.params}")

[I 2025-10-24 15:55:17,236] A new study created in memory with name: no-name-58acadc4-ca1e-4462-8ffb-abee77340786


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-24 15:55:30,383] Trial 0 finished with value: 3.755377770724487 and parameters: {'learning_rate': 0.015230051438550234, 'n_estimators': 2340, 'max_depth': 3, 'subsample': 0.7008932516395802, 'colsample_bytree': 0.6337325706119747, 'reg_alpha': 2.013704895896481, 'reg_lambda': 8.972264904186606, 'min_child_weight': 8, 'gamma': 1.0143595712226028, 'max_delta_step': 2}. Best is trial 0 with value: 3.755377770724487.
[I 2025-10-24 15:55:42,534] Trial 1 finished with value: 4.0945190660827535 and parameters: {'learning_rate': 0.2492820226530934, 'n_estimators': 2145, 'max_depth': 13, 'subsample': 0.6862932374217972, 'colsample_bytree': 0.6945474998045812, 'reg_alpha': 6.157117412388233, 'reg_lambda': 1.5842175778315482, 'min_child_weight': 1, 'gamma': 2.934684065285394, 'max_delta_step': 2}. Best is trial 0 with value: 3.755377770724487.
[I 2025-10-24 15:55:53,807] Trial 2 finished with value: 3.899786906182091 and parameters: {'learning_rate': 0.09083594492889865, 'n_estimators'

In [6]:
def regression_metrics(y_test, y_test_pred, digits=3):
    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_test_pred)
    
    print("Regression Metrics:")
    print(f"Mean Absolute Error: {mae:.{digits}f}")
    print(f"Mean Squared Error: {mse:.{digits}f}")
    print(f"Root Mean Squared Error: {rmse:.{digits}f}")
    print(f"R2 Score: {r2:.{digits}f}")

In [7]:
# Create the final pipeline for risk prediction
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(**best_trial.params)) 
])

# Prepare the feature and target variables for risk prediction
X = df.drop(['RiskScore', 'LoanApproved'], axis=1)  
y = df['RiskScore']  # Predicting RiskScore

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

# Train the final model
final_pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_test_pred = final_pipeline.predict(X_test)
regression_metrics(y_test, y_test_pred)

Regression Metrics:
Mean Absolute Error: 2.801
Mean Squared Error: 14.315
Root Mean Squared Error: 3.784
R2 Score: 0.769


In [8]:
# Save the trained model
pickle.dump(final_pipeline, open('risk_predictor.pkl', 'wb'))

# Load the trained model
loaded_model = pickle.load(open('risk_predictor.pkl', 'rb'))

# Use the loaded model to make predictions
y_pred_loaded = loaded_model.predict(X_test)
regression_metrics(y_test, y_pred_loaded)

Regression Metrics:
Mean Absolute Error: 2.801
Mean Squared Error: 14.315
Root Mean Squared Error: 3.784
R2 Score: 0.769
