<h1><center>Home Credit Risk Prediction</center></h1>
<center> - sections 7/8 - </center>
<center>December 2024</center>
<center>Celine Ng</center>

# Table of Contents

1. Project Introduction
    1. Notebook Preparation
    1. Data loading
1. Main Data Preparation
    1. Data cleaning
    1. Dataframes and keys
    1. Train Test Split
    1. Quick EDA
        1. Keys present in each table
        1. Distribution
    1. Aggregation
1. Initial Data Cleaning
    1. Datatypes
    1. Missing values
1. EDA
    1. Original Application Table Distribution
    1. Correlation
    1. Statistical Inference
1. Data Preprocessing
1. Feature Engineering
    1. Baseline Model
    1. New Features
    1. More new features
1. Models
    1. Pipeline
    1. Model Selection
    1. Test Data
1. Final Model
    1. Final Model
    1. Deployment
    1. Model Interpretation
1. Improvements

In [1]:
%%capture
%pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
import os

from utils.custom_preprocessor import *
from utils.model import *

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import (OrdinalEncoder, FunctionTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import optuna
from scipy.sparse import csr_matrix
import shap

  from .autonotebook import tqdm as notebook_tqdm


***

**Variables & Data from previous notebooks**

Load aggregated main table

In [3]:
# Locate the merged table: it is expected at
# <parent of the working directory>/aggregated_data/data_merged.pkl.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
folder, data_path = (
    os.path.join(project_root, "aggregated_data"),
    os.path.join(project_root, "aggregated_data", "data_merged.pkl"),
)
data = pd.read_pickle(data_path)

Data

In [4]:
# Work on a copy of the loaded table; the 10-row sample is what the dtype
# inspection of the binary columns is run on.
ml_data = data.copy()
ml_data_sample = ml_data.sample(n=10, random_state=42)

target = 'TARGET'

# Columns treated as binary; they are later split by dtype into numeric and
# non-numeric groups.
binary_columns = [
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
    'FLAG_DOCUMENT_2',
    'FLAG_DOCUMENT_3',
    'FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6',
    'FLAG_DOCUMENT_7',
    'FLAG_DOCUMENT_8',
    'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_10',
    'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_12',
    'FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14',
    'FLAG_DOCUMENT_15',
    'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17',
    'FLAG_DOCUMENT_18',
    'FLAG_DOCUMENT_19',
    'FLAG_DOCUMENT_20',
    'FLAG_DOCUMENT_21',
    'EMERGENCYSTATE_MODE',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CODE_GENDER',
]

# Multi-valued categorical columns handed to FrequencyEncoder.
categorical_columns = [
    'NAME_CONTRACT_TYPE',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
]

# Features keep the table's index; the label series is re-indexed from 0.
X = ml_data.drop(columns=[target]).copy()
y = ml_data[target].reset_index(drop=True)

Preprocessor Encoding

In [5]:
# Split the binary columns by dtype. `numerical_binary_columns` is the
# sub-frame of numeric binary columns (a DataFrame, not a list of names).
numerical_binary_columns = ml_data_sample[binary_columns].select_dtypes(
    include='number'
)
# `in` on a DataFrame tests its column labels, so this keeps the names of
# the binary columns that are not numeric.
object_binary_columns = list(
    filter(lambda name: name not in numerical_binary_columns, binary_columns)
)

In [6]:
# Encoding stage: OrdinalEncoder on the non-numeric binary columns,
# FrequencyEncoder on the categorical columns; every other column is passed
# through, and output names are not prefixed with the transformer name.
preprocessor_encode = ColumnTransformer(
    [
        ('binary_encode', OrdinalEncoder(), object_binary_columns),
        ('freq_encode', FrequencyEncoder(), categorical_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

***

# 7. Model Selection
Objective: Compare and find the best model, hyperparameters, and decision
threshold for final model training and deployment

## 7.1. Training

In [7]:
# Start section 7 from a fresh copy of the loaded table and rebuild the
# dtype sample and the feature/label split.
ml_data = data.copy()
ml_data_sample = ml_data.sample(n=10, random_state=42)
target = 'TARGET'

# Features keep the table's index; the label series is re-indexed from 0.
X = ml_data.drop(columns=[target]).copy()
y = ml_data[target].reset_index(drop=True)

**Define Models**

In [8]:
# Ratio of label-0 rows to label-1 rows, used by both boosting models as the
# weight applied to the positive class (label 1).
neg_pos_ratio = (y == 0).sum() / (y == 1).sum()

clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    scale_pos_weight=neg_pos_ratio,
)

# LightGBM's estimator takes the evaluation metric as `metric`;
# `eval_metric` is an argument of `fit()`, not of the constructor, and is
# reported as an unknown parameter when passed here.
clf_lgbm = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    random_state=42,
    scale_pos_weight=neg_pos_ratio,
)

# Random forest handles the imbalance through balanced class weights.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

**Define Pipeline**

In [9]:
def _build_pipeline(model):
    """Return encode -> feature creation -> `model` as one Pipeline.

    Every pipeline gets its own FeatureCreation2 instance but shares the
    single module-level `preprocessor_encode` object.
    """
    steps = [
        ('preprocessor', preprocessor_encode),
        ('feature_creator', FeatureCreation2()),
        ('model', model),
    ]
    return Pipeline(steps)


xgb_pipeline = _build_pipeline(clf_xgb)
lgb_pipeline = _build_pipeline(clf_lgbm)
rf_pipeline = _build_pipeline(clf_rf)

Constructing pipelines is the more robust and scalable method; however, due
to time constraints and existing bugs, manual training will be applied for
this project.

In [12]:
# Compare XGBoost, LightGBM and Random Forest with stratified 5-fold CV and
# cache the per-fold ROC AUC scores on disk so the comparison is not re-run
# when the results file already exists.
results_folder = os.path.join("..", "results")
results_file = os.path.join(results_folder, "modes_results.pkl")
os.makedirs(results_folder, exist_ok=True)

if os.path.exists(results_file):
    # Cached path: only `results` is assigned here. None of the fold
    # variables created in the `else` branch (X_train_processed, y_train,
    # X_test_processed, y_test) are set by this cell in that case.
    with open(results_file, "rb") as f:
        results = pkl.load(f)
    print(f"Results loaded from {results_file}")
else:
    # One list of fold scores per model.
    results = {'XGB': [], 'LGBM': [], 'RF': []}

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(X, y):
        # Positional indexing, so the differing indexes of X and y do not matter.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the encoder on the training fold only, then apply it to the
        # held-out fold; rebuild DataFrames with the encoder's output names.
        X_train_processed = pd.DataFrame(preprocessor_encode.fit_transform(X_train), columns=preprocessor_encode.get_feature_names_out())
        X_test_processed = pd.DataFrame(preprocessor_encode.transform(X_test), columns=preprocessor_encode.get_feature_names_out())

        # Feature creation is likewise fitted on the training fold only.
        fc = FeatureCreation2()
        X_train_processed = fc.fit_transform(X_train_processed)
        X_test_processed = fc.transform(X_test_processed)

        # The same three estimator objects are refitted on every fold.
        for model_name, clf in [('XGB', clf_xgb), ('LGBM', clf_lgbm), ('RF', clf_rf)]:
            clf.fit(X_train_processed, y_train)
            y_pred = clf.predict_proba(X_test_processed)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred)
            results[model_name].append(auc_score)

    # NOTE: the variables of the last fold (X_train_processed, y_train,
    # X_test_processed, y_test) stay defined at module level after the loop;
    # the tuning objectives defined later in this notebook read them.

    # Serialize results
    with open(results_file, "wb") as f:
        pkl.dump(results, f)
    print(f"Results saved to {results_file}")

# Display results
for model_name, scores in results.items():
    print(f"{model_name}: Mean AUC = {sum(scores)/len(scores):.4f}, Scores = {scores}")

Results loaded from ../results/modes_results.pkl
XGB: Mean AUC = 0.7570, Scores = [np.float64(0.7569814759028433), np.float64(0.7570831883320318), np.float64(0.7602751456748199), np.float64(0.7583395348504778), np.float64(0.752283294966421)]
LGBM: Mean AUC = 0.7785, Scores = [np.float64(0.7771454069710615), np.float64(0.7762573421868177), np.float64(0.7822009761454211), np.float64(0.7794726531741337), np.float64(0.7774387269955776)]
RF: Mean AUC = 0.7253, Scores = [np.float64(0.7253180788624507), np.float64(0.7218548967245626), np.float64(0.7313602385613733), np.float64(0.7252500684833187), np.float64(0.7228668706994196)]


Comparing the scores of the three models with stratified 5-fold
cross-validation on the whole training set, it is clear that LightGBM is the
best model, both in every fold and on average. The score achieved by LightGBM
is close to the score XGBoost achieved after hyperparameter tuning in our
previous experience. However, we will proceed with hyperparameter tuning to
confirm the results.

## 7.2. Hyperparameter Tuning

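Due to memory limitations, cross-validation is only partly used at this step:
the XGBoost and LightGBM objectives use 3-fold stratified cross-validation,
while the Random Forest objective is scored on a single train/test split.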

**Define hyperparameter intervals for tuning**

In [15]:
# XGBoost
def objective_xgb(trial):
    """Optuna objective: mean 3-fold validation ROC AUC of an XGBoost booster.

    Reads the module-level ``X_train_processed`` and ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the three stratified validation folds.
    """
    params = {
        'objective': 'binary:logistic',
        # Monitor AUC so early stopping tracks the same metric the study
        # maximizes (the default metric for this objective is log loss).
        'eval_metric': 'auc',
        'random_state': 42,
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train_processed, y_train):
        X_t, X_v = X_train_processed.iloc[train_idx], X_train_processed.iloc[val_idx]
        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dtrain = xgb.DMatrix(data=X_t, label=y_t)
        dval = xgb.DMatrix(data=X_v, label=y_v)

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=500,
            evals=[(dval, 'validation')],
            early_stopping_rounds=15,
            verbose_eval=False
        )

        # Score with the trees up to the best iteration rather than relying
        # on the library default, which differs between XGBoost versions.
        y_pred_proba = model.predict(
            dval, iteration_range=(0, model.best_iteration + 1)
        )
        auc_scores.append(roc_auc_score(y_v, y_pred_proba))

    return sum(auc_scores) / len(auc_scores)

# LightGBM
def objective_lgbm(trial):
    """Optuna objective: mean 3-fold validation ROC AUC of a LightGBM model.

    Reads the module-level ``X_train_processed`` and ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the three stratified validation folds.
    """
    params = {
        'objective': 'binary',
        'random_state': 42,
        'verbosity': -1,
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Row subsampling is only applied when the bagging frequency is
        # positive; with the default of 0 the tuned `subsample` is ignored.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10)
    }

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train_processed, y_train):
        X_t, X_v = X_train_processed.iloc[train_idx], X_train_processed.iloc[val_idx]
        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        # Use callbacks for early stopping on the validation fold's AUC
        model.fit(
            X_t, y_t,
            eval_set=[(X_v, y_v)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=15)]
        )
        y_pred_proba = model.predict_proba(X_v)[:, 1]
        auc_scores.append(roc_auc_score(y_v, y_pred_proba))

    return sum(auc_scores) / len(auc_scores)


# Random Forest
def objective_rf(trial):
    """Optuna objective: ROC AUC of a random forest on a single split.

    Reads the module-level ``X_train_processed``, ``y_train``,
    ``X_test_processed`` and ``y_test``.

    NOTE(review): unlike the XGBoost and LightGBM objectives, this one is
    scored on the test split itself rather than on folds of the training
    data, so the tuned score is optimistic for that split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the fitted forest on ``X_test_processed`` / ``y_test``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.6, 0.8),
        'random_state': 42,
        'class_weight': 'balanced',
        # Build trees on all cores, as `clf_rf` does; with a fixed
        # random_state the fitted forest does not depend on n_jobs.
        'n_jobs': -1
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train_processed, y_train)
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1]
    return roc_auc_score(y_test, y_pred_proba)

**Tune and serialize best parameters**

In [16]:
# Run (or reload) one Optuna study per model. A finished study is pickled to
# ../hyperparameters/<model>_study.pkl and reused on later runs.
hyperparameters_folder = os.path.join("..", "hyperparameters")
os.makedirs(hyperparameters_folder, exist_ok=True)
studies = {}

for model_name, objective in zip(
    ["XGB", "LGBM", "RF"],
    [objective_xgb, objective_lgbm, objective_rf],
):
    # Dynamically construct the file path for each study
    study_file = os.path.join(hyperparameters_folder, f"{model_name.lower()}_study.pkl")

    if os.path.exists(study_file):
        # Load existing study (a file this notebook wrote earlier)
        try:
            with open(study_file, "rb") as f:
                studies[model_name] = pkl.load(f)
            print(f"Loaded existing study for {model_name} from: {study_file}")
        except (pkl.UnpicklingError, EOFError, FileNotFoundError) as e:
            # Keep going with the other models; the summary below reports
            # this one as having no valid study.
            print(f"Failed to load study for {model_name}: {e}")
            studies[model_name] = None
    else:
        # Create a new study
        print(f"No study file found for {model_name}. Starting new study...")
        studies[model_name] = optuna.create_study(direction='maximize')
        # Use the objective paired with this model by the zip above instead
        # of looking it up by name through locals().
        studies[model_name].optimize(objective, n_trials=50)

        # Save the new study (only reached once all trials have finished)
        with open(study_file, "wb") as f:
            pkl.dump(studies[model_name], f)
        print(f"Saved new study for {model_name} to: {study_file}")

for model_name, study in studies.items():
    if study is not None:
        print(f"{model_name} Best Parameters: {study.best_params}")
        print(f"{model_name} Best ROC AUC: {study.best_value:.4f}")
    else:
        print(f"No valid study for {model_name}.")


[I 2024-12-30 09:39:21,189] A new study created in memory with name: no-name-319ad94e-a853-4ad7-b71c-45f3fc34514f


Loaded existing study for XGB from: ../hyperparameters/xgb_study.pkl
Loaded existing study for LGBM from: ../hyperparameters/lgbm_study.pkl
No study file found for RF. Starting new study...


[I 2024-12-30 10:45:54,200] Trial 0 finished with value: 0.7313615883815494 and parameters: {'n_estimators': 364, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.7338805815006356}. Best is trial 0 with value: 0.7313615883815494.
[I 2024-12-30 11:03:37,896] Trial 1 finished with value: 0.7195485494414917 and parameters: {'n_estimators': 242, 'max_depth': 2, 'min_samples_split': 17, 'min_samples_leaf': 2, 'max_features': 0.6037845724393989}. Best is trial 0 with value: 0.7313615883815494.
[I 2024-12-30 12:53:03,549] Trial 2 finished with value: 0.7486494061125201 and parameters: {'n_estimators': 275, 'max_depth': 8, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 0.7762326417089551}. Best is trial 2 with value: 0.7486494061125201.
[I 2024-12-30 13:30:18,225] Trial 3 finished with value: 0.7371370403124771 and parameters: {'n_estimators': 164, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 0.7207396147249875}

KeyboardInterrupt: 

LightGBM did way better with default settings. There must be a range which
optimizes AUC score, however, for now, we still t

# Train pipeline
# NOTE(review): `preprocessing_pipeline` is only assigned further down in
# this notebook, so this snippet cannot run at this position as written.
preprocessing_pipeline.fit(X_train, y_train)

# Save pipeline
import joblib
joblib.dump(preprocessing_pipeline, 'credit_risk_pipeline.joblib')

# Load and predict (in production)
# NOTE(review): `new_data` is not defined in the visible cells; presumably
# a placeholder for incoming application data. TODO confirm.
loaded_pipeline = joblib.load('credit_risk_pipeline.joblib')
predictions = loaded_pipeline.predict(new_data)

Hyperparameter tuning with stratified k fold

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np

# Define the Optuna objective function with Stratified K-Fold
def objective(trial):
    """Optuna objective: mean 5-fold validation ROC AUC of ``clf_xgb``.

    Reads the module-level ``clf_xgb``, ``X_train_tf``, ``y_train`` and
    ``scale_pos_weight``, and leaves the sampled hyperparameters set on
    ``clf_xgb`` (as before).

    NOTE(review): ``X_train_tf`` and ``scale_pos_weight`` are not assigned
    in the visible cells; confirm they exist before running this draft.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the five stratified validation folds.
    """
    from sklearn.base import clone

    # Define hyperparameters to tune
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.8),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'scale_pos_weight': scale_pos_weight,  # Use precomputed weight
        'random_state': 42,
    }

    # Update clf_xgb parameters with sampled hyperparameters
    clf_xgb.set_params(**params)

    # Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in skf.split(X_train_tf, y_train):
        # Split the data into train and validation sets
        X_train_fold, X_val_fold = X_train_tf.iloc[train_idx], X_train_tf.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Early stopping is an estimator parameter: passing it to fit() was
        # deprecated in XGBoost 1.6 and removed in later releases. It is set
        # on a per-fold clone so the shared clf_xgb is not left requiring an
        # eval_set on every later fit.
        fold_model = clone(clf_xgb).set_params(early_stopping_rounds=10)

        # Train the model
        fold_model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=False
        )

        # Predict probabilities for validation fold
        y_prob = fold_model.predict_proba(X_val_fold)[:, 1]

        # Compute AUC for the fold
        auc_scores.append(roc_auc_score(y_val_fold, y_prob))

    # Return the mean AUC across all folds
    return np.mean(auc_scores)

# Maximize the objective over 50 trials, then report the winning trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

for label, best in (
    ("Best Parameters:", study.best_params),
    ("Best ROC AUC:", study.best_value),
):
    print(label, best)

1. preprocessing steps ={
feature creation (BUREAU_ID, PREV_ID, other domain knowledge based features)
encoding (columns from main table contain object dtypes)
}
2. feature selection if too slow -> model training -> hyperparameter tuning
(select depth to remove useless features) -> train/test cross-validation,
metrics
3. repeat step 2 for all models used (random forest with class weight,
imblearn random forest, xgboost with pos scale weight....?)

import joblib

# Assemble encode -> feature creation -> XGBoost as a single estimator.
preprocessing_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor_encode),
        ('feature_creator', FeatureCreation2()),
        ('model', clf_xgb),
    ]
)

# Fit every step on the training data.
preprocessing_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to the working directory.
joblib.dump(preprocessing_pipeline, 'credit_risk_pipeline.joblib')

# Production side: reload the artifact and predict on incoming data.
loaded_pipeline = joblib.load('credit_risk_pipeline.joblib')
predictions = loaded_pipeline.predict(new_data)

# Improvements

1.