<h1><center>Home Credit Risk Prediction</center></h1>
<center> - sections 7/8 - </center>
<center>December 2024</center>
<center>Celine Ng</center>

# Table of Contents

1. Project Introduction
    1. Notebook Preparation
    1. Data loading
1. Main Data Preparation
    1. Data cleaning
    1. Dataframes and keys
    1. Train Test Split
    1. Quick EDA
        1. Keys present in each table
        1. Distribution
    1. Aggregation
1. Initial Data Cleaning
    1. Datatypes
    1. Missing values
1. EDA
    1. Original Application Table Distribution
    1. Correlation
    1. Statistical Inference
1. Data Preprocessing
1. Feature Engineering
    1. Baseline Model
    1. New Features
    1. More new features
1. Models
    1. Pipeline
    1. Model Selection
    1. Test Data
1. Final Model
    1. Final Model
    1. Deployment
    1. Model Interpretation
1. Improvements

In [3]:
%%capture
%pip install -r requirements.txt

In [4]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Kept ahead of the library imports below so that name resolution between the
# star imports and the explicit imports stays exactly as before.
from utils.custom_preprocessor import *
from utils.model import *

from sklearn.base import clone
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import (OrdinalEncoder, FunctionTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb
import optuna
from scipy.sparse import csr_matrix
import shap

ModuleNotFoundError: No module named 'lightgbm'

***

**Variables & Data from previous notebooks**

Load aggregated main table

In [None]:
# The project root is taken to be the parent of the current working directory;
# the merged table is read from <project_root>/aggregated_data/data_merged.pkl.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
folder = os.path.join(project_root, "aggregated_data")
data_path = os.path.join(folder, "data_merged.pkl")

# NOTE: unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(data_path)

Data

In [None]:
# Work on a copy of the loaded table. The small fixed-seed sample is used
# further down only to inspect column dtypes.
ml_data = data.copy()
ml_data_sample = ml_data.sample(n=10, random_state=42)

target = 'TARGET'

# Columns handled as binary flags.
binary_columns = [
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21, in ascending order
    *[f'FLAG_DOCUMENT_{doc_no}' for doc_no in range(2, 22)],
    'EMERGENCYSTATE_MODE',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CODE_GENDER',
]

# Columns handled as multi-level categoricals.
categorical_columns = [
    'NAME_CONTRACT_TYPE',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
]

# Features are every column except the target.
# NOTE(review): y gets a fresh 0..n-1 index while X keeps the original one -
# confirm that no downstream step aligns X and y by index label.
X = ml_data.drop(columns=[target]).copy()
y = ml_data[target].reset_index(drop=True)

Preprocessor Encoding

In [None]:
# Split the binary flags by dtype. Only dtypes are inspected, so the 10-row
# sample is sufficient.
numerical_binary_columns = ml_data_sample[binary_columns].select_dtypes(
    include='number'
)

# `numerical_binary_columns` is a DataFrame, so membership is checked against
# its column labels; whatever is not numeric is kept, in the original order.
object_binary_columns = [
    name for name in binary_columns
    if name not in numerical_binary_columns.columns
]

In [None]:
# Encoding stage: ordinal-encode the object-typed binary flags, frequency-encode
# the categorical columns, and pass every other column through unchanged.
encoding_transformers = [
    ('binary_encode', OrdinalEncoder(), object_binary_columns),
    ('freq_encode', FrequencyEncoder(), categorical_columns),
]

preprocessor_encode = ColumnTransformer(
    transformers=encoding_transformers,
    remainder='passthrough',
    # Keep the original column names instead of prefixing them with the step name.
    verbose_feature_names_out=False,
)

***

# 7. Model Selection
Objective: compare the candidate models and select the best model,
hyperparameters, and decision threshold for final model training and deployment.

## 7.1. Pipelines

In [None]:
# Rebuild the modelling table, dtype sample, and feature/target split from the
# loaded data (same steps as the "Data" cell earlier in this notebook).
ml_data = data.copy()
ml_data_sample = ml_data.sample(n=10, random_state=42)
target = 'TARGET'

# NOTE(review): only y has its index reset; X keeps the original index.
X = ml_data.drop(columns=[target]).copy()
y = ml_data[target].reset_index(drop=True)

**Define Models**

In [None]:
# Negative-to-positive class ratio, passed to both boosting models as the
# weight of the positive class.
scale_pos_weight = (y == 0).sum() / (y == 1).sum()

clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    scale_pos_weight=scale_pos_weight,
)

# LightGBM does not share XGBoost's parameter spellings: its binary objective
# is 'binary' (not 'binary:logistic'), and the evaluation metric is set with
# `metric` (`eval_metric` is not a LightGBM constructor parameter).
clf_lgbm = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    random_state=42,
    scale_pos_weight=scale_pos_weight,
)

# Random forest handles the imbalance through class weights instead.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

**Define Pipeline**

In [None]:
def build_model_pipeline(model):
    """Assemble the encode -> feature-creation -> model pipeline for one classifier.

    Parameters
    ----------
    model : estimator
        The classifier placed in the final ``'model'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with the steps ``'preprocessor'``,
        ``'feature_creator'`` and ``'model'``.

    Notes
    -----
    ``Pipeline`` fits its steps in place, so each pipeline gets its own unfitted
    copy of ``preprocessor_encode``. Sharing a single instance would let fitting
    one pipeline silently refit the preprocessing used by the others.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor_encode)),
        ('feature_creator', FeatureCreation2()),
        ('model', model),
    ])


xgb_pipeline = build_model_pipeline(clf_xgb)
lgb_pipeline = build_model_pipeline(clf_lgbm)
rf_pipeline = build_model_pipeline(clf_rf)

In [1]:
# Fit the XGBoost pipeline on the full feature table and target.
# NOTE(review): the recorded output below is a NameError, i.e. this cell was run
# in a kernel where the cell defining `xgb_pipeline` had not been executed.
xgb_pipeline.fit(X, y)

NameError: name 'xgb_pipeline' is not defined

# Train pipeline
# NOTE(review): draft snippet - `preprocessing_pipeline`, `X_train`, `y_train`
# and `new_data` are not defined in the visible cells above, so this cannot run
# as-is. The same train/save/load steps appear again further down.
preprocessing_pipeline.fit(X_train, y_train)

# Save pipeline
import joblib
joblib.dump(preprocessing_pipeline, 'credit_risk_pipeline.joblib')

# Load and predict (in production)
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
loaded_pipeline = joblib.load('credit_risk_pipeline.joblib')
predictions = loaded_pipeline.predict(new_data)

Hyperparameter tuning with stratified k-fold cross-validation

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np

# Define the Optuna objective function with Stratified K-Fold
def objective(trial):
    """Score one Optuna trial for the XGBoost classifier.

    Samples a set of hyperparameters, then evaluates them with 5-fold
    stratified cross-validation on the module-level ``X_train_tf`` /
    ``y_train``, using early stopping on each validation fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the validation folds (to be maximized).

    Notes
    -----
    NOTE(review): ``X_train_tf`` and ``y_train`` are read from the notebook
    namespace and are not defined in the visible cells; ``X_train_tf`` must
    support ``.iloc`` - confirm where they are created.
    """
    # Define hyperparameters to tune
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.8),
        'gamma': trial.suggest_float('gamma', 0, 5),
        # Negative/positive ratio of the training labels, computed here so the
        # function does not depend on an externally defined global.
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'random_state': 42,
        # Early stopping is configured on the estimator; passing it to fit()
        # is deprecated and no longer accepted by recent XGBoost releases.
        'early_stopping_rounds': 10,
    }

    # Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in skf.split(X_train_tf, y_train):
        # Split the data into train and validation sets
        X_train_fold, X_val_fold = X_train_tf.iloc[train_idx], X_train_tf.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit a fresh copy so the shared `clf_xgb` is neither reconfigured nor
        # left fitted on the last fold of the last trial.
        model = clone(clf_xgb).set_params(**params)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=False
        )

        # Predict probabilities for validation fold
        y_prob = model.predict_proba(X_val_fold)[:, 1]

        # Compute AUC for the fold
        auc_scores.append(roc_auc_score(y_val_fold, y_prob))

    # Return the mean AUC across all folds
    return np.mean(auc_scores)

# Create and run the Optuna study
# The sampler is seeded so that the sequence of suggested hyperparameters (and
# therefore the reported best trial) is reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print the best parameters and score
print("Best Parameters:", study.best_params)
print("Best ROC AUC:", study.best_value)

1. Preprocessing steps:
    - Feature creation (BUREAU_ID, PREV_ID, and other domain-knowledge-based features)
    - Encoding (columns from the main table contain object dtypes)
2. Feature selection (if training is too slow) -> model training ->
hyperparameter tuning (tune the depth to remove useless features) ->
train/test cross-validation and metrics
3. Repeat step 2 for every model used (random forest with class weights,
imblearn random forest, XGBoost with `scale_pos_weight`, ...?)

# Create pipeline
# Encoding -> feature creation -> XGBoost classifier, chained in one estimator.
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor_encode),
    ('feature_creator', FeatureCreation2()),
    ('model', clf_xgb)
])

# Train pipeline
# NOTE(review): `X_train` and `y_train` are not defined in the visible cells -
# confirm where the train split is created.
preprocessing_pipeline.fit(X_train, y_train)

# Save pipeline
import joblib
joblib.dump(preprocessing_pipeline, 'credit_risk_pipeline.joblib')

# Load and predict (in production)
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
# NOTE(review): `new_data` is a placeholder that is not defined in the visible cells.
loaded_pipeline = joblib.load('credit_risk_pipeline.joblib')
predictions = loaded_pipeline.predict(new_data)

# Improvements

1.