<a href="https://colab.research.google.com/github/Benetti-Hub/Benetti-Hub-Kaggle-Home-Credit-Risk/blob/main/2_model_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Generation with an Ensemble of LightGBM Models

To tackle the challenge of creating a robust predictive model, I leveraged an ensemble of LightGBM models. Ensemble learning combines the predictions from multiple models to enhance the overall performance and generalizability. In this approach, each LightGBM model was trained on different subsets of the data or with varying hyperparameters, allowing the ensemble to capture diverse patterns and relationships within the dataset.

The ensemble method effectively mitigates the risk of overfitting by averaging out the biases of individual models. Additionally, LightGBM, known for its efficiency and accuracy with large datasets, facilitated swift experimentation with numerous configurations. By aggregating the predictions of these models, the ensemble achieved superior predictive power compared to any single model. This technique ensured that the final predictions were both accurate and reliable, demonstrating the strength of collaborative model generation.

Note that I choose to only use LightGBM for the sake of simplicity, the best results when using blending is to create a mix of different models (GBM, NN, RF, ...) each using different set of features

Additionally, this model is built for a Kaggle competition. In a real-world scenario, I would incorporate a more extensive validation process and consider additional factors such as interpretability, scalability, and deployment constraints. This would ensure the model is not only highly accurate but also practical and reliable for real-time applications. By continually refining and testing the ensemble approach, the model can be adapted to meet the specific demands and challenges of various predictive tasks.

In [1]:
%%capture
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

In [2]:
import gc
import os
import pickle
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from google.colab import drive
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


In [3]:
drive.mount("/content/drive")

# Setup the directory for the notebook:
DRIVE_PATH = "/content/drive/MyDrive/Projects/KaggleDefaults"
os.makedirs(DRIVE_PATH, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def stability_metric(
    predictions,
    w_fallingrate=88.0,
    w_resstd=-0.5,
):
    """Stability metric for the competition"""
    gini = (
        predictions.groupby("WEEK_NUM")
        .apply(lambda x: 2 * roc_auc_score(x["target"], x["score"]) - 1)
        .to_list()
    )
    x = np.arange(len(gini))
    y = gini
    a, b = np.polyfit(x, y, 1)
    y_hat = a * x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini)

    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

def cast_cat(data: pd.DataFrame) -> pd.DataFrame:
    """Cast any object column to category"""
    cat_cols = data.select_dtypes(include="object").columns
    data[cat_cols] = data[cat_cols].astype("category")

    return data

def remove_prefix_from_list(strings_list):
    # Define the regex pattern to match the prefixes
    pattern = re.compile(r'^(sum_|count_|mean_|min_|first_|last_|max_|mode_|std_|ks_|quantile_.{4}_|imq_.{4}_)')

    # Use list comprehension to remove the prefixes from each string in the list
    updated_list = [pattern.sub('', s) for s in strings_list]
    updated_list = [pattern.sub('', s) for s in updated_list]

    return updated_list

In [5]:
with open(f"{DRIVE_PATH}/to_keep.pkl", "rb") as f:
    to_keep = pickle.load(f)

target_col = "target"

In [6]:
train_data = f'{DRIVE_PATH}/train_data_pandas.parquet'

X = pd.read_parquet(train_data, columns=to_keep).pipe(cast_cat)
y = pl.scan_parquet(train_data).select(target_col).collect().to_pandas().squeeze()
weeks = pl.scan_parquet(train_data).select("WEEK_NUM").collect().to_pandas()

In [7]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "max_depth": 10,
    "learning_rate": 0.05,
    "max_bin": 63,
    "n_estimators": 1000,
    "colsample_bytree": 0.9,
    "colsample_bynode": 0.9,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 3.25,
    "extra_trees": True,
    "device": "gpu",
    "importance_type": "gain",
    "metric": "auc"
}

roc_auc_cv = np.zeros(cv.get_n_splits())
gini_falling = np.zeros(cv.get_n_splits())
fitted_models = []
for i, (idx_train, idx_valid) in tqdm(enumerate(cv.split(X, y, groups=weeks))):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid), (X_train, y_train)],
        callbacks=[
            lgb.log_evaluation(100),
            lgb.early_stopping(100, first_metric_only=True),
        ],
    )
    predictions = pd.DataFrame(
        {
            "WEEK_NUM": weeks.iloc[idx_valid].squeeze(),
            "target": y_valid,
            "score": model.predict_proba(X_valid)[:, 1],
        }
    )
    roc_auc_cv[i] = roc_auc_score(predictions["target"], predictions["score"])
    gini_falling[i] = stability_metric(predictions)
    fitted_models.append(model)

    del X_train, y_train, X_valid, y_valid
    gc.collect()


oof_models_dict = [(str(i), model) for i, model in enumerate(fitted_models)]

model = VotingClassifier(
    estimators=oof_models_dict,
    voting="soft",
)
model.estimators_ = fitted_models
model.le_ = LabelEncoder().fit(y)
model.classes_ = model.le_.classes_

del X, y, weeks
gc.collect()

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.841493	valid_0's auc: 0.832074
[200]	training's auc: 0.854486	valid_0's auc: 0.840919
[300]	training's auc: 0.862159	valid_0's auc: 0.844141
[400]	training's auc: 0.868005	valid_0's auc: 0.845682
[500]	training's auc: 0.87278	valid_0's auc: 0.846478
[600]	training's auc: 0.877129	valid_0's auc: 0.846916
[700]	training's auc: 0.880808	valid_0's auc: 0.847128
[800]	training's auc: 0.884517	valid_0's auc: 0.847441
[900]	training's auc: 0.887928	valid_0's auc: 0.84769
[1000]	training's auc: 0.890999	valid_0's auc: 0.847872
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.890999	valid_0's auc: 0.847872
Evaluated only: auc


1it [03:29, 209.07s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.84144	valid_0's auc: 0.832208
[200]	training's auc: 0.854297	valid_0's auc: 0.840113
[300]	training's auc: 0.862061	valid_0's auc: 0.843237
[400]	training's auc: 0.867722	valid_0's auc: 0.844534
[500]	training's auc: 0.872647	valid_0's auc: 0.845225
[600]	training's auc: 0.876765	valid_0's auc: 0.845707
[700]	training's auc: 0.880596	valid_0's auc: 0.846074
[800]	training's auc: 0.88401	valid_0's auc: 0.84642
[900]	training's auc: 0.887219	valid_0's auc: 0.84664
[1000]	training's auc: 0.890442	valid_0's auc: 0.84679
Did not meet early stopping. Best iteration is:
[990]	training's auc: 0.890116	valid_0's auc: 0.846792
Evaluated only: auc


2it [07:01, 211.21s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.84067	valid_0's auc: 0.8357
[200]	training's auc: 0.853728	valid_0's auc: 0.844691
[300]	training's auc: 0.861417	valid_0's auc: 0.848188
[400]	training's auc: 0.867283	valid_0's auc: 0.849866
[500]	training's auc: 0.872	valid_0's auc: 0.850729
[600]	training's auc: 0.876256	valid_0's auc: 0.8513
[700]	training's auc: 0.879879	valid_0's auc: 0.851534
[800]	training's auc: 0.883381	valid_0's auc: 0.851779
[900]	training's auc: 0.886541	valid_0's auc: 0.851987
[1000]	training's auc: 0.889657	valid_0's auc: 0.852117
Did not meet early stopping. Best iteration is:
[990]	training's auc: 0.889401	valid_0's auc: 0.852129
Evaluated only: auc


3it [10:34, 211.74s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.840152	valid_0's auc: 0.839426
[200]	training's auc: 0.852928	valid_0's auc: 0.847639
[300]	training's auc: 0.860552	valid_0's auc: 0.850926
[400]	training's auc: 0.866422	valid_0's auc: 0.85263
[500]	training's auc: 0.871255	valid_0's auc: 0.853448
[600]	training's auc: 0.875415	valid_0's auc: 0.854076
[700]	training's auc: 0.879219	valid_0's auc: 0.854424
[800]	training's auc: 0.882751	valid_0's auc: 0.854783
[900]	training's auc: 0.886165	valid_0's auc: 0.854903
[1000]	training's auc: 0.889276	valid_0's auc: 0.855118
Did not meet early stopping. Best iteration is:
[997]	training's auc: 0.889177	valid_0's auc: 0.855128
Evaluated only: auc


4it [14:02, 210.43s/it]

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.841541	valid_0's auc: 0.82864
[200]	training's auc: 0.854559	valid_0's auc: 0.837953
[300]	training's auc: 0.862233	valid_0's auc: 0.841687
[400]	training's auc: 0.868058	valid_0's auc: 0.843491
[500]	training's auc: 0.872713	valid_0's auc: 0.843999
[600]	training's auc: 0.876832	valid_0's auc: 0.844532
[700]	training's auc: 0.880526	valid_0's auc: 0.844837
[800]	training's auc: 0.883876	valid_0's auc: 0.845144
[900]	training's auc: 0.886995	valid_0's auc: 0.845295
[1000]	training's auc: 0.890026	valid_0's auc: 0.845498
Did not meet early stopping. Best iteration is:
[993]	training's auc: 0.889803	valid_0's auc: 0.845527
Evaluated only: auc


5it [17:28, 209.73s/it]


0

In [8]:
f_imp = (
    pd.DataFrame(
        zip(
            fitted_models[0].booster_.feature_name(),
            remove_prefix_from_list(fitted_models[0].booster_.feature_name()),
            np.mean([model.feature_importances_ for model in fitted_models], axis=0),
        ),
        columns=['full_name', 'feature_origin', 'importance']
    )
    .sort_values(by='importance', ascending=False)
)
f_imp.to_csv(f"{DRIVE_PATH}/feature_importance.csv")
f_imp

Unnamed: 0,full_name,feature_origin,importance
18,mean_maxdpdtolerance_577P,maxdpdtolerance_577P,30198.206577
726,min_financialinstitution_591M,financialinstitution_591M,29632.847568
732,price_1097A,price_1097A,23311.469852
524,pmtnum_254L,pmtnum_254L,18568.669800
65,quantile_0.75_mean_pmts_dpd_303P,pmts_dpd_303P,14621.614316
...,...,...,...
703,imq_0.9_overdueamountmax2_14A,imq_0.9_overdueamountmax2_14A,18.067776
531,imq_0.9_quantile_0.90_pmts_overdue_1152A,imq_0.9_quantile_0.90_pmts_overdue_1152A,15.390542
519,imq_0.9_quantile_0.75_pmts_dpd_303P,imq_0.9_quantile_0.75_pmts_dpd_303P,15.353922
621,last_rejectreasonclient_4145042M,rejectreasonclient_4145042M,11.368604
