Notebook objective: Train & tune LightGBM model while ignoring the country and year grouping variables. The goal is to observe how much learning takes place compared to a GPBoost model with group random effects.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import optuna

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMRegressor, early_stopping

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Set print options
np.set_printoptions(suppress=True, precision=6, edgeitems = 7)
pd.options.display.float_format = '{:.6f}'.format
pd.set_option('display.max_columns', None)

## Preprocessing

In [3]:
random_state = 1923

In [4]:
# Read data
df = pd.read_csv("./OutputData/training_data.csv")

In [6]:
# Split features and target
X = df.drop(["life_expectancy", "country", "year"], axis = 1)
y = df.life_expectancy

In [7]:
# Create CV splitter, to be stratified by country
G = df["country"]
cv = StratifiedKFold(n_splits = 4)

## Hyperparameter tuning

In [8]:
# Objective function
def objective_lgbm(trial):

    # Define hyperparameter space
    learning_rate = trial.suggest_float("learning_rate", 0.05, 0.5)
    num_leaves = trial.suggest_int("num_leaves", 2**2, 2**10)
    min_child_samples = trial.suggest_int("min_child_samples", 10, 1000, log = True)
    min_child_weight = trial.suggest_int("min_child_weight", 0.001, 20)
    reg_alpha = trial.suggest_float("l1_reg", 5e-5, 1, log = True)
    reg_lambda = trial.suggest_float("l2_reg", 0, 2)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.25, 1)

    # Store tuning scores & number of rounds
    scores = []
    rounds = []

    for i, (train_index, val_index) in enumerate(cv.split(X, G)):

        # Split train - val
        X_train, y_train = X.iloc[train_index, ], y[train_index]
        X_val, y_val = X.iloc[val_index, ], y[val_index]
    
        # Create model
        if i == 0:
            callbacks = [
                optuna.integration.LightGBMPruningCallback(trial, "l2"),
                early_stopping(50, verbose = False, min_delta = 0.01) # min_delta chosen as %0.1 of first validation score
            ]
        else:
            callbacks = [
                early_stopping(50, verbose = False, min_delta = 0.01) # min_delta chosen as %0.1 of first validation score
            ]
    
        model = LGBMRegressor(
            boosting_type = "gbdt",
            num_threads = 10,
            device_type = "gpu",
            random_state = random_state,
            n_estimators = 5000,
            verbosity = -1,
            learning_rate = learning_rate,
            num_leaves = num_leaves,
            min_child_samples = min_child_samples,
            min_child_weight = min_child_weight,
            reg_alpha = reg_alpha,
            reg_lambda = reg_lambda,
            subsample = subsample,
            subsample_freq = 1,
            colsample_bytree = colsample_bytree
        )

        # Train model with early stopping
        model.fit(
            X_train, 
            y_train, 
            eval_set = [(X_val, y_val)],
            callbacks = callbacks)

        # Record best number of rounds
        rounds.append(model.best_iteration_)

        # Record best score
        scores.append(model.best_score_['valid_0']['l2'])

    # Report mean number of rounds
    trial.set_user_attr("n_rounds", (np.mean(rounds)))
    
    return np.mean(scores)


In [9]:
# Create study
study_lgbm = optuna.create_study(
  sampler = optuna.samplers.TPESampler(seed = random_state),
  pruner = optuna.pruners.HyperbandPruner(),
  study_name = "tune_lgbm",
  direction = "minimize"
)

[I 2023-12-01 16:51:40,367] A new study created in memory with name: tune_lgbm


In [10]:
# Perform study
optuna.logging.set_verbosity(optuna.logging.ERROR)
study_lgbm.optimize(
  objective_lgbm, 
  n_trials = 100,
  show_progress_bar = True)

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]The least populated class in y has only 1 members, which is less than n_splits=4.
Best trial: 0. Best value: 11.8018:   1%|▍                                             | 1/100 [00:00<01:33,  1.06it/s]The least populated class in y has only 1 members, which is less than n_splits=4.
Best trial: 0. Best value: 11.8018:   2%|▉                                             | 2/100 [00:01<01:02,  1.56it/s]The least populated class in y has only 1 members, which is less than n_splits=4.
Best trial: 0. Best value: 11.8018:   3%|█▍                                            | 3/100 [00:01<01:01,  1.59it/s]The least populated class in y has only 1 members, which is less than n_splits=4.
Best trial: 0. Best value: 11.8018:   4%|█▊                                            | 4/100 [00:02<01:04,  1.50it/s]The least populated class in y has only 1 members, which is less than n_split

In [11]:
# Save tuning log
trials_lgbm = study_lgbm.trials_dataframe().sort_values("value", ascending = True)
trials_lgbm.to_csv("./OutputData/trials_lgbm.csv", index = False)

## Findings

In [17]:
# Load best 10 tunes
best_tunes = pd.read_csv("./OutputData/trials_lgbm.csv")
best_tunes = best_tunes[best_tunes["state"] == "COMPLETE"].iloc[0:10]

In [18]:
best_tunes

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_l1_reg,params_l2_reg,params_learning_rate,params_min_child_samples,params_min_child_weight,params_num_leaves,params_subsample,user_attrs_n_rounds,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,system_attrs_completed_rung_4,state
2,76,6.956704,2023-12-01 16:52:13.357583,2023-12-01 16:52:16.381625,0 days 00:00:03.024042,0.833854,0.011522,1.250153,0.223374,36,9,738,0.931633,150.5,7.097445,6.707202,6.595486,,,COMPLETE
4,67,7.064818,2023-12-01 16:52:09.886250,2023-12-01 16:52:12.336386,0 days 00:00:02.450136,0.946395,0.00174,1.34799,0.226956,37,14,367,0.942269,138.0,6.372009,,,,,COMPLETE
7,28,7.101079,2023-12-01 16:51:49.753455,2023-12-01 16:51:52.213793,0 days 00:00:02.460338,0.905602,0.00277,1.283254,0.24135,35,2,818,0.898297,139.75,18.069183,8.897525,7.585577,7.113627,,COMPLETE
9,84,7.187282,2023-12-01 16:52:17.225222,2023-12-01 16:52:19.304577,0 days 00:00:02.079355,0.906485,0.001918,0.965197,0.197681,44,14,444,0.875898,141.5,7.316196,,,,,COMPLETE
10,5,7.283542,2023-12-01 16:51:43.203302,2023-12-01 16:51:44.990421,0 days 00:00:01.787119,0.703759,0.060272,1.645326,0.232552,33,7,718,0.77273,112.0,34.952872,19.345492,9.093302,7.112571,7.058473,COMPLETE
12,62,7.329609,2023-12-01 16:52:07.039552,2023-12-01 16:52:09.122735,0 days 00:00:02.083183,0.803194,0.002782,1.098998,0.441094,31,14,892,0.922673,91.5,17.909371,9.801207,7.343833,6.890471,6.808464,COMPLETE
13,30,7.366407,2023-12-01 16:51:52.307301,2023-12-01 16:51:53.864935,0 days 00:00:01.557634,0.923423,0.004569,1.094782,0.243292,51,10,827,0.918955,101.5,17.426777,8.3392,6.879276,6.601409,,COMPLETE
16,13,7.466978,2023-12-01 16:51:45.836028,2023-12-01 16:51:47.258650,0 days 00:00:01.422622,0.840692,0.005705,1.090819,0.237087,44,15,527,0.742251,105.25,7.610802,7.247135,,,,COMPLETE
17,52,7.521713,2023-12-01 16:52:02.725150,2023-12-01 16:52:04.284321,0 days 00:00:01.559171,0.776911,0.002883,1.002725,0.433102,36,12,965,0.922826,52.75,17.608128,9.335379,7.251925,6.841611,6.796933,COMPLETE
21,53,7.624244,2023-12-01 16:52:04.286321,2023-12-01 16:52:05.826932,0 days 00:00:01.540611,0.837377,0.002488,1.1193,0.435544,38,12,859,0.853838,65.25,,,,,,COMPLETE


Most tunes train for 100+ rounds, and the hyperparameter configuration has a strong impact on model performance, even while ignoring country and year grouping.