In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import optuna

In [2]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
X = train_df.drop(columns=['id', 'Tm'])
y = train_df['Tm']

print(f"Data loaded successfully. Using {X.shape[1]} features for tuning")

X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]

Data loaded successfully. Using 322 features for tuning


In [None]:
def objective(trial):
    """Objective func for Optuna to optimise"""
    params = {
        'objective': 'regression_l1', # MAE
        'metric': 'mae',
        'n_estimators': 1000, 
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(
            **params,
        )

        model.fit(X_train, y_train, 
                  eval_set=[(X_val, y_val)], 
                  eval_metric='mae',
                  callbacks=[lgb.early_stopping(50, verbose=False)])
        
        y_pred_log = model.predict(X_val, num_iteration=model.best_iteration_)

        mae = mean_absolute_error(np.exp(y_val), np.exp(y_pred_log))
        mae_scores.append(mae)
    
    return np.mean(mae_scores)


In [4]:
from tqdm import tqdm

print(" Starting Hyperparameter Tuning for LightGBM")
optuna.logging.set_verbosity(optuna.logging.WARNING)

class TqdmCallback:
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="Optimizing")
    def __call__(self, study, trial):
        self.pbar.update(1)

n_trials = 100
tqdm_callback = TqdmCallback(n_trials)

study = optuna.create_study(direction='minimize')

study.optimize(objective, n_trials=n_trials, callbacks=[tqdm_callback])

 Starting Hyperparameter Tuning for LightGBM


Optimizing: 100%|██████████| 100/100 [11:51<00:00,  5.70s/it]

In [5]:
print("Tuning Finished")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial MAE: {study.best_value:.5f}")
print("Best hyperparameters found:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")

try:
    fig = optuna.visualization.plot_optimization_history(study)
    fig.show()

    fig2 = optuna.visualization.plot_param_importances(study)
    fig2.show()
except Exception as e:
    print(f"\nCould not generate plots. Error: {e}")
    print("Install plotly for visualization: pip install plotly")


Tuning Finished
Number of finished trials: 100
Best trial MAE: 27.92456
Best hyperparameters found:
learning_rate: 0.05445820875847297
max_depth: 11
num_leaves: 28
subsample: 0.8096640142442519
colsample_bytree: 0.7222682997643721
lambda_l1: 0.07754417154115359
lambda_l2: 3.330947586954466e-07
