# LightGBM Hyperparameter Tuning with Optuna

**MedHack Frontiers** — Optuna tunes LightGBM on precomputed features.

## Setup (Google Colab)
1. Upload these files to a folder in Google Drive (e.g. `MyDrive/medhack-frontiers/`):
   - `train_features.parquet`, `test_features.parquet`, `holdout_features.parquet`
   - `train_data.csv`, `test_data.csv` (for encounter_ids used in GroupKFold)
   - `sample_submission.csv` (optional; used only to verify the submission's row count and column names)
2. Update `DRIVE_DATA_DIR` below to match your folder path (its default is `/content/drive/MyDrive/data`, which differs from the example folder above).
3. Run all cells.

In [None]:
# Install dependencies
# The leading "!" hands the line to the notebook's shell; -q keeps pip output quiet.
# plotly (needed only by the optional visualization cell) is not part of this list.
!pip install -q lightgbm scikit-learn optuna pandas pyarrow

In [None]:
# Mount Google Drive and set data path
from google.colab import drive
drive.mount('/content/drive')

# Folder that holds the input files and also receives the saved model,
# feature-column list and submission written by later cells.
# NOTE: the setup notes use `MyDrive/medhack-frontiers/` as their example folder,
# while this default points at `MyDrive/data` — set it to wherever the files live.
DRIVE_DATA_DIR = '/content/drive/MyDrive/data'  # Update if your folder is elsewhere

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, classification_report
from pathlib import Path
import pickle
import gc
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings that might matter
# are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# pathlib view of the Drive folder configured in DRIVE_DATA_DIR.
DATA_DIR = Path(DRIVE_DATA_DIR)

# Use all features (exclude label)
# Any column named here is left out when the feature list is built.
EXCLUDE_COLS = ["label"]

print("Imports OK")

In [None]:
# Load parquet features from Drive
print("Loading features from Google Drive...")
train = pd.read_parquet(DATA_DIR / "train_features.parquet")
test = pd.read_parquet(DATA_DIR / "test_features.parquet")
holdout = pd.read_parquet(DATA_DIR / "holdout_features.parquet")

# Train and test are pooled into a single labelled frame (train rows first,
# then test rows); the originals are dropped right away to free memory.
combined = pd.concat([train, test], ignore_index=True)
del train, test
gc.collect()

print(f"Combined (train+test): {combined.shape}")
print(f"Holdout: {holdout.shape}")

# Target vector and feature matrix. Every column not named in EXCLUDE_COLS
# is treated as a feature.
y = combined["label"].astype(int)
feature_cols = [c for c in combined.columns if c not in EXCLUDE_COLS]
X = combined[feature_cols].copy()

print(f"Features used: {len(feature_cols)}")
print(f"\nLabel distribution:\n{y.value_counts().sort_index()}")

# Class weights (same as medhack_pipeline)
# Balanced weights: total / (n_classes * class_count), then classes 2 and 3
# (reported as "Crisis" and "Death" in the final classification report) are
# boosted by a further 3x and 2x respectively.
class_counts = y.value_counts().sort_index()
total = len(y)
class_weights = {c: total / (len(class_counts) * count) for c, count in class_counts.items()}
class_weights[2] = class_weights[2] * 3.0
class_weights[3] = class_weights[3] * 2.0
# One weight per row, as a NumPy array so it can be indexed by fold positions.
sample_weights = y.map(class_weights).values
print(f"\nClass weights: {class_weights}")

# Load encounter_ids for GroupKFold (prevents leakage across encounters)
print("Loading encounter_ids for GroupKFold...")
train_raw = pd.read_csv(DATA_DIR / "train_data.csv", usecols=["encounter_id"])
test_raw = pd.read_csv(DATA_DIR / "test_data.csv", usecols=["encounter_id"])
# Concatenated in the same train-then-test order as the feature frames above.
encounter_ids = pd.concat([train_raw, test_raw], ignore_index=True)["encounter_id"]
del train_raw, test_raw
gc.collect()

# Raise explicitly instead of asserting so the check cannot be optimized away.
# NOTE(review): only the row count is verified. The grouping is positional, so it
# also assumes the CSV rows are in the same order as the parquet rows — confirm.
if len(encounter_ids) != len(X):
    raise ValueError(f"Row mismatch: encounter_ids {len(encounter_ids)} vs X {len(X)}")
print(f"Encounter IDs loaded: {len(encounter_ids)}")

del combined
gc.collect()

## Optuna Hyperparameter Tuning

Tunes main LightGBM hyperparameters using 5-fold GroupKFold CV. Macro F1 is maximized.

In [None]:
def objective(trial):
    """Score one Optuna trial by cross-validated macro F1.

    Samples a LightGBM configuration from ``trial``, trains it on each of the
    5 GroupKFold splits of the module-level ``X`` / ``y`` (grouped by
    ``encounter_ids``, training rows weighted by ``sample_weights``), and
    evaluates hard predictions on the held-out rows of each split.

    Each fit runs for up to 2000 rounds and stops early after 50 rounds
    without improvement on the validation ``multi_logloss``. Validation rows
    carry no weights.

    Side effects: stores ``fold_scores``, ``best_iterations`` and
    ``mean_best_iteration`` as user attributes on the trial.

    Returns:
        The mean macro F1 across the five folds.
    """
    fixed_settings = {
        "objective": "multiclass",
        "num_class": 4,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "seed": 42,
    }
    # Tunable hyperparameters (tuned for ~2.1M rows, 259 features)
    searched_settings = {
        "num_leaves": trial.suggest_int("num_leaves", 63, 511),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.3, 0.8),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 50, 300),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
    }
    lgb_params = {**fixed_settings, **searched_settings}

    f1_per_fold = []
    rounds_per_fold = []
    splitter = GroupKFold(n_splits=5)

    for fit_rows, eval_rows in splitter.split(X, y, groups=encounter_ids):
        X_eval = X.iloc[eval_rows]
        y_eval = y.iloc[eval_rows]

        fit_set = lgb.Dataset(
            X.iloc[fit_rows],
            label=y.iloc[fit_rows],
            weight=sample_weights[fit_rows],
        )
        # The validation set reuses the training set's binning via `reference`.
        eval_set = lgb.Dataset(X_eval, label=y_eval, reference=fit_set)

        stop_early = lgb.early_stopping(50, verbose=False)
        mute_logging = lgb.log_evaluation(0)
        booster = lgb.train(
            lgb_params,
            fit_set,
            num_boost_round=2000,
            valid_sets=[eval_set],
            callbacks=[stop_early, mute_logging],
        )

        # Class probabilities -> index of the most likely class per row.
        hard_labels = booster.predict(X_eval).argmax(axis=1)
        f1_per_fold.append(f1_score(y_eval, hard_labels, average="macro"))
        rounds_per_fold.append(booster.best_iteration)

    trial.set_user_attr("fold_scores", f1_per_fold)
    trial.set_user_attr("best_iterations", rounds_per_fold)
    trial.set_user_attr("mean_best_iteration", int(np.mean(rounds_per_fold)))
    return np.mean(f1_per_fold)

In [None]:
N_TRIALS = 50  # Increase for more thorough search

optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler explicitly: without it the sequence of suggested
# hyperparameters differs on every run, so results cannot be reproduced even
# though LightGBM itself is seeded. TPE is Optuna's default sampler, so only
# the seeding changes.
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_medhack",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# Summarize the winning trial: its CV score, per-fold scores, the averaged
# early-stopping round, and the sampled hyperparameters.
print(f"\n--- Best Trial ---")
print(f"Mean Macro F1: {study.best_trial.value:.4f}")
print(f"Fold scores: {[f'{s:.4f}' for s in study.best_trial.user_attrs['fold_scores']]}")
print(f"Mean best iteration: {study.best_trial.user_attrs['mean_best_iteration']}")
print(f"\nBest params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")

In [None]:
# Optional: visualization (requires plotly)
# Best-effort cell: any failure (import or plotting) is reported and skipped
# so the rest of the notebook can still run.
try:
    from optuna.visualization import plot_optimization_history, plot_param_importances
    # Objective value per trial.
    fig1 = plot_optimization_history(study)
    fig1.show()
    # Relative importance of each tuned hyperparameter.
    fig2 = plot_param_importances(study)
    fig2.show()
except Exception as e:
    # Deliberately broad: this cell is optional and must never abort the run.
    print(f"Visualization skipped: {e}")

## Train Final Model and Save

Refits on all train+test data with best params, then saves model and feature columns.

In [None]:
# Build final params from best trial
# The tuned values come first; the fixed training settings are layered on top.
winning_trial = study.best_trial
best_params = {
    **winning_trial.params,
    "objective": "multiclass",
    "num_class": 4,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "n_jobs": -1,
    "seed": 42,
}

# Number of rounds: the CV average of the early-stopped rounds, floored at 500.
# NOTE(review): the floor can exceed what early stopping chose during CV —
# confirm that is intended.
best_iteration = max(winning_trial.user_attrs["mean_best_iteration"], 500)  # Ensure minimum rounds

print(f"Training final model ({best_iteration} rounds)...")
# No validation set here: the model is fit on every labelled row for a fixed
# number of rounds.
dtrain_full = lgb.Dataset(X, label=y, weight=sample_weights)
final_model = lgb.train(best_params, dtrain_full, num_boost_round=best_iteration)

# Save to Drive
out_dir = Path(DRIVE_DATA_DIR)
for artifact_name, artifact in (
    ("lgb_model.pkl", final_model),
    ("feature_cols.pkl", feature_cols),
):
    with open(out_dir / artifact_name, "wb") as f:
        pickle.dump(artifact, f)

print(f"Saved to {out_dir}: lgb_model.pkl, feature_cols.pkl")

# Sanity check
# Scored on the same rows the model was trained on, so this is an in-sample
# figure, not an estimate of generalization.
train_preds = final_model.predict(X).argmax(axis=1)
train_f1 = f1_score(y, train_preds, average="macro")
print(f"\nTrain Macro F1 (sanity): {train_f1:.4f}")
class_names = ["Normal", "Warning", "Crisis", "Death"]
print(classification_report(y, train_preds, target_names=class_names))

## Generate Submission

Predict on holdout and save `submission.csv` to Drive.

In [None]:
print("Loading model and feature columns...")
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# These files are the ones written by the training cell; do not point this
# cell at pickles from an untrusted source.
with open(DATA_DIR / "lgb_model.pkl", "rb") as f:
    model = pickle.load(f)
with open(DATA_DIR / "feature_cols.pkl", "rb") as f:
    feature_cols = pickle.load(f)

# Features the model expects but the holdout frame lacks are filled with 0 so
# prediction can proceed. Report them instead of filling silently: a non-empty
# list usually means the holdout feature pipeline has drifted from training.
missing_cols = [col for col in feature_cols if col not in holdout.columns]
if missing_cols:
    print(f"WARNING: {len(missing_cols)} feature(s) missing from holdout, filled with 0: {missing_cols}")
for col in missing_cols:
    holdout[col] = 0

# Select and order the columns exactly as they were at training time.
X_holdout = holdout[feature_cols]
print(f"Holdout shape: {X_holdout.shape}")

# Per-class probabilities -> index of the most likely class per row.
proba = model.predict(X_holdout)
predictions = proba.argmax(axis=1)

print(f"\nPrediction distribution:")
for label in sorted(np.unique(predictions)):
    count = (predictions == label).sum()
    print(f"  Label {label}: {count:,} ({count/len(predictions)*100:.1f}%)")

# IDs are assigned 1..N in holdout row order.
# NOTE(review): assumes the expected submission IDs follow that order — confirm.
submission = pd.DataFrame({
    "ID": np.arange(1, len(predictions) + 1),
    "predicted_label": predictions,
})
sub_path = Path(DRIVE_DATA_DIR) / "submission.csv"
submission.to_csv(sub_path, index=False)
print(f"\nSaved {sub_path}")

# Verify if sample_submission exists
# Explicit raises (rather than asserts) so the checks cannot be optimized away.
sample_path = DATA_DIR / "sample_submission.csv"
if sample_path.exists():
    sample = pd.read_csv(sample_path)
    if len(submission) != len(sample):
        raise ValueError(f"Row mismatch: {len(submission)} vs {len(sample)}")
    if list(submission.columns) != list(sample.columns):
        raise ValueError("Column mismatch")
    print("Format verified against sample_submission.csv!")

In [None]:
# Download submission to local machine (optional)
# Requests a browser download of the submission.csv saved in the Drive data folder.
from google.colab import files
files.download(str(Path(DRIVE_DATA_DIR) / "submission.csv"))
print("Downloaded submission.csv")