# LightGBM Hyperparameter Tuning with Optuna

**MedHack Frontiers** — Optuna tunes LightGBM on precomputed features.

## Setup (Google Colab)
1. Upload these files to a folder in Google Drive (e.g. `MyDrive/medhack-frontiers/`):
   - `train_features.parquet`, `test_features.parquet`, `holdout_features.parquet`
   - `train_data.csv`, `test_data.csv` (required for FAR: encounter-level false alarm rate)
   - `sample_submission.csv` (optional, for validation)
2. Update `DRIVE_DATA_DIR` below to match your folder path.
3. Run all cells.

**Note:** Train and test are loaded separately (no concat) to avoid RAM issues. Multi-objective tuning: maximize Mean Macro-AUPRC, minimize False Alarm Rate (FAR = share of healthy encounters with ≥1 alarm, i.e. prob(deterioration) ≥ 0.5 at least once).

Only execute the cell below when running on Google Colab.

In [3]:
!pip install -q lightgbm scikit-learn optuna pandas pyarrow "optuna-integration[lightgbm]"


from google.colab import drive
drive.mount('/content/drive')

DRIVE_DATA_DIR = '/content/drive/MyDrive/MedHack 2026/data'  # Update if your folder is elsewhere

ZIP_PATH = '/content/drive/MyDrive/MedHack 2026/data/Archive.zip'
!unzip -q "{ZIP_PATH}" -d /content/dataset
!ls /content/dataset/data

TRAIN_PATH = '/content/dataset/data/train_features.parquet'
TEST_PATH = '/content/dataset/data/test_features.parquet'
HOLDOUT_PATH = '/content/dataset/data/holdout_features.parquet'

print("Loading features from Google Drive...")
train = pd.read_parquet(TRAIN_PATH)
test = pd.read_parquet(TEST_PATH)
holdout = pd.read_parquet(HOLDOUT_PATH)

zsh:1: /usr/local/bin/pip: bad interpreter: /usr/local/opt/python@3.11/bin/python3.11: no such file or directory


ModuleNotFoundError: No module named 'google.colab'

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.pruners import MedianPruner
from sklearn.metrics import f1_score, classification_report, average_precision_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from pathlib import Path
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Read the three precomputed feature tables (train / test / holdout) from the local data folder.
DATA_DIR = Path("../data")
print("Loading features locally...")
train, test, holdout = (
    pd.read_parquet(DATA_DIR / f"{split}_features.parquet")
    for split in ("train", "test", "holdout")
)

Loading features locally...


In [3]:

# Columns that must never be used as model inputs. Only the target is listed here;
# every other column of the feature table starts out as a candidate feature.
EXCLUDE_COLS = ["label"]


def macro_fpr(y_true, y_pred, n_classes=4):
    """Return the unweighted mean of the per-class one-vs-rest false positive rates.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        n_classes: Number of classes; labels are taken to be 0 .. n_classes - 1.

    Returns:
        Mean over classes of FP / (number of samples whose true class differs).
        A class with no negative samples contributes 0.
    """
    cm = confusion_matrix(y_true, y_pred, labels=range(n_classes))
    grand_total = cm.sum()
    predicted_totals = cm.sum(axis=0)  # column sums: how often each class was predicted
    actual_totals = cm.sum(axis=1)     # row sums: how often each class truly occurred

    rates = []
    for k in range(n_classes):
        false_pos = predicted_totals[k] - cm[k, k]
        negatives = grand_total - actual_totals[k]
        rates.append(false_pos / negatives if negatives > 0 else 0)
    return np.mean(rates)


def false_alarm_rate(y_true, y_prob, encounter_ids, threshold=0.5):
    """Encounter-level false alarm rate.

    FAR = fraction of all-negative (healthy) encounters in which the predicted
    probability of deterioration (classes 1, 2, 3 combined) reaches ``threshold``
    at least once.

    Args:
        y_true: Per-row true labels (0 = healthy); any array-like or Series.
        y_prob: Per-row class probabilities, 2-D with class 0 in column 0.
        encounter_ids: Per-row encounter identifier; any array-like or Series.
        threshold: Alarm threshold; an alarm fires when the summed positive
            probability is >= threshold.

    Returns:
        float in [0, 1]; 0.0 when there are no healthy encounters.

    Raises:
        ValueError: (from pandas) if the three inputs differ in length.
    """
    # Rows are matched by POSITION. Converting to plain arrays drops any pandas
    # index, so Series with differing indexes cannot be silently misaligned (or
    # rejected) by the DataFrame constructor's label-based alignment.
    labels = np.asarray(y_true)
    groups = np.asarray(encounter_ids)
    probs = np.asarray(y_prob)

    # Probability of any deterioration class (1, 2, 3).
    prob_positive = probs[:, 1:].sum(axis=1)
    rows = pd.DataFrame({"encounter_id": groups, "label": labels, "prob_pos": prob_positive})

    # One row per encounter: worst label seen and highest alarm probability.
    per_encounter = rows.groupby("encounter_id").agg({"label": "max", "prob_pos": "max"})
    # An encounter is healthy only when every one of its labels is 0 (max == 0).
    healthy = per_encounter[per_encounter["label"] == 0]
    n_healthy = len(healthy)
    if n_healthy == 0:
        return 0.0
    n_false_alarms = (healthy["prob_pos"] >= threshold).sum()
    return float(n_false_alarms / n_healthy)


# Progress marker for the notebook run: reached once the metric helpers in this cell are defined.
print("Imports OK")

Imports OK


In [4]:


# Candidate features are every column of the train table except the excluded ones.
feature_cols = [name for name in train.columns if name not in EXCLUDE_COLS]

# Feature matrices are copied so later in-place conversions do not touch the source tables.
X_train, X_test = train[feature_cols].copy(), test[feature_cols].copy()
# Targets as plain integers.
y_train, y_test = train["label"].astype(int), test["label"].astype(int)

# LightGBM requires numeric features only - convert datetime, drop string/object
def ensure_numeric_features(df, cols):
    """Restrict ``cols`` to columns LightGBM can consume as numbers.

    Numeric columns are kept unchanged. Datetime columns are kept and converted
    to int64 -- the conversion is written back into ``df`` itself, so the
    caller's frame is modified. Every other column (strings, objects, ...) is
    left out, and the set of omitted names is printed.

    Args:
        df: Frame containing at least the columns named in ``cols``.
        cols: Candidate feature column names, in the desired order.

    Returns:
        (frame, kept): ``df`` restricted to the kept columns, and the list of
        kept column names in their original order.
    """
    is_datetime = pd.api.types.is_datetime64_any_dtype
    is_numeric = pd.api.types.is_numeric_dtype

    keep_cols = []
    for name in cols:
        col_dtype = df[name].dtype
        if is_datetime(col_dtype):
            # Integer form of the timestamp (in-place on the caller's frame).
            df[name] = df[name].astype("int64")
        elif not is_numeric(col_dtype):
            continue  # string/object columns are not usable as features
        keep_cols.append(name)

    dropped = set(cols) - set(keep_cols)
    if dropped:
        print(f"Dropped non-numeric columns: {dropped}")
    return df[keep_cols], keep_cols

# Narrow the feature list to numeric-compatible columns (decided on the train matrix).
X_train, feature_cols = ensure_numeric_features(X_train, feature_cols)
X_test = X_test[feature_cols].copy()
# Mirror the datetime -> int64 conversion on the test matrix so both matrices match.
datetime_test_cols = [
    name for name in feature_cols
    if pd.api.types.is_datetime64_any_dtype(X_test[name].dtype)
]
for name in datetime_test_cols:
    X_test[name] = X_test[name].astype("int64")

# Per-class row counts, ordered by class id (reused for the class weights below).
class_counts = y_train.value_counts().sort_index()

print(f"Train: {X_train.shape}, Test: {X_test.shape}, Holdout: {holdout.shape}")
print(f"Features used: {len(feature_cols)}")
print(f"\nTrain label distribution:\n{class_counts}")

# Balanced class weights from train (same as medhack_pipeline): total / (n_classes * count),
# then classes 2 and 3 are boosted further.
total = len(y_train)
class_weights = {}
for cls, cnt in class_counts.items():
    class_weights[cls] = total / (len(class_counts) * cnt)
class_weights[2] *= 3.0
class_weights[3] *= 2.0
sample_weights = y_train.map(class_weights).values
print(f"\nClass weights: {class_weights}")

# Encounter ids for the test rows, needed by the encounter-level FAR metric.
print("Loading encounter_ids for FAR...")
test_encounter_ids = pd.read_csv(DATA_DIR / "test_data.csv", usecols=["encounter_id"])["encounter_id"]
assert len(test_encounter_ids) == len(X_test), f"Row mismatch: {len(test_encounter_ids)} vs {len(X_test)}"

# Alarm threshold used when computing FAR during tuning
FAR_THRESHOLD = 0.5

# --- Two-phase tuning configuration ---
SAMPLE_FRAC_PHASE1 = 0.1   # 10% of encounters for fast Phase 1 exploration
N_TOP_PHASE1 = 5           # Top configs to carry to Phase 2
PHASE2_FULL_DATA = True    # Phase 2 uses full train (False = 50% sample)
RANDOM_STATE = 42


def _sample_encounters_by_label(labels_per_encounter, frac, seed):
    """Draw ``frac`` of the encounters within each encounter-level label group; return their ids."""
    return labels_per_encounter.groupby(labels_per_encounter, group_keys=False).apply(
        lambda grp: grp.sample(frac=frac, random_state=seed)
    ).index


# Each encounter is represented by its highest label, then sampled per label group.
encounter_labels = train.groupby("encounter_id")["label"].max()
encounters_phase1 = _sample_encounters_by_label(encounter_labels, SAMPLE_FRAC_PHASE1, RANDOM_STATE)
phase1_mask = train["encounter_id"].isin(encounters_phase1)

# Phase 1 training subset: all rows of the sampled encounters.
X_tune = X_train[phase1_mask].copy()
y_tune = y_train[phase1_mask].values
w_tune = sample_weights[phase1_mask.values]

# Phase 2 subset: None means "use all rows"; otherwise a 50% encounter-level stratified sample.
phase2_mask = None
if not PHASE2_FULL_DATA:
    encounters_phase2 = _sample_encounters_by_label(encounter_labels, 0.5, RANDOM_STATE + 1)
    phase2_mask = train["encounter_id"].isin(encounters_phase2)

print(f"\nPhase 1 tune sample: {X_tune.shape[0]:,} rows ({SAMPLE_FRAC_PHASE1*100:.0f}% of train)")
print(f"Phase 2: {'full' if PHASE2_FULL_DATA else '50%'} train data")

# The raw tables are no longer needed; release them before tuning starts.
del train, test
gc.collect()

Dropped non-numeric columns: {'encounter_id'}
Train: (2109600, 257), Test: (451440, 257), Holdout: (452880, 258)
Features used: 257

Train label distribution:
label
0    1548998
1     359553
2     139107
3      61942
Name: count, dtype: int64

Class weights: {0: 0.34047816717645857, 1: 1.4668213031180382, 2: 11.37397830447066, 3: 17.028833424816764}
Loading encounter_ids for FAR...

Phase 1 tune sample: 211,680 rows (10% of train)
Phase 2: full train data


20

## Optuna Two-Phase Multi-Objective Tuning

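**Phase 1:** Fast exploration on a 10% encounter-level stratified sample, using LightGBM early stopping (LightGBMPruningCallback does not support multi-objective studies, so no Optuna pruning callback is used).  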
**Phase 2:** Top configs re-evaluated on full train data.

**Objectives:** (A) Maximize Mean Macro-AUPRC, (B) Minimize FAR — % of healthy encounters where prob(deterioration) ≥ 0.5.

In [5]:
def objective(trial, X_tr, y_tr, w_tr):
    """Train one LightGBM candidate and score it for the multi-objective study.

    The model is fitted on (X_tr, y_tr) with per-row weights w_tr. The
    notebook-level X_test / y_test serve both as the early-stopping validation
    set and as the scoring set.
    NOTE(review): because the same split drives early stopping and scoring,
    the reported scores are presumably optimistic -- confirm this is intended.

    Args:
        trial: Optuna trial used to sample hyperparameters and store attributes.
        X_tr: Training feature matrix.
        y_tr: Training labels.
        w_tr: Per-row training weights.

    Returns:
        (mean_auprc, far): macro-averaged AUPRC over the four classes and the
        encounter-level false alarm rate at FAR_THRESHOLD.
    """
    # Hyperparameters are sampled first, always in this order.
    search_space = {
        "num_leaves": trial.suggest_int("num_leaves", 63, 511),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.3, 0.8),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 50, 300),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
    }
    # Settings shared by every trial.
    fixed = {
        "objective": "multiclass",
        "num_class": 4,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "seed": 42,
    }
    params = {**fixed, **search_space}

    train_set = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
    valid_set = lgb.Dataset(X_test, label=y_test, reference=train_set)

    # LightGBMPruningCallback does not support multi-objective studies (it needs study.direction),
    # so each trial is bounded only by early stopping on the validation loss.
    training_callbacks = [
        lgb.early_stopping(50, verbose=False),
        lgb.log_evaluation(0),
    ]
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1500,  # upper bound; early stopping usually ends training sooner
        valid_sets=[valid_set],
        valid_names=["valid"],
        callbacks=training_callbacks,
    )

    y_prob = model.predict(X_test)
    # One-hot targets so AUPRC can be computed per class and macro-averaged.
    mean_auprc = average_precision_score(
        label_binarize(y_test, classes=[0, 1, 2, 3]), y_prob, average="macro"
    )
    far = false_alarm_rate(y_test, y_prob, test_encounter_ids, threshold=FAR_THRESHOLD)

    # Stored on the trial so later cells can read them back.
    for attr_name, attr_value in (
        ("mean_auprc", mean_auprc),
        ("far", far),
        ("best_iteration", model.best_iteration),
    ):
        trial.set_user_attr(attr_name, attr_value)
    return mean_auprc, far

In [6]:
# Two-phase tuning driver: Phase 1 explores hyperparameters on the small stratified sample,
# Phase 2 retrains the best Phase 1 configs on the Phase 2 data and picks the winner by AUPRC.
from types import SimpleNamespace

N_TRIALS_PHASE1 = 60   # Phase 1 is comparatively cheap (10% data), so more trials are affordable
N_TOP_PHASE1 = min(N_TOP_PHASE1, 5)  # Configs to carry to Phase 2
CONFIG_SAVE_PATH = Path(DATA_DIR) / "phase1_best_configs.pkl"

optuna.logging.set_verbosity(optuna.logging.WARNING)
# NOTE: the objective never reports intermediate values, so this pruner has nothing to act on;
# it is kept only so the study configuration is unchanged.
pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=10, interval_steps=1)

# ---------- Phase 1: Fast exploration on stratified sample ----------
print("Phase 1: Exploring hyperparameters on stratified sample (with pruning)...")
study = optuna.create_study(
    directions=["maximize", "minimize"],  # (mean AUPRC, FAR)
    study_name="lgbm_medhack_phase1",
    pruner=pruner,
)
study.optimize(
    lambda t: objective(t, X_tune, y_tune, w_tune),
    n_trials=N_TRIALS_PHASE1,
    show_progress_bar=True,
)

print(f"\n--- Phase 1 Pareto Front ({len(study.best_trials)} trials) ---")
if not study.best_trials:
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed:
        raise RuntimeError("No completed trials in Phase 1. Check for errors or increase n_startup_trials.")
    # Fallback: use completed trials sorted by first objective
    best_for_phase2 = sorted(completed, key=lambda t: t.values[0] if t.values else 0, reverse=True)[:N_TOP_PHASE1]
else:
    # Pareto-optimal trials, best AUPRC first.
    best_for_phase2 = sorted(study.best_trials, key=lambda x: x.values[0], reverse=True)[:N_TOP_PHASE1]

for t in best_for_phase2:
    print(f"  Trial {t.number}: AUPRC={t.values[0]:.4f}, FAR={t.values[1]:.4f}")

# Save top configs: selected trials + their params, sorted by AUPRC (desc)
phase1_configs = [
    {
        "params": t.params.copy(),
        "trial_number": t.number,
        "phase1_auprc": t.values[0],
        "phase1_far": t.values[1],
        "best_iteration": t.user_attrs.get("best_iteration", 500),
    }
    for t in best_for_phase2
]
with open(CONFIG_SAVE_PATH, "wb") as f:
    pickle.dump(phase1_configs, f)
# Report the number actually saved: the Pareto front can hold fewer than N_TOP_PHASE1 trials.
print(f"\nSaved {len(phase1_configs)} best configs to {CONFIG_SAVE_PATH}")

# ---------- Phase 2: Refine top configs on full (or 50%) data ----------
if PHASE2_FULL_DATA or phase2_mask is None:
    X_phase2 = X_train
    y_phase2 = y_train.values
    w_phase2 = sample_weights
else:
    X_phase2 = X_train[phase2_mask].copy()
    y_phase2 = y_train[phase2_mask].values
    w_phase2 = sample_weights[phase2_mask.values]
n_phase2 = X_phase2.shape[0]
print(f"\nPhase 2: Evaluating top {len(phase1_configs)} configs on {n_phase2:,} rows...")

# One-hot test targets are identical for every config, so build them once.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])

phase2_results = []
for cfg in phase1_configs:
    params = cfg["params"].copy()
    params.update({
        "objective": "multiclass", "num_class": 4, "metric": "multi_logloss",
        "boosting_type": "gbdt", "verbosity": -1, "n_jobs": -1, "seed": 42,
    })
    n_rounds = max(cfg["best_iteration"], 500)
    # Datasets are rebuilt per config, as in Phase 1.
    # NOTE(review): presumably required because dataset construction depends on params -- confirm before hoisting.
    dtrain = lgb.Dataset(X_phase2, label=y_phase2, weight=w_phase2)
    dval = lgb.Dataset(X_test, label=y_test, reference=dtrain)
    model = lgb.train(
        params, dtrain,
        num_boost_round=min(n_rounds * 2, 2500),  # Allow some headroom over the Phase 1 iteration count
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)],
    )
    y_prob = model.predict(X_test)
    mean_auprc = average_precision_score(y_test_bin, y_prob, average="macro")
    far = false_alarm_rate(y_test, y_prob, test_encounter_ids, threshold=FAR_THRESHOLD)
    phase2_results.append({
        "trial_number": cfg["trial_number"],
        "params": params,
        "best_iteration": model.best_iteration,
        "auprc": mean_auprc,
        "far": far,
    })
    print(f"  Trial {cfg['trial_number']}: AUPRC={mean_auprc:.4f}, FAR={far:.4f}")

best_phase2 = max(phase2_results, key=lambda r: r["auprc"])
# Lightweight stand-in exposing the same attributes later cells read from an Optuna trial
# (params, number, user_attrs).
best_trial = SimpleNamespace(
    params=best_phase2["params"],
    number=best_phase2["trial_number"],
    user_attrs={
        "mean_auprc": best_phase2["auprc"],
        "far": best_phase2["far"],
        "best_iteration": best_phase2["best_iteration"],
    },
)
print(f"\nSelected (Phase 2 best AUPRC): Trial {best_trial.number}")
print(f"  Mean AUPRC: {best_trial.user_attrs['mean_auprc']:.4f}")
print(f"  FAR: {best_trial.user_attrs['far']:.4f}")
print(f"  Best iteration: {best_trial.user_attrs['best_iteration']}")
print(f"\nBest params:")
for k, v in best_trial.params.items():
    # Only the tuned hyperparameters are printed; the fixed settings are skipped.
    if k not in ("objective", "num_class", "metric", "boosting_type", "verbosity", "n_jobs", "seed"):
        print(f"  {k}: {v}")

Phase 1: Exploring hyperparameters on stratified sample (with pruning)...


  0%|          | 0/60 [00:00<?, ?it/s]

[33m[W 2026-02-22 18:24:36,926][0m Trial 0 failed with parameters: {'num_leaves': 220, 'learning_rate': 0.049865595701212934, 'feature_fraction': 0.7157682564435697, 'bagging_fraction': 0.7157440438193362, 'bagging_freq': 3, 'min_child_samples': 296, 'reg_alpha': 5.087945070261666, 'reg_lambda': 0.10026475607295682, 'max_depth': 9} because of the following error: RuntimeError('A single direction cannot be retrieved from a multi-objective study. Consider using Study.directions to retrieve a list containing all directions.').[0m
Traceback (most recent call last):
  File "/Users/jackshee/Projects/Medhack/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/xj/ny4zmnqd609bv2rpjzzr11rm0000gn/T/ipykernel_16049/717579126.py", line 16, in <lambda>
    lambda t: objective(t, X_tune, y_tune, w_tune),
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/

RuntimeError: A single direction cannot be retrieved from a multi-objective study. Consider using Study.directions to retrieve a list containing all directions.

In [None]:
# Optional plots (Optuna's visualization module requires plotly). Any failure -- missing
# dependency or plotting error -- is reported and the notebook carries on.
try:
    from optuna.visualization import plot_pareto_front, plot_param_importances

    # Trade-off between the two objectives.
    fig1 = plot_pareto_front(study, target_names=["AUPRC", "FAR"])
    fig1.show()

    # Hyperparameter importance with respect to the first objective (AUPRC).
    def _auprc_of(trial):
        return trial.values[0]

    fig2 = plot_param_importances(study, target=_auprc_of)
    fig2.show()
except Exception as err:
    print(f"Visualization skipped: {err}")

## Train Final Model and Save

Trains on train set with best params, then saves model and feature columns.

In [None]:
# Build final params from the Phase 2 winner. `best_trial` is set by the tuning cell
# ("Selected (Phase 2 best AUPRC)"); re-deriving it from study.best_trials here would
# throw the Phase 2 evaluation away and fall back to the Phase 1 (10% sample) result.
best_params = best_trial.params.copy()
best_params.update({
    "objective": "multiclass",
    "num_class": 4,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "n_jobs": -1,
    "seed": 42,
})

# No validation set is used for the final fit, so the round count is fixed up front.
best_iteration = max(best_trial.user_attrs["best_iteration"], 500)  # Ensure minimum rounds

print(f"Training final model on train ({best_iteration} rounds)...")
dtrain_full = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
final_model = lgb.train(best_params, dtrain_full, num_boost_round=best_iteration)

# Save next to the data: DATA_DIR is where the submission cell loads lgb_model.pkl and
# feature_cols.pkl from, and it is defined in both local and Colab runs.
out_dir = Path(DATA_DIR)
with open(out_dir / "lgb_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
with open(out_dir / "feature_cols.pkl", "wb") as f:
    pickle.dump(feature_cols, f)

print(f"Saved to {out_dir}: lgb_model.pkl, feature_cols.pkl")

# Evaluate on test: predict once and reuse the probabilities for every metric.
test_proba = final_model.predict(X_test)
test_preds = test_proba.argmax(axis=1)
test_f1 = f1_score(y_test, test_preds, average="macro")
test_auprc = average_precision_score(label_binarize(y_test, classes=[0,1,2,3]), test_proba, average="macro")
test_far = false_alarm_rate(y_test, test_proba, test_encounter_ids, threshold=FAR_THRESHOLD)
print(f"\nTest: Macro F1={test_f1:.4f}, AUPRC={test_auprc:.4f}, FAR={test_far:.4f}")
print(classification_report(y_test, test_preds, target_names=["Normal", "Warning", "Crisis", "Death"]))

## Generate Submission

Predict on holdout and save `submission.csv` to Drive.

In [None]:
# Score the holdout set with the saved model and write submission.csv.
print("Loading model and feature columns...")
# NOTE: pickle.load executes code embedded in the file; only load pickles produced by this notebook.
with open(DATA_DIR / "lgb_model.pkl", "rb") as f:
    model = pickle.load(f)
with open(DATA_DIR / "feature_cols.pkl", "rb") as f:
    feature_cols = pickle.load(f)

# Features absent from the holdout table are zero-filled so the model input keeps its layout;
# say so explicitly, since a missing feature usually points to a feature-generation mismatch.
missing_cols = [col for col in feature_cols if col not in holdout.columns]
if missing_cols:
    print(f"Warning: {len(missing_cols)} feature(s) missing from holdout, filled with 0: {missing_cols}")
for col in missing_cols:
    holdout[col] = 0

X_holdout = holdout[feature_cols].copy()
# Convert datetime to numeric (same as train/test)
for col in feature_cols:
    if pd.api.types.is_datetime64_any_dtype(X_holdout[col].dtype):
        X_holdout[col] = X_holdout[col].astype("int64")
print(f"Holdout shape: {X_holdout.shape}")

proba = model.predict(X_holdout)
predictions = proba.argmax(axis=1)  # most probable class per row

print(f"\nPrediction distribution:")
for label in sorted(np.unique(predictions)):
    count = (predictions == label).sum()
    print(f"  Label {label}: {count:,} ({count/len(predictions)*100:.1f}%)")

# IDs are 1-based row positions in holdout order.
submission = pd.DataFrame({
    "ID": np.arange(1, len(predictions) + 1),
    "predicted_label": predictions,
})
# Write to Drive when the Colab setup cell defined DRIVE_DATA_DIR; otherwise fall back to the
# local data folder, since DRIVE_DATA_DIR does not exist outside Colab.
sub_path = Path(globals().get("DRIVE_DATA_DIR", DATA_DIR)) / "submission.csv"
submission.to_csv(sub_path, index=False)
print(f"\nSaved {sub_path}")

# Verify if sample_submission exists
sample_path = DATA_DIR / "sample_submission.csv"
if sample_path.exists():
    sample = pd.read_csv(sample_path)
    assert len(submission) == len(sample), f"Row mismatch: {len(submission)} vs {len(sample)}"
    assert list(submission.columns) == list(sample.columns), "Column mismatch"
    print("Format verified against sample_submission.csv!")

In [None]:
# Colab-only convenience: trigger a browser download of the submission file saved on Drive.
from google.colab import files

submission_file = Path(DRIVE_DATA_DIR) / "submission.csv"
files.download(str(submission_file))
print("Downloaded submission.csv")