# LightGBM Hyperparameter Tuning with Optuna

**MedHack Frontiers** — Optuna tunes LightGBM on precomputed features.

## Setup (Google Colab)
1. Upload these files to a folder in Google Drive (e.g. `MyDrive/medhack-frontiers/`):
   - `train_features.parquet`, `test_features.parquet`, `holdout_features.parquet`
   - `train_data.csv`, `test_data.csv` (required for FAR, the encounter-level false alarm rate; the notebook reads the `encounter_id` column from `test_data.csv`)
   - `sample_submission.csv` (optional, for validation)
2. Update `DRIVE_DATA_DIR` below to match your folder path.
3. Run all cells.

**Note:** Train and test features are loaded as separate DataFrames (never concatenated) to avoid RAM issues. Tuning is multi-objective: maximize Mean Macro-AUPRC and minimize False Alarm Rate (FAR = share of healthy encounters with ≥1 alarm, where an alarm is a row whose summed probability for classes 1–3 is ≥ 0.5).

In [None]:
# Install dependencies
!pip install -q lightgbm scikit-learn optuna pandas pyarrow

In [None]:
# Mount Google Drive and set data path
# The mount makes the Drive contents visible under /content/drive so the
# files uploaded in the setup step can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

# Single folder used for everything: the parquet/CSV inputs are read from it
# and the pickled model, feature list and submission.csv are written to it.
DRIVE_DATA_DIR = '/content/drive/MyDrive/data'  # Update if your folder is elsewhere

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import f1_score, classification_report, average_precision_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from pathlib import Path
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')

# Path object for the Drive folder, so file names can be joined with `/`.
DATA_DIR = Path(DRIVE_DATA_DIR)

# Use all features (exclude label)
# Any column of the training frame that is not listed here is treated as a
# model input when the feature list is built.
EXCLUDE_COLS = ["label"]


def macro_fpr(y_true, y_pred, n_classes=4):
    """Return the unweighted mean of the per-class false positive rates.

    Each class is scored one-vs-rest: its false positives are the samples
    predicted as that class whose true label is a different class, and its
    negatives are all samples whose true label is a different class.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, same length as ``y_true``.
        n_classes: Number of classes; labels are taken to be
            ``0 .. n_classes - 1``.

    Returns:
        The mean of the ``n_classes`` per-class rates. A class with no
        negative samples contributes a rate of 0.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=range(n_classes))

    # Column totals count every sample predicted as a class; subtracting the
    # diagonal (correct predictions) leaves the false positives.
    false_positives = matrix.sum(axis=0) - np.diag(matrix)
    # Row totals count the true members of a class, so the remainder of the
    # matrix is that class's negative population.
    negatives = matrix.sum() - matrix.sum(axis=1)

    per_class = [
        fp / neg if neg > 0 else 0
        for fp, neg in zip(false_positives, negatives)
    ]
    return np.mean(per_class)


def false_alarm_rate(y_true, y_prob, encounter_ids, threshold=0.5):
    """Return the share of healthy encounters that trigger at least one alarm.

    An encounter is healthy when every one of its rows has label 0. It counts
    as a false alarm when, for at least one of its rows, the summed predicted
    probability of classes 1..K-1 is greater than or equal to ``threshold``.

    The three inputs are matched row by row, by position. They are converted
    to plain arrays first so that pandas index labels (for example a Series
    loaded from parquet next to one loaded from CSV) cannot re-pair or
    duplicate rows when the working frame is built.

    Args:
        y_true: Per-row true labels (0 = healthy).
        y_prob: Array of shape (n_rows, n_classes) with predicted class
            probabilities; column 0 is the healthy class.
        encounter_ids: Per-row encounter identifier.
        threshold: Alarm threshold on the summed non-zero-class probability.

    Returns:
        A float in [0, 1]; 0.0 when there are no healthy encounters.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """
    labels = np.asarray(y_true)
    probs = np.asarray(y_prob)
    ids = np.asarray(encounter_ids)
    if not (len(labels) == len(probs) == len(ids)):
        raise ValueError(
            "y_true, y_prob and encounter_ids must have the same number of rows: "
            f"{len(labels)}, {len(probs)}, {len(ids)}"
        )

    # Prob of any positive/deterioration class (1, 2, 3)
    prob_positive = probs[:, 1:].sum(axis=1)
    rows = pd.DataFrame({"encounter_id": ids, "label": labels, "prob_pos": prob_positive})

    # Max label == 0 means every row of the encounter is healthy; max
    # probability is the encounter's strongest alarm signal.
    per_encounter = rows.groupby("encounter_id").agg({"label": "max", "prob_pos": "max"})
    healthy = per_encounter[per_encounter["label"] == 0]
    n_healthy = len(healthy)
    if n_healthy == 0:
        return 0.0

    n_false_alarms = (healthy["prob_pos"] >= threshold).sum()
    return float(n_false_alarms / n_healthy)


print("Imports OK")

In [None]:
# Load parquet features from Drive (train and test separately to avoid RAM issues)
print("Loading features from Google Drive...")
train = pd.read_parquet(DATA_DIR / "train_features.parquet")
test = pd.read_parquet(DATA_DIR / "test_features.parquet")
# holdout is kept in memory untouched here; it is only used by the submission cell.
holdout = pd.read_parquet(DATA_DIR / "holdout_features.parquet")

# The feature list is derived from the training frame only and then applied
# to the test frame, so both matrices share the same columns in the same order.
feature_cols = [c for c in train.columns if c not in EXCLUDE_COLS]
X_train = train[feature_cols].copy()
y_train = train["label"].astype(int)
X_test = test[feature_cols].copy()
y_test = test["label"].astype(int)

print(f"Train: {X_train.shape}, Test: {X_test.shape}, Holdout: {holdout.shape}")
print(f"Features used: {len(feature_cols)}")
print(f"\nTrain label distribution:\n{y_train.value_counts().sort_index()}")

# Class weights from train (same as medhack_pipeline)
# Base weight is total / (n_classes * class_count), i.e. inversely
# proportional to class frequency. Classes 2 and 3 are then boosted by a
# further 3x and 2x, and each training row receives the weight of its class.
class_counts = y_train.value_counts().sort_index()
total = len(y_train)
class_weights = {c: total / (len(class_counts) * count) for c, count in class_counts.items()}
class_weights[2] = class_weights[2] * 3.0
class_weights[3] = class_weights[3] * 2.0
sample_weights = y_train.map(class_weights).values
print(f"\nClass weights: {class_weights}")

# Load encounter_ids for FAR (test set only)
# NOTE(review): only the row count is checked against X_test; this presumes
# test_data.csv rows are in the same order as test_features.parquet - confirm.
print("Loading encounter_ids for FAR...")
test_encounter_ids = pd.read_csv(DATA_DIR / "test_data.csv", usecols=["encounter_id"])["encounter_id"]
assert len(test_encounter_ids) == len(X_test), f"Row mismatch: {len(test_encounter_ids)} vs {len(X_test)}"

# Optuna threshold for FAR
FAR_THRESHOLD = 0.5

# The raw frames are no longer needed once X_*/y_* have been extracted;
# drop them and collect so the memory is available for training.
del train, test
gc.collect()

## Optuna Multi-Objective Hyperparameter Tuning

**Objective A (Maximize):** Mean Macro-AUPRC  
**Objective B (Minimize):** False Alarm Rate — % of healthy (all-negative) encounters where prob(deterioration) ≥ 0.5 at least once.

In [None]:
def objective(trial):
    """Train one LightGBM candidate and score it for the multi-objective study.

    The booster is fitted on the weighted training set with early stopping
    monitored on the test set, then scored on that same test set.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters and to
            store the scores and the early-stopped iteration as user attrs.

    Returns:
        Tuple ``(mean_auprc, far)``: macro-averaged average precision over the
        four classes, and the encounter-level false alarm rate at
        ``FAR_THRESHOLD``.
    """
    fixed_params = {
        "objective": "multiclass",
        "num_class": 4,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "seed": 42,
    }
    # Tunable hyperparameters (tuned for ~2.1M rows, 259 features).
    # The suggest_* calls are kept in a fixed order so a seeded sampler
    # draws them in the same sequence on every run.
    search_params = {
        "num_leaves": trial.suggest_int("num_leaves", 63, 511),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.3, 0.8),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 3, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 50, 300),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 12),
    }
    params = {**fixed_params, **search_params}

    train_set = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
    valid_set = lgb.Dataset(X_test, label=y_test, reference=train_set)

    # Stop after 50 rounds without improvement on the validation metric and
    # keep LightGBM quiet while doing so.
    training_callbacks = [
        lgb.early_stopping(50, verbose=False),
        lgb.log_evaluation(0),
    ]
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=2000,
        valid_sets=[valid_set],
        callbacks=training_callbacks,
    )

    class_probs = booster.predict(X_test)
    one_hot_truth = label_binarize(y_test, classes=[0, 1, 2, 3])
    mean_auprc = average_precision_score(one_hot_truth, class_probs, average="macro")
    far = false_alarm_rate(y_test, class_probs, test_encounter_ids, threshold=FAR_THRESHOLD)

    # Stored so the later cells can read them back from the selected trial.
    recorded = (
        ("mean_auprc", mean_auprc),
        ("far", far),
        ("best_iteration", booster.best_iteration),
    )
    for attr_name, attr_value in recorded:
        trial.set_user_attr(attr_name, attr_value)

    # Multi-objective: (maximize AUPRC, minimize FAR)
    return mean_auprc, far

In [None]:
N_TRIALS = 50  # Increase for more thorough search

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(
    directions=["maximize", "minimize"],  # max AUPRC, min FAR
    study_name="lgbm_medhack_multi",
    # NSGA-II is the sampler Optuna picks for multi-objective studies when
    # none is given; naming it here only adds a seed so that reruns propose
    # the same sequence of hyperparameters.
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
# gc_after_trial frees each trial's datasets and booster before the next one
# starts, which keeps peak RAM down over a long search.
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True, gc_after_trial=True)

# study.best_trials is the Pareto front: trials not dominated on (AUPRC, FAR).
print(f"\n--- Pareto Front ({len(study.best_trials)} trials) ---")
for t in study.best_trials:
    print(f"  Trial {t.number}: AUPRC={t.values[0]:.4f}, FAR={t.values[1]:.4f}")
# Select trial with highest AUPRC from Pareto front (or pick by preference)
best_trial = max(study.best_trials, key=lambda t: t.values[0])
print(f"\nSelected (highest AUPRC): Trial {best_trial.number}")
print(f"  Mean AUPRC: {best_trial.user_attrs['mean_auprc']:.4f}")
print(f"  FAR: {best_trial.user_attrs['far']:.4f}")
print(f"  Best iteration: {best_trial.user_attrs['best_iteration']}")
print(f"\nBest params:")
for k, v in best_trial.params.items():
    print(f"  {k}: {v}")

In [None]:
# Optional: visualization (requires plotly)
# Shows the AUPRC/FAR Pareto front and the hyperparameter importances with
# respect to the first objective (AUPRC). The whole cell is best-effort: any
# failure is reported and skipped so it never blocks the cells that follow.
try:
    from optuna.visualization import plot_pareto_front, plot_param_importances
    fig1 = plot_pareto_front(study, target_names=["AUPRC", "FAR"])
    fig1.show()
    fig2 = plot_param_importances(study, target=lambda t: t.values[0])  # by AUPRC
    fig2.show()
except Exception as e:
    # Deliberately broad: plotting is optional, so the error is only printed.
    print(f"Visualization skipped: {e}")

## Train Final Model and Save

Retrains on the train set with the selected trial's params, saves the model and feature columns to Drive, then reports metrics on the test set.

In [None]:
# Build final params from selected Pareto-optimal trial
# (re-selected here so this cell can be re-run on its own after the study).
best_trial = max(study.best_trials, key=lambda t: t.values[0])
best_params = best_trial.params.copy()
best_params.update({
    "objective": "multiclass",
    "num_class": 4,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "n_jobs": -1,
    "seed": 42,
})

# No validation set is passed below, so the round count is fixed up front:
# the trial's early-stopped iteration, raised to 500 if it stopped sooner.
best_iteration = best_trial.user_attrs["best_iteration"]
best_iteration = max(best_iteration, 500)  # Ensure minimum rounds

print(f"Training final model on train ({best_iteration} rounds)...")
dtrain_full = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
final_model = lgb.train(best_params, dtrain_full, num_boost_round=best_iteration)

# Save to Drive
out_dir = Path(DRIVE_DATA_DIR)
with open(out_dir / "lgb_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
with open(out_dir / "feature_cols.pkl", "wb") as f:
    pickle.dump(feature_cols, f)

print(f"Saved to {out_dir}: lgb_model.pkl, feature_cols.pkl")

# Evaluate on test
# Predict once and reuse the probabilities for every metric; scoring the
# full test matrix is the expensive step and its output does not change.
test_proba = final_model.predict(X_test)
test_preds = test_proba.argmax(axis=1)
test_f1 = f1_score(y_test, test_preds, average="macro")
test_auprc = average_precision_score(label_binarize(y_test, classes=[0, 1, 2, 3]), test_proba, average="macro")
test_far = false_alarm_rate(y_test, test_proba, test_encounter_ids, threshold=FAR_THRESHOLD)
print(f"\nTest: Macro F1={test_f1:.4f}, AUPRC={test_auprc:.4f}, FAR={test_far:.4f}")
print(classification_report(y_test, test_preds, target_names=["Normal", "Warning", "Crisis", "Death"]))

## Generate Submission

Predict on holdout and save `submission.csv` to Drive.

In [None]:
print("Loading model and feature columns...")
# NOTE: unpickling runs arbitrary code from the file. These two files are the
# ones written by the training cell; do not point this at untrusted pickles.
with open(DATA_DIR / "lgb_model.pkl", "rb") as f:
    model = pickle.load(f)
with open(DATA_DIR / "feature_cols.pkl", "rb") as f:
    feature_cols = pickle.load(f)

# Features the model was trained on but the holdout file lacks are filled
# with 0 so prediction can proceed. Report them, because a zero-filled
# feature is a real value to the model and can quietly degrade predictions.
missing_cols = [col for col in feature_cols if col not in holdout.columns]
if missing_cols:
    print(f"WARNING: {len(missing_cols)} feature column(s) missing from holdout, filled with 0: {missing_cols}")

# One reindex both orders the columns as in training and adds the missing
# ones, instead of inserting columns into the frame one at a time.
X_holdout = holdout.reindex(columns=feature_cols, fill_value=0)
print(f"Holdout shape: {X_holdout.shape}")

proba = model.predict(X_holdout)
predictions = proba.argmax(axis=1)

print(f"\nPrediction distribution:")
for label in sorted(np.unique(predictions)):
    count = (predictions == label).sum()
    print(f"  Label {label}: {count:,} ({count/len(predictions)*100:.1f}%)")

# IDs are assigned 1..N in holdout row order.
submission = pd.DataFrame({
    "ID": np.arange(1, len(predictions) + 1),
    "predicted_label": predictions,
})
sub_path = Path(DRIVE_DATA_DIR) / "submission.csv"
submission.to_csv(sub_path, index=False)
print(f"\nSaved {sub_path}")

# Verify if sample_submission exists
# Only the row count and the column names are compared with the sample.
sample_path = DATA_DIR / "sample_submission.csv"
if sample_path.exists():
    sample = pd.read_csv(sample_path)
    assert len(submission) == len(sample), f"Row mismatch: {len(submission)} vs {len(sample)}"
    assert list(submission.columns) == list(sample.columns), "Column mismatch"
    print("Format verified against sample_submission.csv!")

In [None]:
# Download submission to local machine (optional)
# Sends the submission.csv written to the Drive folder by the previous cell
# to the browser as a file download; only works inside Google Colab.
from google.colab import files
files.download(str(Path(DRIVE_DATA_DIR) / "submission.csv"))
print("Downloaded submission.csv")