In [1]:
# Cell 1
import os
import numpy as np
import pandas as pd
import mlflow
import optuna
from optuna.integration import MLflowCallback

import sys
import os
sys.path.append(os.path.abspath(".."))
from src.utils import load_json, save_json
from src.metrics import multiclass_and_binary_metrics

mlflow.set_experiment("iml2025_project")
os.makedirs("../models", exist_ok=True)
os.makedirs("../logs/metrics", exist_ok=True)

train = pd.read_csv("../data/train_fe.csv")
y_class4 = train["class4"].values
y_binary = (train["class4"] != "nonevent").astype(int)

class_list = load_json("../models/class_list.json")
alpha = 0.7

  from .autonotebook import tqdm as notebook_tqdm
2025/12/07 13:23:59 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/07 13:23:59 INFO mlflow.store.db.utils: Updating database tables
2025/12/07 13:23:59 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/07 13:23:59 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/07 13:23:59 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/07 13:23:59 INFO alembic.runtime.migration: Will assume non-transactional DDL.


In [2]:
# Cell 2
oof_et = np.load("../models/oof_best_et_multiclass.npy")
oof_xgb = np.load("../models/oof_best_xgb_multiclass.npy")
oof_rf = np.load("../models/oof_best_rf_multiclass.npy")

oof_list = [oof_et, oof_xgb, oof_rf]
model_names = ["extratrees", "xgb", "rf"]
print("OOF shapes:", [o.shape for o in oof_list])


OOF shapes: [(450, 4), (450, 4), (450, 4)]


In [4]:
# Cell 3
def ensemble_obj(trial):
    M = len(oof_list)
    ws = np.array([trial.suggest_float(f"w{i}", 0.0, 1.0) for i in range(M)])
    ws = ws + 1e-12
    ws = ws / ws.sum()
    ens = sum(ws[i] * oof_list[i] for i in range(M))
    metrics = multiclass_and_binary_metrics(y_class4, ens, nonevent_label="nonevent", class_list=class_list)
    trial.set_user_attr("metrics", metrics)
    return alpha * metrics["binary_logloss"] + (1 - alpha) * metrics["multiclass_logloss"]


In [6]:
# Cell 4
study = optuna.create_study(direction="minimize", study_name="ensemble_weights")
mlflow_cb = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="composite_loss")
study.optimize(ensemble_obj, n_trials=300, callbacks=[mlflow_cb])

best_params = study.best_params
best_weights = np.array([best_params[f"w{i}"] for i in range(len(oof_list))])
best_weights = best_weights / best_weights.sum()
print("Best normalized weights:", best_weights)
save_json({"model_names": model_names, "weights": best_weights.tolist()}, "../models/ensemble_weights.json")


[I 2025-12-07 13:27:14,750] A new study created in memory with name: ensemble_weights
  mlflow_cb = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="composite_loss")
[I 2025-12-07 13:27:14,761] Trial 0 finished with value: 0.45372779399243723 and parameters: {'w0': 0.13025045818809733, 'w1': 0.48024038732250096, 'w2': 0.22287699596738175}. Best is trial 0 with value: 0.45372779399243723.
[I 2025-12-07 13:27:14,831] Trial 1 finished with value: 0.454492279972287 and parameters: {'w0': 0.023245082912541237, 'w1': 0.4326830956717531, 'w2': 0.22835779247175392}. Best is trial 0 with value: 0.45372779399243723.
[I 2025-12-07 13:27:14,891] Trial 2 finished with value: 0.4673017272318346 and parameters: {'w0': 0.8968948235459775, 'w1': 0.03573375131825085, 'w2': 0.5697670191621375}. Best is trial 0 with value: 0.45372779399243723.
[I 2025-12-07 13:27:14,950] Trial 3 finished with value: 0.46831228081964893 and parameters: {'w0': 0.4764737303252228, 'w1': 0.05870285555105403

Best normalized weights: [3.46139490e-01 6.53725344e-01 1.35166875e-04]


In [7]:
# Cell 5 - Compute ensemble OOF multiclass probs & metrics, save
ens_oof = sum(best_weights[i] * oof_list[i] for i in range(len(oof_list)))
np.save("../models/oof_ensemble_multiclass_raw.npy", ens_oof)

ens_metrics = multiclass_and_binary_metrics(y_class4, ens_oof, nonevent_label="nonevent", class_list=class_list)
print("Ensemble OOF raw metrics:", ens_metrics)
save_json(ens_metrics, "../models/ensemble_oof_metrics_raw.json")

save_json(ens_metrics, "../logs/metrics/ensemble_oof_metrics_raw.json")


Ensemble OOF raw metrics: {'multiclass_logloss': 0.7631013722708008, 'class4_accuracy': 0.6777777777777778, 'binary_logloss': 0.31911247633914175, 'class2_accuracy': 0.8711111111111111, 'perplexity': 1.3759060730812251}


