In [2]:
# Cell 1 - Imports & setup
import os
import json
import numpy as np
import pandas as pd
import joblib
import mlflow

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import sys
import os
# Put the parent directory on sys.path so the `src.*` imports that follow can
# be resolved. NOTE(review): this presumably relies on the notebook being run
# from a direct subfolder of the project root - confirm the working directory.
sys.path.append(os.path.abspath(".."))

from src.metrics import multiclass_and_binary_metrics
from src.utils import save_json, save_df, save_oof

# Select the MLflow experiment that the runs in this notebook are logged under.
mlflow.set_experiment("iml2025_project")

# Create the output folders used by later cells; a no-op if they already exist.
for out_dir in ("../models", "../logs/metrics"):
    os.makedirs(out_dir, exist_ok=True)


2025/12/06 22:00:16 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/06 22:00:16 INFO mlflow.store.db.utils: Updating database tables
2025/12/06 22:00:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/06 22:00:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/06 22:00:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/06 22:00:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.


In [3]:
# Cell 2 - Load data, create class2, save class_list
train = pd.read_csv("../data/train.csv")

# Binary target derived from class4: 0 for 'nonevent', 1 for every other label.
train['class2'] = train['class4'].ne('nonevent').astype(int)

# Sort the distinct class4 labels so the class order is the same on every run,
# then persist that order next to the models.
class_list = train['class4'].unique().tolist()
class_list.sort()
save_json(class_list, "../models/class_list.json")
print("Class list:", class_list)


Class list: ['II', 'Ia', 'Ib', 'nonevent']


In [5]:
# Cell 3 - Prepare X and y (class4 labels)
# Columns excluded from the feature matrix; 'partlybad' is dropped because it
# does not vary meaningfully.
drop_cols = ["id", "date", "class2", "partlybad"]

# Target: the four-class label, taken before it is removed from the features.
y_class4 = train['class4'].values

# Features: everything except the excluded columns and the target itself.
X = train.drop(columns=[*drop_cols, 'class4'])
feature_cols = list(X.columns)
print("X shape:", X.shape)


X shape: (450, 100)


In [6]:
# Cell 4 - CV splitter and helper to get multiclass OOF probs
# Shared 5-fold stratified splitter; shuffling is seeded so folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def oof_predict_proba_multiclass(estimator, X, y_class4, cv):
    """Compute out-of-fold multiclass probabilities for ``estimator``.

    Thin wrapper around ``cross_val_predict`` with ``method="predict_proba"``.

    Parameters
    ----------
    estimator : scikit-learn estimator exposing ``predict_proba``.
    X : feature matrix passed through to ``cross_val_predict``.
    y_class4 : multiclass target labels.
    cv : cross-validation splitter (or any value ``cross_val_predict`` accepts).

    Returns
    -------
    Array of shape (n_samples, n_classes) with the predicted probabilities,
    as returned by ``cross_val_predict``.
    """
    return cross_val_predict(
        estimator=estimator,
        X=X,
        y=y_class4,
        cv=cv,
        method="predict_proba",
    )


In [8]:
# Cell 5 - Baseline 1: Multinomial Logistic Regression
with mlflow.start_run(run_name="02_baseline_lr_multiclass"):
    # Standardise the features, then fit a logistic regression.
    # `multi_class` is intentionally not passed: the argument is deprecated
    # since scikit-learn 1.5 (FutureWarning, scheduled for removal). With the
    # lbfgs solver and more than two classes the default already fits a
    # multinomial model, so the fitted model is unchanged.
    lr_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(solver="lbfgs", max_iter=2000)),
    ])

    # Out-of-fold class probabilities, saved to disk for later use.
    oof_lr = oof_predict_proba_multiclass(lr_pipe, X, y_class4, cv)
    np.save("../models/oof_lr_multiclass.npy", oof_lr)

    # Score the OOF probabilities with the project metric helper.
    metrics_lr = multiclass_and_binary_metrics(y_class4, oof_lr, nonevent_label="nonevent", class_list=class_list)

    # Record the run in MLflow and keep a CSV copy of the metrics.
    mlflow.log_params({"model":"LogisticRegression"})
    mlflow.log_metrics(metrics_lr)
    save_df(pd.DataFrame([metrics_lr]).assign(model="LogisticRegression"), "../logs/metrics/baseline_metrics_lr.csv")
    print("LR metrics:", metrics_lr)




LR metrics: {'multiclass_logloss': 0.9140677134712362, 'class4_accuracy': 0.6533333333333333, 'binary_logloss': 0.39484441429646344, 'class2_accuracy': 0.8666666666666667, 'perplexity': 1.4841532599274976}




In [11]:
# Cell 6 - Baseline 2: Random Forest (multiclass)
with mlflow.start_run(run_name="02_baseline_rf_multiclass"):
    # Seeded forest so repeated runs give the same out-of-fold predictions.
    rf = RandomForestClassifier(n_estimators=300, random_state=42)

    # Out-of-fold class probabilities, saved to disk for later use.
    oof_rf = oof_predict_proba_multiclass(rf, X, y_class4, cv)
    np.save("../models/oof_rf_multiclass.npy", oof_rf)

    # Score the OOF probabilities with the project metric helper.
    metrics_rf = multiclass_and_binary_metrics(
        y_class4,
        oof_rf,
        nonevent_label="nonevent",
        class_list=class_list,
    )

    # Log the tree count straight from the estimator so the MLflow record
    # always matches the model defined above.
    mlflow.log_params({"model": "RandomForest", "n_estimators": rf.n_estimators})
    mlflow.log_metrics(metrics_rf)

    # One-row metrics table with the model name as the last column.
    rf_metrics_row = pd.DataFrame([{**metrics_rf, "model": "RandomForest"}])
    save_df(rf_metrics_row, "../logs/metrics/baseline_metrics_rf.csv")
    print("RF metrics:", metrics_rf)


RF metrics: {'multiclass_logloss': 0.7898848328804651, 'class4_accuracy': 0.6622222222222223, 'binary_logloss': 0.3442006316893897, 'class2_accuracy': 0.8533333333333334, 'perplexity': 1.4108616706742998}


## Needs to be included in the report

In [None]:
# Cell 7 - Summary of baseline metrics
# One record per baseline: the model name followed by its metric values.
rows = [
    {"model": "LogisticRegression", **metrics_lr},
    {"model": "RandomForest", **metrics_rf},
]
df_summary = pd.DataFrame(rows)
save_df(df_summary, "../logs/metrics/baseline_metrics_summary.csv")
df_summary


Unnamed: 0,model,multiclass_logloss,class4_accuracy,binary_logloss,class2_accuracy,perplexity
0,LogisticRegression,0.914068,0.653333,0.394844,0.866667,1.484153
1,RandomForest,0.789885,0.662222,0.344201,0.853333,1.410862
