In [1]:
# ──────────────────────────────────────────────────────────────────────
# Imports
# ──────────────────────────────────────────────────────────────────────
import numpy as np                
import pandas as pd              
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
from scikeras.wrappers import KerasRegressor
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers
import optuna
import mlflow
import mlflow.sklearn

# ──────────────────────────────────────────────────────────────────────
# Metric helpers
# ──────────────────────────────────────────────────────────────────────
def rmsle_np(y_true, y_pred):
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

rmsle_scorer = make_scorer(rmsle_np, greater_is_better=False)

def rmsle_tf(y_true, y_pred):
    return keras.backend.sqrt(
        keras.losses.mean_squared_logarithmic_error(y_true, y_pred)
    )

# ──────────────────────────────────────────────────────────────────────
# Load data + feature-engineering transformer
# ──────────────────────────────────────────────────────────────────────
df = pd.read_csv("playground-series-s5e5/train.csv")
y  = df["Calories"]
X  = df.drop(columns=["Calories", "id"])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X["Sex"]
)

def add_features(Xdf: pd.DataFrame) -> pd.DataFrame:
    X = Xdf.copy()
    X["BMI"] = (X["Weight"] / (X["Height"] / 100.0) ** 2).round(2)
    X["Timed_Intensity"] = X["Duration"] * X["Heart_Rate"]
    X["Heart_Rate_Zone"] = X["Heart_Rate"] / (220 - X["Age"]) * 100
    X["Mifflin_Jeor_BMR"] = np.where(
        X["Sex"] == "male",
        (10 * X["Weight"]) + (6.25 * X["Height"]) - (5 * X["Age"]) + 5,
        (10 * X["Weight"]) + (6.25 * X["Height"]) - (5 * X["Age"]) - 161,
    )
    return X

feat_eng = FunctionTransformer(add_features, validate=False)

# ──────────────────────────────────────────────────────────────────────
# Pre-processor for the NN
# • numerics → StandardScaler
# • Sex → One-hot (drop first)
# We discover numeric columns **after** feat_eng at fit-time.
# ──────────────────────────────────────────────────────────────────────
def num_selector(df):
    return df.drop(columns=["Sex"]).select_dtypes(include="number").columns

preprocess_nn = ColumnTransformer(
    [
        ("num", StandardScaler(), num_selector),
        ("cat", OneHotEncoder(drop="first", sparse_output=False), ["Sex"]),
    ],
    remainder="drop",
)

# ──────────────────────────────────────────────────────────────────────
# Keras model factory  (SciKeras passes input_dim automatically)
# ──────────────────────────────────────────────────────────────────────
def build_nn(
    lr: float = 1e-3,
    n1: int = 256,
    n2: int = 128,
    n3: int = 0,                 # 0 ⇢ skip third layer
    dropout: float = 0.25,
    activation: str = "relu",
    kernel_regularizer=None,
    *, meta
) -> keras.Model:

    n_features = meta["n_features_in_"]

    inp = keras.Input(shape=(n_features,))
    x = layers.Dense(n1, activation=activation,
                     kernel_regularizer=kernel_regularizer)(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout)(x)

    x = layers.Dense(n2, activation=activation,
                     kernel_regularizer=kernel_regularizer)(x)

    if n3 > 0:                                    # optional 3rd hidden layer
        x = layers.Dense(n3, activation=activation,
                         kernel_regularizer=kernel_regularizer)(x)

    out = layers.Dense(1, kernel_regularizer=kernel_regularizer)(x)

    model = keras.Model(inp, out)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mean_squared_logarithmic_error",
        metrics=[rmsle_tf],
    )
    return model

# ──────────────────────────────────────────────────────────────────────
# 1.  Build the (parameterised) pipeline
# ──────────────────────────────────────────────────────────────────────
early_stop = callbacks.EarlyStopping(
    monitor="val_rmsle_tf",  
    mode="min",              
    patience=15,             
    restore_best_weights=True
)

nn_reg = KerasRegressor(
    model=build_nn,
    lr=1e-3, n1=256, n2=128, n3=0, activation="relu",
    dropout=0.25,
    epochs=60, batch_size=256,
    verbose=2, validation_split=0.1,
    callbacks=[early_stop],
    kernel_regularizer=None,
    random_state=42
)

pipe_nn = Pipeline([
    ("feat_eng", feat_eng),
    ("preprocess", preprocess_nn),
    ("model", nn_reg),
])

from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold

bins = pd.qcut(y, 4, labels=False)                      # stratify target
skf  = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-NN-Optuna-v4")

with mlflow.start_run(run_name="optuna_search"):

    def objective(trial):
        # ── hyper-param suggestions ──────────────────────────────
        l2_val = trial.suggest_float("l2", 1e-7, 1e-4, log=True)

        params = dict(
            model__lr      = trial.suggest_float("lr", 5e-4, 3e-3, log=True),
            model__n1      = trial.suggest_int("n1", 128, 320, step=32),
            model__n2      = trial.suggest_int("n2", 32, 160, step=32),
            model__n3      = trial.suggest_int("n3", 0, 128, step=32),
            model__dropout = trial.suggest_float("dropout", 0.10, 0.45),
            model__activation = trial.suggest_categorical(
                                  "activation", ["relu", "gelu", "selu"]),
            model__epochs  = trial.suggest_int("epochs", 40, 90),
            model__batch_size = trial.suggest_categorical(
                                  "batch_size", [128, 256, 512]),
            model__kernel_regularizer = regularizers.l2(l2_val),
            model__verbose = 0                       # keep CV quiet
        )

        pipe = clone(pipe_nn).set_params(**params)

        # ── nested MLflow run for the trial ──────────────────────
        with mlflow.start_run(nested=True):
            cv_score = cross_val_score(
                pipe, X, y,
                scoring=rmsle_scorer,
                cv=skf.split(X, bins),
                n_jobs=1
            ).mean()

            mlflow.log_metric("cv_rmsle", -cv_score)
            mlflow.log_param("l2", l2_val)
            mlflow.log_params({k: v for k, v in params.items()
                               if k != "model__kernel_regularizer"})
        return cv_score                                  # negative RMSLE

    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(multivariate=True, group=True),
        pruner=MedianPruner(n_warmup_steps=6)
    )
    study.optimize(objective, n_trials=150, show_progress_bar=True)

    mlflow.log_metric("best_cv_rmsle", -study.best_value)

print("Best 5-fold RMSLE :", -study.best_value)




2025/05/27 22:09:28 INFO mlflow.tracking.fluent: Experiment with name 'Calories-NN-Optuna-v4' does not exist. Creating a new experiment.
[I 2025-05-27 22:09:28,683] A new study created in memory with name: no-name-3ffe766e-49b7-48e0-9e5e-98a88ad9ec73


  0%|          | 0/150 [00:00<?, ?it/s]




🏃 View run thundering-mare-443 at: http://127.0.0.1:5000/#/experiments/25/runs/80ac197393a749bf97cc8d59f135faa9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/25
[I 2025-05-27 22:17:16,515] Trial 0 finished with value: -0.0639748131942595 and parameters: {'l2': 2.3726232728117057e-07, 'lr': 0.001842791573029376, 'n1': 192, 'n2': 160, 'n3': 32, 'dropout': 0.26775375550656966, 'activation': 'relu', 'epochs': 90, 'batch_size': 512}. Best is trial 0 with value: -0.0639748131942595.
🏃 View run painted-deer-413 at: http://127.0.0.1:5000/#/experiments/25/runs/0cf82838de4e47bb9c4ec889a5e17888
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/25
[I 2025-05-27 22:27:01,075] Trial 1 finished with value: -0.06851461780228991 and parameters: {'l2': 7.980785942358861e-07, 'lr': 0.0024997042758209257, 'n1': 192, 'n2': 96, 'n3': 96, 'dropout': 0.30632479142455415, 'activation': 'selu', 'epochs': 53, 'batch_size': 256}. Best is trial 0 with value: -0.0639748131942595.
🏃 View run b

In [10]:
best_params = study.best_trial.params.copy()      # ← raw Optuna dict

# 1️⃣ rebuild the kernel_regularizer
l2_coeff = best_params.pop("l2")
best_params["model__kernel_regularizer"] = regularizers.l2(l2_coeff)

# 2️⃣ add "model__" prefix to every remaining NN hyper-param
prefixed = {}
for k, v in best_params.items():
    if k in {"lr", "n1", "n2", "n3", "dropout",
             "activation", "epochs", "batch_size", "verbose"}:
        prefixed[f"model__{k}"] = v
    else:
        prefixed[k] = v          # already correctly scoped (e.g. kernel_regularizer)

best_params = prefixed           # now all keys are pipeline-legal

In [11]:
# ──────────────────────────────────────────────────────────────────────
#  OOF PREDICTIONS & VAL SCORE — Neural Net
# ──────────────────────────────────────────────────────────────────────
from pathlib import Path
from tqdm.notebook import tqdm
from IPython.display import display, clear_output

FE_VERSION = "v5_hrzone_bmr"      # just a tag
EXPERIMENT = "Calories-NN-OOF-VAL-V2"

# 1. Re-use the pipeline definition from above
base_pipe = pipe_nn               # the NN pipeline with early_stop callback

from tensorflow.keras import backend as K

bins = pd.qcut(y, 4, labels=False)                 # same binning as tuning
skf  = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_nn, fold_table = np.empty(len(X)), []

# 2. MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT)

with mlflow.start_run(run_name="nn_oof_fit", tags={"fe_version": FE_VERSION}):
    for fold, (tr_idx, val_idx) in enumerate(tqdm(skf.split(X, bins), total=5), 1):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        fold_model = clone(base_pipe).set_params(**best_params,
                                                 model__verbose=0)

        fold_model.fit(X_tr, y_tr)
        preds = fold_model.predict(X_val)
        oof_nn[val_idx] = np.maximum(preds, 0)      # clip negatives

        fold_rmsle = rmsle_np(y_val, preds)
        mlflow.log_metric(f"fold{fold}_rmsle", fold_rmsle)

        fold_table.append({"fold": fold, "rmsle": fold_rmsle})
        clear_output(wait=True)
        display(pd.DataFrame(fold_table))

        K.clear_session()                           # free memory each fold

    cv_rmsle = np.mean([row["rmsle"] for row in fold_table])
    mlflow.log_metric("cv_rmsle", cv_rmsle)
    print(f"5-fold CV RMSLE: {cv_rmsle:.5f}")

    Path("oof").mkdir(exist_ok=True)
    np.save("oof/oof_nn.npy", oof_nn)
    mlflow.log_artifact("oof/oof_nn.npy", artifact_path="oof")

Unnamed: 0,fold,rmsle
0,1,0.060875
1,2,0.061175
2,3,0.060545
3,4,0.060204
4,5,0.060441


5-fold CV RMSLE: 0.06065
🏃 View run nn_oof_fit at: http://127.0.0.1:5000/#/experiments/26/runs/55c9a2edbeaa4af48b9c8f22616fdbc8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/26


In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1.  Load the Kaggle test set
# ──────────────────────────────────────────────────────────────────────
df_test = pd.read_csv("playground-series-s5e5/test.csv")
ids = df_test["id"]
X_test = df_test.drop(columns=["id"])

# ──────────────────────────────────────────────────────────────────────
# 2a.  If the pipeline is still in memory
# ──────────────────────────────────────────────────────────────────────
test_preds = pipe_nn.predict(X_test)

# ---------- OR ---------- (uncomment if you need to reload) ----------
# from mlflow import sklearn as mlflow_sklearn
# pipe_nn = mlflow_sklearn.load_model("runs:/<RUN_ID>/model")
# test_preds = pipe_nn.predict(X_test)
# ---------------------------------------------------------------------

# Ensure no negative calories (good practice for RMSLE submissions)
test_preds = np.maximum(test_preds, 0)

# ──────────────────────────────────────────────────────────────────────
# 3.  Build and save the submission file
# ──────────────────────────────────────────────────────────────────────
submission = pd.DataFrame({"id": ids, "Calories": test_preds})
submission.to_csv("submission_nn.csv", index=False)

print("✅ submission_nn.csv written (shape:", submission.shape, ")")

In [None]:
# ──────────────────────────────────────────────────────────────────────
# 8.  Fit best model on FULL training data, save to MLflow
# ──────────────────────────────────────────────────────────────────────
#   (here we simply use the default hyper-params; plug study.best_params instead)
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-NN-Final")

with mlflow.start_run(run_name="nn_full_fit"):
    pipe_nn.fit(X, y)
    mlflow.sklearn.log_model(pipe_nn, artifact_path="model")
    mlflow.log_metric("full_train_rmsle", rmsle_np(y, pipe_nn.predict(X)))
    print("Model logged to MLflow ✅")