In [1]:
# ──────────────────────────────────────────────────────────────────────
# Imports
# ──────────────────────────────────────────────────────────────────────
import numpy as np                
import pandas as pd              
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
from scikeras.wrappers import KerasRegressor
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers
import optuna
import mlflow
import mlflow.sklearn

# ──────────────────────────────────────────────────────────────────────
# Metric helpers
# ──────────────────────────────────────────────────────────────────────
def rmsle_np(y_true, y_pred):
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

rmsle_scorer = make_scorer(rmsle_np, greater_is_better=False)

def rmsle_tf(y_true, y_pred):
    return keras.backend.sqrt(
        keras.losses.mean_squared_logarithmic_error(y_true, y_pred)
    )

# ──────────────────────────────────────────────────────────────────────
# Load data + feature-engineering transformer
# ──────────────────────────────────────────────────────────────────────
df = pd.read_csv("playground-series-s5e5/train.csv")
y  = df["Calories"]
X  = df.drop(columns=["Calories", "id"])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X["Sex"]
)

def add_features(Xdf: pd.DataFrame) -> pd.DataFrame:
    X = Xdf.copy()
    X["BMI"] = (X["Weight"] / (X["Height"] / 100.0) ** 2).round(2)
    X["Timed_Intensity"] = X["Duration"] * X["Heart_Rate"]
    X["Heart_Rate_Zone"] = X["Heart_Rate"] / (220 - X["Age"]) * 100
    X["Mifflin_Jeor_BMR"] = np.where(
        X["Sex"] == "male",
        (10 * X["Weight"]) + (6.25 * X["Height"]) - (5 * X["Age"]) + 5,
        (10 * X["Weight"]) + (6.25 * X["Height"]) - (5 * X["Age"]) - 161,
    )
    return X

feat_eng = FunctionTransformer(add_features, validate=False)

# ──────────────────────────────────────────────────────────────────────
# Pre-processor for the NN
# • numerics → StandardScaler
# • Sex → One-hot (drop first)
# We discover numeric columns **after** feat_eng at fit-time.
# ──────────────────────────────────────────────────────────────────────
def num_selector(df):
    return df.drop(columns=["Sex"]).select_dtypes(include="number").columns

preprocess_nn = ColumnTransformer(
    [
        ("num", StandardScaler(), num_selector),
        ("cat", OneHotEncoder(drop="first", sparse_output=False), ["Sex"]),
    ],
    remainder="drop",
)

# ──────────────────────────────────────────────────────────────────────
# Keras model factory  (SciKeras passes input_dim automatically)
# ──────────────────────────────────────────────────────────────────────
def build_nn(
    lr: float = 1e-3,
    n1: int = 128,
    n2: int = 64,
    dropout: float = 0.20,
    kernel_regularizer=None,
    *,                         
    meta: dict                 
) -> keras.Model:
    """
    Parameters
    ----------
    lr : learning-rate for Adam
    n1, n2 : units in the first / second hidden layer
    dropout : dropout ratio applied after the first hidden layer
    meta : dict provided by SciKeras that contains run-time information,
           most importantly `meta["n_features_in_"]` → the number of
           columns seen **after** the preprocessing pipeline.
    """
    n_features = meta["n_features_in_"]         

    inputs = keras.Input(shape=(n_features,))
    x = layers.Dense(n1, activation="relu",
                     kernel_regularizer=kernel_regularizer)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(n2, activation="relu",
                     kernel_regularizer=kernel_regularizer)(x)
    outputs = layers.Dense(1,
                           kernel_regularizer=kernel_regularizer)(x)

    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mean_squared_logarithmic_error",   
        metrics=[rmsle_tf],                      
    )
    return model

# ──────────────────────────────────────────────────────────────────────
# 1.  Build the (parameterised) pipeline
# ──────────────────────────────────────────────────────────────────────
early_stop = callbacks.EarlyStopping(
    monitor="val_rmsle_tf",  
    mode="min",              
    patience=15,             
    restore_best_weights=True
)

nn_reg = KerasRegressor(
    model = build_nn,
    lr = 1e-3,          
    n1 = 128,
    n2 = 64,
    dropout = 0.2,
    epochs = 50,
    batch_size = 256,
    verbose = 2,
    validation_split = 0.1,
    callbacks = [early_stop],
    kernel_regularizer=None,
    random_state=42
)

pipe_nn = Pipeline([
    ("feat_eng", feat_eng),
    ("preprocess", preprocess_nn),
    ("model", nn_reg),
])




In [None]:
# ──────────────────────────────────────────────────────────────────────
#  OOF PREDICTIONS & VAL SCORE — Neural Net
# ──────────────────────────────────────────────────────────────────────
from pathlib import Path
from tqdm.notebook import tqdm
from IPython.display import display, clear_output

FE_VERSION = "v4_hrzone_bmr"      # just a tag
EXPERIMENT = "Calories-NN-OOF-VAL"

# 1. Re-use the pipeline definition from above
base_pipe = pipe_nn               # the NN pipeline with early_stop callback

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_nn = np.empty(len(X))
fold_table = []

# 2. MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT)

with mlflow.start_run(run_name="nn_oof_fit",
                      tags={"fe_version": FE_VERSION}):

    for fold, (tr_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=kf.get_n_splits()), 1):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        # 3. Fresh clone – silence epoch logs for CV
        fold_model = clone(base_pipe).set_params(model__verbose=0)

        fold_model.fit(X_tr, y_tr)
        preds = fold_model.predict(X_val)
        oof_nn[val_idx] = preds

        fold_rmsle = rmsle_np(y_val, preds)
        mlflow.log_metric(f"fold{fold}_rmsle", fold_rmsle)

        fold_table.append({"fold": fold, "rmsle": fold_rmsle})
        clear_output(wait=True)
        display(pd.DataFrame(fold_table))

    cv_rmsle = np.mean([row["rmsle"] for row in fold_table])
    mlflow.log_metric("cv_rmsle", cv_rmsle)
    print(f"5-fold CV RMSLE: {cv_rmsle:.5f}")

    # 4. Save + log the OOF vector
    Path("oof").mkdir(exist_ok=True)
    np.save("oof/oof_nn.npy", oof_nn)
    mlflow.log_artifact("oof/oof_nn.npy", artifact_path="oof")

In [None]:
# ──────────────────────────────────────────────────────────────────────
# 1.  Load the Kaggle test set
# ──────────────────────────────────────────────────────────────────────
df_test = pd.read_csv("playground-series-s5e5/test.csv")
ids = df_test["id"]
X_test = df_test.drop(columns=["id"])

# ──────────────────────────────────────────────────────────────────────
# 2a.  If the pipeline is still in memory
# ──────────────────────────────────────────────────────────────────────
test_preds = pipe_nn.predict(X_test)

# ---------- OR ---------- (uncomment if you need to reload) ----------
# from mlflow import sklearn as mlflow_sklearn
# pipe_nn = mlflow_sklearn.load_model("runs:/<RUN_ID>/model")
# test_preds = pipe_nn.predict(X_test)
# ---------------------------------------------------------------------

# Ensure no negative calories (good practice for RMSLE submissions)
test_preds = np.maximum(test_preds, 0)

# ──────────────────────────────────────────────────────────────────────
# 3.  Build and save the submission file
# ──────────────────────────────────────────────────────────────────────
submission = pd.DataFrame({"id": ids, "Calories": test_preds})
submission.to_csv("submission_nn.csv", index=False)

print("✅ submission_nn.csv written (shape:", submission.shape, ")")

In [3]:
import optuna
from optuna.pruners import MedianPruner
from sklearn.model_selection import StratifiedKFold

# ------------------------------------------------------------------
# 1.  Stratified K-fold (bin calories into quartiles)
# ------------------------------------------------------------------
y_bins = pd.qcut(y, 4, labels=False)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ------------------------------------------------------------------
# 2.  Optuna study with pruning and MLflow logging
# ------------------------------------------------------------------
mlflow.set_experiment("Calories-NN-Optuna-V2")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

with mlflow.start_run(run_name="optuna_search"):

    def objective(trial):

        # ---- hyper-params ----
        l2_coeff = trial.suggest_float("l2", 1e-6, 1e-3, log=True)
        params = {
            "model__lr": trial.suggest_float("lr", 1e-4, 3e-3, log=True),
            "model__n1": trial.suggest_int("n1", 64, 256, step=32),
            "model__n2": trial.suggest_int("n2", 32, 128, step=32),
            "model__dropout": trial.suggest_float("dropout", 0.2, 0.6),
            "model__epochs": trial.suggest_int("epochs", 25, 60),
            "model__batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
            "model__kernel_regularizer": regularizers.l2(l2_coeff),
            "model__verbose": 0,                      # silence CV folds
        }

        # ---- fresh model ----
        trial_pipe = clone(pipe_nn).set_params(**params)

        # ---- inner MLflow child run ----
        with mlflow.start_run(nested=True):
            cv_score = cross_val_score(
                trial_pipe, X, y,
                scoring=rmsle_scorer,
                cv=cv.split(X, y_bins),
                n_jobs=1
            ).mean()

            mlflow.log_metric("cv_rmsle", -cv_score)   # positive for UI
            mlflow.log_param("l2", l2_coeff)
            mlflow.log_params({k: v for k, v in params.items()
                               if k != "model__kernel_regularizer"})

        # Optuna still maximises −RMSLE
        return cv_score

    study = optuna.create_study(
        direction="maximize",
        pruner=MedianPruner(n_warmup_steps=4),   # wait 4 trials before pruning
        sampler=optuna.samplers.TPESampler(multivariate=True)
    )
    study.optimize(objective, n_trials=30, show_progress_bar=True)

    mlflow.log_metric("best_cv_rmsle", -study.best_value)

print("Best CV RMSLE:", -study.best_value)

2025/05/26 23:07:17 INFO mlflow.tracking.fluent: Experiment with name 'Calories-NN-Optuna-V2' does not exist. Creating a new experiment.
[I 2025-05-26 23:07:17,581] A new study created in memory with name: no-name-727c6911-c4d1-47ea-8c10-d06f8b3f00f9


  0%|          | 0/30 [00:00<?, ?it/s]



















🏃 View run treasured-newt-50 at: http://127.0.0.1:5000/#/experiments/24/runs/c5797d7b4a824990afa6cc465b81b1e4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/24
[I 2025-05-26 23:22:58,594] Trial 0 finished with value: -0.06468254666069691 and parameters: {'l2': 0.00025274841238301794, 'lr': 0.00035738964092190507, 'n1': 256, 'n2': 128, 'dropout': 0.4403837343001553, 'epochs': 30, 'batch_size': 128}. Best is trial 0 with value: -0.06468254666069691.
🏃 View run classy-cat-631 at: http://127.0.0.1:5000/#/experiments/24/runs/7426af05967646f4ae1ef9686300b5d0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/24
[I 2025-05-26 23:32:32,823] Trial 1 finished with value: -0.0721242905043702 and parameters: {'l2': 0.00013184071939038118, 'lr': 0.00010381050857881151, 'n1': 64, 'n2': 32, 'dropout': 0.47384858631965715, 'epochs': 55, 'batch_size': 256}. Best is trial 0 with value: -0.06468254666069691.
🏃 View run invincible-hawk-860 at: http://127.0.0.1:5000/#/experiments/24/runs/

In [None]:
# ──────────────────────────────────────────────────────────────────────
# 8.  Fit best model on FULL training data, save to MLflow
# ──────────────────────────────────────────────────────────────────────
#   (here we simply use the default hyper-params; plug study.best_params instead)
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-NN-Final")

with mlflow.start_run(run_name="nn_full_fit"):
    pipe_nn.fit(X, y)
    mlflow.sklearn.log_model(pipe_nn, artifact_path="model")
    mlflow.log_metric("full_train_rmsle", rmsle_np(y, pipe_nn.predict(X)))
    print("Model logged to MLflow ✅")