In [None]:
# Packages
import pandas as pd
import numpy as np
from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.metrics         import make_scorer
from sklearn.base            import clone
from xgboost                 import XGBRegressor
import joblib, mlflow
import mlflow.sklearn
import optuna
import warnings
from optuna.integration import OptunaSearchCV, MLflowCallback
from optuna.distributions import FloatDistribution, IntDistribution

# Custom Metric for Training Feedback
def rmsle_xgb(preds, dtrain):
    """Root mean squared logarithmic error in XGBoost's custom-metric shape.

    Parameters
    ----------
    preds : array-like
        Raw model predictions. Negative values are clipped to 0 so that
        ``log1p`` stays defined.
    dtrain : object exposing ``get_label()``
        Container holding the ground-truth targets (presumably an
        ``xgboost.DMatrix`` -- only ``get_label()`` is used here).

    Returns
    -------
    tuple
        ``('rmsle', value)`` -- the metric name followed by its value.
    """
    targets = dtrain.get_label()
    non_negative = np.maximum(preds, 0)
    log_gap = np.log1p(non_negative) - np.log1p(targets)
    return 'rmsle', np.sqrt(np.mean(log_gap ** 2))

# Custom Metric for GridSearch (wrapped in make_scorer)
def rmsle_sklearn(y_true, y_pred):
    """Root mean squared logarithmic error with an sklearn-style signature.

    Predictions below zero are clipped to 0 before taking ``log1p``; the
    targets are used as given.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.
    """
    clipped_pred = np.maximum(y_pred, 0)
    log_residual = np.log1p(clipped_pred) - np.log1p(y_true)
    mean_squared_log_error = np.mean(log_residual ** 2)
    return np.sqrt(mean_squared_log_error)

# greater_is_better=False makes sklearn negate the metric, so scores produced
# by this scorer are negative RMSLE values (closer to 0 is better).
rmsle_scorer = make_scorer(rmsle_sklearn, greater_is_better=False)

# Data: 'Calories' is the target; 'id' is dropped from the features.
df = pd.read_csv('playground-series-s5e5/train.csv')
X = df.drop(columns=['Calories', 'id'])
y = df['Calories']

# 80/20 hold-out split, stratified on the 'Sex' column.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X['Sex'],
    random_state=42,
)

# Custom Feature Engineering
def add_bmi_intensity(X_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X_df`` with four engineered columns appended.

    Added columns, in order:

    * ``BMI`` -- ``Weight / (Height / 100) ** 2``, rounded to 2 decimals.
    * ``Timed_Intensity`` -- ``Duration * Heart_Rate``.
    * ``Heart_Rate_Zone`` -- ``Heart_Rate`` as a percentage of ``220 - Age``.
    * ``Mifflin_Jeor_BMR`` -- ``10*Weight + 6.25*Height - 5*Age`` plus 5 for
      rows where ``Sex == 'male'`` and minus 161 for every other row.

    The input frame is not modified. It must provide the columns ``Sex``,
    ``Age``, ``Height``, ``Weight``, ``Duration`` and ``Heart_Rate``.
    """
    height_scaled = X_df['Height'] / 100
    # Sex-independent part of the BMR formula; the offset is chosen per row below.
    bmr_base = (10 * X_df['Weight']) + (6.25 * X_df['Height']) - (5 * X_df['Age'])
    is_male = X_df['Sex'] == 'male'

    return X_df.assign(
        BMI=(X_df['Weight'] / height_scaled ** 2).round(2),
        Timed_Intensity=X_df['Duration'] * X_df['Heart_Rate'],
        Heart_Rate_Zone=(X_df['Heart_Rate'] / (220 - X_df['Age'])) * 100,
        Mifflin_Jeor_BMR=np.where(is_male, bmr_base + 5, bmr_base - 161),
    )

# Wrap the feature-engineering function so it can run as a pipeline step.
# validate=False: the input is handed to the function as-is.
feat_eng = FunctionTransformer(func=add_bmi_intensity, validate=False)

# Preprocessor: one-hot encode 'Sex' (dropping the first category) and pass
# every other column through unchanged.
cat_col = ['Sex']
preprocess = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(drop='first'), cat_col)],
    remainder='passthrough',
)
# ──────────────────────────────────────────────────────────────────────
# MODEL & PIPELINE
# ──────────────────────────────────────────────────────────────────────
# Baseline regressor; the tuning code overrides these values through
# `pipe.set_params(model__...)`.
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Feature engineering -> categorical encoding -> model.
pipe = Pipeline([
    ("feat_eng", feat_eng),
    ("preprocess", preprocess),
    ("model", xgb),
])

# ──────────────────────────────────────────────────────────────────────
# OPTUNA SEARCH SPACE  (prefix params with model__)
# ──────────────────────────────────────────────────────────────────────
# Shuffled 5-fold splitter shared by every trial.
cv = KFold(n_splits=5, random_state=0, shuffle=True)


def objective(trial: optuna.Trial) -> float:
    """Score one hyper-parameter configuration suggested by ``trial``.

    Hyper-parameters are sampled from ``trial``, applied to a fresh clone of
    ``pipe``, and evaluated with ``cross_val_score`` on ``X_train`` /
    ``y_train`` using ``rmsle_scorer`` and the shared ``cv`` splitter.

    Returns
    -------
    float
        Mean of the per-fold scores. ``rmsle_scorer`` is built with
        ``greater_is_better=False``, so this is a negated RMSLE.
    """
    sampled = {}

    # Tree size and boosting schedule.
    sampled["model__max_depth"] = trial.suggest_int("model__max_depth", 3, 10)
    sampled["model__learning_rate"] = trial.suggest_float(
        "model__learning_rate", 1e-3, 0.2, log=True
    )
    sampled["model__n_estimators"] = trial.suggest_int("model__n_estimators", 400, 1600)

    # Row / column sub-sampling.
    sampled["model__subsample"] = trial.suggest_float("model__subsample", 0.6, 1.0)
    sampled["model__colsample_bytree"] = trial.suggest_float(
        "model__colsample_bytree", 0.6, 1.0
    )

    # Regularisation.
    sampled["model__reg_alpha"] = trial.suggest_float(
        "model__reg_alpha", 1e-4, 10.0, log=True
    )
    sampled["model__reg_lambda"] = trial.suggest_float(
        "model__reg_lambda", 1e-3, 10.0, log=True
    )
    sampled["model__min_child_weight"] = trial.suggest_float(
        "model__min_child_weight", 1e-2, 10.0, log=True
    )

    # Clone so that no trial shares fitted state with another.
    candidate = clone(pipe)
    candidate.set_params(**sampled)

    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        cv=cv,
        scoring=rmsle_scorer,
        n_jobs=-1,
    )
    return fold_scores.mean()

# ──────────────────────────────────────────────────────────────────────
# MLFLOW SETUP
# ──────────────────────────────────────────────────────────────────────
# The hyper-parameter search (150 trials, each a 5-fold CV) talks to the MLflow
# tracking server at http://127.0.0.1:5000, so it is opt-in rather than being
# disabled by wrapping it in a string literal (which also dumped the string
# as cell output). Flip the flag to True to re-run the search.
RUN_OPTUNA_SEARCH = False

if RUN_OPTUNA_SEARCH:
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("Calories-XGB-Optuna-V3")
    mlflow.sklearn.autolog(log_models=False)

    # objective() returns the mean score of a greater_is_better=False scorer
    # (negated RMSLE), hence direction="maximize".
    study = optuna.create_study(direction="maximize", study_name="xgb_rmsle_1")

    with mlflow.start_run(run_name="optuna_parent"):
        # Each trial is logged as a nested run under the parent run.
        mlflow_cb = MLflowCallback(
            tracking_uri=mlflow.get_tracking_uri(),
            metric_name="neg_rmsle_cv",
            mlflow_kwargs={"nested": True}
        )

        study.optimize(
            objective,
            n_trials=150,
            callbacks=[mlflow_cb],
            show_progress_bar=True
        )

        mlflow.log_params(study.best_trial.params)
        mlflow.log_metric("best_neg_rmsle_cv", study.best_value)

'\nmlflow.set_tracking_uri("http://127.0.0.1:5000")\nmlflow.set_experiment("Calories-XGB-Optuna-V3")\nmlflow.sklearn.autolog(log_models=False)\n\nstudy = optuna.create_study(direction="maximize", study_name="xgb_rmsle_1")\n\nwith mlflow.start_run(run_name="optuna_parent"):\n    mlflow_cb = MLflowCallback(\n        tracking_uri=mlflow.get_tracking_uri(),\n        metric_name="neg_rmsle_cv",\n        mlflow_kwargs={"nested": True}\n    )\n\n    study.optimize(\n        objective,\n        n_trials=150,\n        callbacks=[mlflow_cb],\n        show_progress_bar=True\n    )\n\n    mlflow.log_params(study.best_trial.params)\n    mlflow.log_metric("best_neg_rmsle_cv", study.best_value)\n'

In [None]:
# ──────────────────────────────────────────────────────────────────────
# GENERATING OOF PREDICTIONS AND A VAL SCORE
# ──────────────────────────────────────────────────────────────────────
from pathlib import Path
from tqdm.notebook import tqdm
from IPython.display import display, clear_output

# Tag identifying the feature-engineering variant used for this run.
FE_VERSION = 'v4_hrzone_bmr'

# Hyper-parameters applied to the pipeline's "model" step.
best_params = {
    "model__max_depth": 10,
    "model__learning_rate": 0.02350055429195408,
    "model__n_estimators": 1236,
    "model__subsample": 0.9920046553420347,
    "model__colsample_bytree": 0.7191809335449328,
    "model__reg_alpha": 4.560438869008341,
    "model__reg_lambda": 3.5346229827585867,
    "model__min_child_weight": 0.08981320070110384,
}

best_pipe = clone(pipe).set_params(**best_params)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_table = []
# NaN-initialised so that a row no fold ever predicted is detectable below.
oof_xgb = np.full(len(X), np.nan)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-XGB-OOF-VAL-V2")

with mlflow.start_run(run_name="xgb_oof_fit",
                      tags={"fe_version": FE_VERSION}):

    for fold, (tr_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=kf.get_n_splits()), 1):
        # Fold-local names on purpose: rebinding X_val / y_val here would
        # silently replace the hold-out split created by train_test_split,
        # which the final-model cell concatenates with X_train / y_train.
        X_fold_tr, X_fold_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_fold_tr, y_fold_val = y.iloc[tr_idx], y.iloc[val_idx]

        fold_model = clone(best_pipe).fit(X_fold_tr, y_fold_tr)
        preds = fold_model.predict(X_fold_val)

        # Each row is predicted by the one model that did not train on it.
        oof_xgb[val_idx] = preds

        fold_rmsle = rmsle_sklearn(y_fold_val, preds)
        mlflow.log_metric(f"fold{fold}_rmsle", fold_rmsle)

        fold_table.append({"fold": fold, "rmsle": fold_rmsle})
        clear_output(wait=True)
        display(pd.DataFrame(fold_table))

    assert not np.isnan(oof_xgb).any(), "some rows never received an OOF prediction"

    # Unweighted mean of the per-fold RMSLE values.
    cv_rmsle = np.mean([row["rmsle"] for row in fold_table])
    mlflow.log_metric("cv_rmsle", cv_rmsle)
    print(f"5-fold CV RMSLE: {cv_rmsle:.5f}")

    Path("oof").mkdir(exist_ok=True)
    np.save("oof/oof_xgb.npy", oof_xgb)
    mlflow.log_artifact("oof/oof_xgb.npy", artifact_path="oof")

Unnamed: 0,fold,rmsle
0,1,0.060138
1,2,0.060464
2,3,0.059776
3,4,0.061001
4,5,0.060293


2025/05/26 00:34:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgb_oof_fit at: http://127.0.0.1:5000/#/experiments/15/runs/218fddecf5604ade89ce336140857574.
2025/05/26 00:34:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/15.


5-fold CV RMSLE: 0.06033


In [None]:
# FINAL-MODEL RUN  ──────────────────────────────────────────────
# Hyper-parameters applied to the pipeline's "model" step for the final fit.
best_params = {
    "model__max_depth": 10,
    "model__learning_rate": 0.02350055429195408,
    "model__n_estimators": 1236,
    "model__subsample": 0.9920046553420347,
    "model__colsample_bytree": 0.7191809335449328,
    "model__reg_alpha": 4.560438869008341,
    "model__reg_lambda": 3.5346229827585867,
    "model__min_child_weight": 0.08981320070110384,
}

with mlflow.start_run(run_name="final_rmsle_model_optuna") as run:

    # fresh clone so the final model shares no fitted state with earlier fits
    best_pipe = clone(pipe).set_params(**best_params)

    # Train on every labelled row. X / y are used directly instead of
    # pd.concat([X_train, X_val]): X_train and X_val are just a partition of
    # X, and X_val / y_val may have been rebound by the OOF fold loop, in
    # which case the concatenation would duplicate some rows and drop others.
    X_full = X
    y_full = y
    assert len(X_full) == len(y_full)

    best_pipe.fit(X_full, y_full)

    mlflow.sklearn.log_model(best_pipe, artifact_path="model")

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"
2025/05/24 15:16:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run final_rmsle_model_optuna at: http://127.0.0.1:5000/#/experiments/8/runs/bfdc77e266614b8d9d642cc56e14816b.
2025/05/24 15:16:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/8.
