In [8]:
# Packages
import pandas as pd
import numpy as np
from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics         import make_scorer
from sklearn.base            import clone
from xgboost                 import XGBRegressor
import joblib, mlflow
import mlflow.sklearn

In [None]:

# Custom Metric for Training Feedback
def rmsle_xgb(preds, dtrain):
    y_true = dtrain.get_label()
    preds = np.maximum(preds, 0)
    rmsle = np.sqrt(np.mean((np.log1p(preds) - np.log1p(y_true)) ** 2))
    return 'rmsle', rmsle

# Custom Metric for GridSearch (wrapped in make_scorer)
def rmsle_sklearn(y_true, y_pred):
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

rmsle_scorer = make_scorer(rmsle_sklearn, greater_is_better=False)
# Data
df = pd.read_csv('playground-series-s5e5/train.csv')
y = df['Calories']
X = df.drop(columns=(['Calories', 'id']))

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=X['Sex']
)

# Custom Feature Engineering
def add_bmi_intensity(X_df: pd.DataFrame) -> pd.DataFrame:
    """Adds BMI and HeartRatexDuration features"""
    X = X_df.copy()
    X['BMI'] = (X['Weight'] / (X['Height'] / 100) ** 2).round(2)
    X['Timed_Intensity'] = X['Duration'] * X['Heart_Rate']
    return X

feat_eng = FunctionTransformer(add_bmi_intensity, validate=False)

# Preprocessor
cat_col = ['Sex']
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), cat_col)
    ],
    remainder='passthrough'
)

In [2]:
# ──────────────────────────────────────────────────────────────────────
# MODEL & PIPELINE
# ──────────────────────────────────────────────────────────────────────
xgb = XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        n_estimators=800,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8
)

pipe = Pipeline(steps=[
        ("feat_eng",   feat_eng),
        ("preprocess", preprocess),
        ("model",      xgb)
])

# ──────────────────────────────────────────────────────────────────────
# GRID  (prefix params with model__)
# ──────────────────────────────────────────────────────────────────────
param_grid = {
    "model__max_depth":        [4, 6, 8],
    "model__learning_rate":    [0.03, 0.05, 0.07],
    "model__n_estimators":     [600, 800, 1000],
    "model__subsample":        [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0],
}

gcv = GridSearchCV(
        estimator   = pipe,
        param_grid  = param_grid,
        scoring     = rmsle_scorer,     
        cv          = 5,                
        n_jobs      = -1,
        verbose     = 2,             
)

In [5]:
# ──────────────────────────────────────────────────────────────────────
# MLFLOW SETUP
# ──────────────────────────────────────────────────────────────────────
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-XGB-GridSearch")

mlflow.sklearn.autolog(log_model_signatures=True,
                       log_input_examples=False,
                       disable=False)

2025/05/16 21:12:25 INFO mlflow.tracking.fluent: Experiment with name 'Calories-XGB-GridSearch' does not exist. Creating a new experiment.


In [6]:
# GRID-SEARCH RUN  ──────────────────────────────────────────────
with mlflow.start_run(run_name="gridsearch_rmsle_parent"):

    mlflow.sklearn.autolog(log_models=False)
    gcv.fit(X_train, y_train)
    mlflow.log_params(gcv.best_params_)
    mlflow.log_metric("best_neg_rmsle_cv", gcv.best_score_)

best_params = gcv.best_params_



Fitting 5 folds for each of 108 candidates, totalling 540 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"
2025/05/16 22:31:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, 103 runs will be omitted.
2025/05/16 22:31:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run skittish-doe-403 at: http://127.0.0.1:5000/#/experiments/1/runs/9781b448407c45f1894fc106ea7f0c82.
2025/05/16 22:31:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
2025/05/16 22:31:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run loud-pig-332 at: http://127.0.0.1:5000/#/experiments/1/runs/d73fa9e768704d8098b3c337f10ca369.
2025/05/16 22:31:25 INFO mlflow.tracking._tracking_service.client: 🧪 View

In [18]:
# FINAL-MODEL RUN  ──────────────────────────────────────────────
with mlflow.start_run(run_name="final_rmsle_model") as run:

    mlflow.log_params(best_params)

    # fresh clone to avoid any state leakage from grid-search
    best_pipe = clone(pipe).set_params(
            **best_params,
            model__eval_metric=rmsle_xgb,
            model__verbose=10
    )
    
    # train with early-stopping feedback on the 20 % validation split
    best_pipe.fit(X_train, y_train)

    # record the best_iteration and validation RMSLE
    val_pred = best_pipe.predict(X_val)
    mlflow.log_metric("val_rmsle", rmsle_sklearn(y_val, val_pred))

    mlflow.sklearn.log_model(best_pipe, artifact_path="model")

Parameters: { "verbose" } are not used.

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"
2025/05/17 23:41:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run final_rmsle_model at: http://127.0.0.1:5000/#/experiments/1/runs/26059eabeb6c4a95b3d797488a211a64.
2025/05/17 23:41:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [26]:
# ──────────────────────────────────────────────────────────────────────
# TEST SET RUN
# ──────────────────────────────────────────────────────────────────────

df_test = pd.read_csv("playground-series-s5e5/test.csv")
ids = df_test["id"]

X_test = df_test.drop(columns=["id"])
test_preds = best_pipe.predict(X_test)
submission = pd.DataFrame({
    "id": ids,
    "Calories": test_preds
})

import os
if os.path.isfile("submission.csv"):
    pass
else:
    submission.to_csv("submission.csv", index=False)

