In [None]:
# Packages
import pandas as pd
import numpy as np
from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics         import make_scorer
from xgboost                 import XGBRegressor
import joblib, mlflow
import mlflow.sklearn

# Custom Metric for Training Feedback
def rmsle_xgb(preds, dtrain):
    """Root-mean-squared-log-error metric in ``(preds, dtrain)`` form.

    Args:
        preds: Array of predictions. Negative values are clipped to 0 before
            the log transform.
        dtrain: Object exposing ``get_label()``, which supplies the true targets.

    Returns:
        The tuple ``('rmsle', value)``.

    NOTE(review): this is the native-API custom-metric shape. The XGBoost
    scikit-learn wrapper documents a callable ``eval_metric`` as
    ``f(y_true, y_pred)`` -- confirm which form the caller needs.
    """
    targets = dtrain.get_label()
    clipped = np.clip(preds, 0, None)
    log_gap = np.log1p(clipped) - np.log1p(targets)
    score = np.sqrt(np.mean(np.square(log_gap)))
    return 'rmsle', score

# Custom Metric for GridSearch (wrapped in make_scorer)
def rmsle_sklearn(y_true, y_pred):
    """Root mean squared logarithmic error with a ``(y_true, y_pred)`` signature.

    Args:
        y_true: True target values.
        y_pred: Predicted values. Negative predictions are clipped to 0 so
            ``log1p`` stays defined.

    Returns:
        The RMSLE as a scalar (lower is better).
    """
    non_negative = np.clip(y_pred, 0, None)
    log_residual = np.log1p(non_negative) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_residual)))

# greater_is_better=False makes the scorer report -RMSLE, so "higher is better"
# still holds for model selection.
rmsle_scorer = make_scorer(score_func=rmsle_sklearn, greater_is_better=False)

# Data
df = pd.read_csv('playground-series-s5e5/train.csv')
X = df.drop(columns=['Calories', 'id'])   # features: everything except target and row id
y = df['Calories']                        # regression target

# 80/20 hold-out split, stratified on Sex, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X['Sex']
)

# Custom Feature Engineering
def add_bmi_intensity(X_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X_df`` with ``BMI`` and ``Timed_Intensity`` columns added.

    * ``BMI`` = ``Weight / (Height / 100) ** 2``, rounded to 2 decimals
      (the division by 100 presumably converts centimetres to metres --
      verify against the dataset description).
    * ``Timed_Intensity`` = ``Duration * Heart_Rate``.

    The input frame is left untouched. It must contain the columns
    ``Weight``, ``Height``, ``Duration`` and ``Heart_Rate``.
    """
    height_scaled = X_df['Height'] / 100
    bmi = (X_df['Weight'] / height_scaled ** 2).round(2)
    effort = X_df['Duration'] * X_df['Heart_Rate']
    # assign() works on a copy and appends the new columns in keyword order
    return X_df.assign(BMI=bmi, Timed_Intensity=effort)

# Wrap the feature-engineering function so it can run as a pipeline step.
feat_eng = FunctionTransformer(func=add_bmi_intensity, validate=False)

# Preprocessor
cat_col = ['Sex']
sex_encoder = OneHotEncoder(drop='first')   # drop one level of the encoded column
preprocess = ColumnTransformer(
    transformers=[('cat', sex_encoder, cat_col)],
    remainder='passthrough',                # every other column passes through unchanged
)

In [None]:
# ──────────────────────────────────────────────────────────────────────
# MODEL & PIPELINE
# ──────────────────────────────────────────────────────────────────────
# Baseline hyper-parameters; the grid search overrides several of them.
xgb_params = dict(
    objective="reg:squarederror",
    random_state=42,
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb = XGBRegressor(**xgb_params)

# Order matters: engineered features -> encoding -> regressor.
pipe = Pipeline([
    ("feat_eng", feat_eng),
    ("preprocess", preprocess),
    ("model", xgb),
])

# ──────────────────────────────────────────────────────────────────────
# GRID  (prefix params with model__)
# ──────────────────────────────────────────────────────────────────────
# 3 * 3 * 3 * 2 * 2 = 108 candidates; with cv=5 that is 540 pipeline fits.
param_grid = dict(
    model__max_depth=[4, 6, 8],
    model__learning_rate=[0.03, 0.05, 0.07],
    model__n_estimators=[600, 800, 1000],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.8, 1.0],
)

gcv = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmsle_scorer,   # scores are -RMSLE because of the scorer's sign flip
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# ──────────────────────────────────────────────────────────────────────
# MLFLOW SETUP
# ──────────────────────────────────────────────────────────────────────
# Point the client at the tracking server on localhost:5000 and select the experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-XGB-GridSearch")

# Autolog sklearn fits; model signatures are logged, input examples are not.
mlflow.sklearn.autolog(
    disable=False,
    log_input_examples=False,
    log_model_signatures=True,
)

In [None]:

with mlflow.start_run(run_name="gridsearch_rmsle_parent"):

    # hyper-parameter tuning  ────────────────────────────────
    gcv.fit(X_train, y_train)
    mlflow.log_params(gcv.best_params_)
    # rmsle_scorer flips the sign, so best_score_ is -RMSLE (closer to 0 is better).
    mlflow.log_metric("best_neg_rmsle_cv", gcv.best_score_)

    best_pipe = gcv.best_estimator_

    # early-stopping refit on same train, watch val ──────────
    # The XGBoost scikit-learn wrapper calls a callable eval_metric as
    # f(y_true, y_pred) -> float, so rmsle_sklearn is the right shape here.
    # rmsle_xgb uses the (preds, dtrain) form and would fail on get_label().
    best_pipe.set_params(
            model__early_stopping_rounds=50,
            model__eval_metric=rmsle_sklearn,
    )

    # Fit params prefixed with model__ go straight to the final step, so the
    # eval_set never passes through feat_eng / preprocess. Transform X_val
    # explicitly; those steps were already fitted on X_train by the refit.
    X_val_model = best_pipe[:-1].transform(X_val)

    best_pipe.fit(X_train, y_train,
                  model__eval_set=[(X_val_model, y_val)],
                  model__verbose=False)   # verbose is a fit() argument, not an estimator param

    # NOTE: the same validation split drives early stopping and is scored
    # below, so val_rmsle is an optimistic estimate.
    val_pred  = best_pipe.predict(X_val)
    val_rmsle = rmsle_sklearn(y_val, val_pred)
    mlflow.log_metric("val_rmsle", val_rmsle)

    #  store top-20 cv table
    top20 = (pd.DataFrame(gcv.cv_results_)
               .loc[:, ["params", "mean_test_score", "rank_test_score"]]
               .sort_values("rank_test_score")
               .head(20)
               .to_json(orient="records", indent=2))
    mlflow.log_text(top20, artifact_file="top20_cv_results.json")

    # save final artefact  ───────────────────────────────────
    mlflow.sklearn.log_model(best_pipe, artifact_path="best_pipeline")

# ──────────────────────────────────────────────────────────────────────
# OFFLINE SERIALISATION
# ──────────────────────────────────────────────────────────────────────
# NOTE(review): the pipeline wraps add_bmi_intensity in a FunctionTransformer and
# carries rmsle_sklearn as eval_metric. Standard pickling stores functions by
# reference, so loading this file presumably needs both to be defined or
# importable in the loading process -- verify before shipping.
joblib.dump(best_pipe, "calories_xgb_rmsle_pipeline.joblib")