In [4]:
# Packages
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.metrics         import make_scorer
from sklearn.base            import clone
from xgboost                 import XGBRegressor
import joblib, mlflow
import mlflow.sklearn
import optuna
from optuna.integration import OptunaSearchCV, MLflowCallback
from optuna.distributions import FloatDistribution, IntDistribution

In [5]:

# Custom Metric for Training Feedback
def rmsle_xgb(preds, dtrain):
    """Compute RMSLE in the ``(preds, dtrain)`` callback form.

    Args:
        preds: Array of raw model predictions.
        dtrain: Object exposing ``get_label()``, which supplies the true targets.

    Returns:
        A ``('rmsle', value)`` tuple, where ``value`` is the root mean squared
        difference between ``log1p`` of the predictions and ``log1p`` of the labels.
    """
    labels = dtrain.get_label()
    # Floor predictions at zero before taking the log.
    log_gap = np.log1p(np.maximum(preds, 0)) - np.log1p(labels)
    return 'rmsle', np.sqrt(np.mean(log_gap ** 2))

# Custom Metric for GridSearch (wrapped in make_scorer)
def rmsle_sklearn(y_true, y_pred):
    """Root mean squared logarithmic error in the ``(y_true, y_pred)`` form.

    Args:
        y_true: True target values.
        y_pred: Predicted values; anything below zero is floored at zero first.

    Returns:
        The square root of the mean squared difference between
        ``log1p(y_pred)`` and ``log1p(y_true)``.
    """
    floored = np.maximum(y_pred, 0)
    squared_log_error = (np.log1p(floored) - np.log1p(y_true)) ** 2
    return np.sqrt(np.mean(squared_log_error))

# Scorer wrapper around rmsle_sklearn. greater_is_better=False tells
# scikit-learn that this is a loss, so the reported score is the negated RMSLE.
rmsle_scorer = make_scorer(rmsle_sklearn, greater_is_better=False)

# Data: 'Calories' is the target; 'id' and the target are removed from the features
df = pd.read_csv('playground-series-s5e5/train.csv')
X = df.drop(columns=['Calories', 'id'])
y = df['Calories']

# 80/20 hold-out split with a fixed seed, stratified on the 'Sex' column
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X['Sex'],
)

# Custom Feature Engineering
def add_bmi_intensity(X_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X_df`` extended with two engineered columns.

    Added columns:
        BMI: ``Weight / (Height / 100) ** 2``, rounded to two decimals.
        Timed_Intensity: ``Duration * Heart_Rate``.

    Args:
        X_df: Frame containing 'Weight', 'Height', 'Duration' and 'Heart_Rate'.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    height_scaled = X_df['Height'] / 100
    return X_df.assign(
        BMI=(X_df['Weight'] / height_scaled ** 2).round(2),
        Timed_Intensity=X_df['Duration'] * X_df['Heart_Rate'],
    )

# Expose the feature-engineering function as a pipeline step
# (validate=False, so no input validation is applied before the call)
feat_eng = FunctionTransformer(add_bmi_intensity, validate=False)

# Preprocessor: one-hot encode the categorical column (first level dropped);
# every other column is passed through untouched
cat_col = ['Sex']
preprocess = ColumnTransformer(
    [('cat', OneHotEncoder(drop='first'), cat_col)],
    remainder='passthrough',
)

In [6]:
# ──────────────────────────────────────────────────────────────────────
# MODEL & PIPELINE
# ──────────────────────────────────────────────────────────────────────
# Baseline regressor; these values are the defaults that set_params overrides
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Step order: feature engineering -> categorical encoding -> regressor
pipe = Pipeline([
    ("feat_eng", feat_eng),
    ("preprocess", preprocess),
    ("model", xgb),
])

# ──────────────────────────────────────────────────────────────────────
# CROSS-VALIDATION  (tuned params must be prefixed with model__)
# ──────────────────────────────────────────────────────────────────────
# Shuffled 5-fold split with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=0)

def objective(trial: optuna.Trial) -> float:
    """Score one sampled hyper-parameter configuration with cross-validation.

    Samples the XGBoost settings from ``trial``, applies them to a fresh clone
    of the module-level ``pipe``, and evaluates it on ``X_train`` / ``y_train``
    with the module-level ``cv`` splitter and ``rmsle_scorer``.

    Args:
        trial: The Optuna trial used to draw parameter values.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # Parameter names carry the "model__" prefix so set_params routes them to
    # the pipeline's "model" step. Sampling order is kept fixed.
    sampled = {
        "model__max_depth": trial.suggest_int("model__max_depth", 3, 10),
        "model__learning_rate": trial.suggest_float(
            "model__learning_rate", 1e-3, 0.2, log=True
        ),
        "model__n_estimators": trial.suggest_int("model__n_estimators", 400, 1600),
        "model__subsample": trial.suggest_float("model__subsample", 0.6, 1.0),
        "model__colsample_bytree": trial.suggest_float(
            "model__colsample_bytree", 0.6, 1.0
        ),
        "model__reg_alpha": trial.suggest_float(
            "model__reg_alpha", 1e-4, 10.0, log=True
        ),
        "model__reg_lambda": trial.suggest_float(
            "model__reg_lambda", 1e-3, 10.0, log=True
        ),
        "model__min_child_weight": trial.suggest_float(
            "model__min_child_weight", 1e-2, 10.0, log=True
        ),
    }

    candidate = clone(pipe).set_params(**sampled)

    fold_scores = cross_val_score(
        candidate, X_train, y_train, scoring=rmsle_scorer, cv=cv, n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
# ──────────────────────────────────────────────────────────────────────
# MLFLOW SETUP
# ──────────────────────────────────────────────────────────────────────
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Calories-XGB-Optuna-V1")
# Autolog scikit-learn fits, but do not store model artifacts automatically.
mlflow.sklearn.autolog(log_models=False)

# The hyper-parameter search (150 trials, each a full cross-validation) is
# opt-in. It was previously disabled by wrapping it in a string literal, which
# was echoed as cell output and hid the code from syntax checks.
# Set the flag to True to re-run the search.
RUN_OPTUNA_SEARCH = False

if RUN_OPTUNA_SEARCH:
    # The objective returns a negated RMSLE, hence direction="maximize".
    study = optuna.create_study(direction="maximize", study_name="xgb_rmsle_1")

    with mlflow.start_run(run_name="optuna_parent"):
        # Each trial is logged as a nested child run of "optuna_parent".
        mlflow_cb = MLflowCallback(
            tracking_uri=mlflow.get_tracking_uri(),
            metric_name="neg_rmsle_cv",
            mlflow_kwargs={"nested": True}
        )

        study.optimize(
            objective,
            n_trials=150,
            callbacks=[mlflow_cb],
            show_progress_bar=True
        )

        # Record the winning configuration and its score on the parent run.
        mlflow.log_params(study.best_trial.params)
        mlflow.log_metric("best_neg_rmsle_cv", study.best_value)

In [None]:
    
# FINAL-MODEL RUN  ──────────────────────────────────────────────
# Hard-coded hyper-parameters for the final model
# (presumably copied from an earlier Optuna study - confirm before reuse)
best_params = {
    "model__max_depth": 9,
    "model__learning_rate": 0.023658702935574594,
    "model__n_estimators": 1373,
    "model__subsample": 0.9815205434675509,
    "model__colsample_bytree": 0.8460760959011027,
    "model__reg_alpha": 0.4849891315002537,
    "model__reg_lambda": 1.6645566204520426,
    "model__min_child_weight": 0.37163348840668015,
}

with mlflow.start_run(run_name="final_rmsle_model_optuna") as run:
    # Unfitted clone of the pipeline, so no state carries over from earlier fits
    best_pipe = clone(pipe).set_params(**best_params)

    # Recombine the train and validation rows to fit on every labelled example
    X_full = pd.concat([X_train, X_val])
    y_full = pd.concat([y_train, y_val])
    best_pipe.fit(X_full, y_full)

    # Store the fitted pipeline as an artifact of this run
    mlflow.sklearn.log_model(best_pipe, artifact_path="model")

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).
"
2025/05/20 20:53:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run final_rmsle_model_optuna at: http://127.0.0.1:5000/#/experiments/7/runs/4d6d9579b71d4906ac6c0a111e83f79f.
2025/05/20 20:53:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/7.


In [None]:
# ──────────────────────────────────────────────────────────────────────
# TEST SET RUN
# ──────────────────────────────────────────────────────────────────────

df_test = pd.read_csv("playground-series-s5e5/test.csv")
ids = df_test["id"]

X_test = df_test.drop(columns=["id"])
# Floor predictions at zero, mirroring the clipping that the RMSLE metric
# functions in this notebook apply before scoring. A negative calorie
# prediction is invalid under a log-based metric.
test_preds = np.maximum(best_pipe.predict(X_test), 0)
submission = pd.DataFrame({
    "id": ids,
    "Calories": test_preds
})

# Never overwrite an existing submission, but say so instead of skipping
# silently, so a stale file is not mistaken for fresh output.
SUBMISSION_PATH = "submission_2.csv"
if os.path.isfile(SUBMISSION_PATH):
    print(f"{SUBMISSION_PATH} already exists - not overwriting; "
          "delete or rename it to save new predictions.")
else:
    submission.to_csv(SUBMISSION_PATH, index=False)