In [31]:
# Fetch the competition archive (playground-series-s5e5.zip) with the Kaggle CLI.
# The CLI skips the download when a more recently modified local copy exists;
# pass --force to download it again anyway.
!kaggle competitions download -c playground-series-s5e5

playground-series-s5e5.zip: Skipping, found more recently modified local copy (use --force to force download)


In [32]:
!unzip playground-series-s5e5.zip

Archive:  playground-series-s5e5.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [28]:
import pandas as pd

# Training data: the "id" column is removed right after reading.
df_train = pd.read_csv("./train.csv").drop(columns=["id"])

# Test data: keep the ids in their own Series (presumably for a later submission
# file -- not used in the cells shown here), then remove them from the frame.
df_test = pd.read_csv("./test.csv")
id_test = df_test["id"]
df_test = df_test.drop(columns=["id"])

# Preview the first rows of the training frame.
df_train.head(5)

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [29]:
# The regression target is the "Duration" column; X keeps every other training column.
y = df_train["Duration"]
X = df_train.drop(columns=["Duration"])

# The test features are df_test itself (same object, not a copy).
X_test = df_test

In [30]:
# Group the test-frame columns by dtype, leaving out "Duration" (the target):
# object-dtype columns are treated as categorical, everything else as numeric.
cat_col = [name for name, kind in X_test.dtypes.items() if name != "Duration" and kind == "O"]
num_col = [name for name, kind in X_test.dtypes.items() if name != "Duration" and kind != "O"]

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor
import joblib

# Numeric features: fill missing values with the column mean, then standardise.
num_pipe = SkPipeline(steps=[
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler())
])

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown="ignore" keeps prediction from failing on a
# level that was not seen during fit.
cat_pipe = SkPipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encode", OneHotEncoder(handle_unknown="ignore"))
])

# Only the columns named in num_col / cat_col are passed on to the model; any
# other column of the input frame is dropped (ColumnTransformer's default
# remainder behaviour).
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_col),
    ("cat", cat_pipe, cat_col)
])

# Hyperparameters found by the Optuna search in the nested-CV cell of this notebook.
cat_dict = {
    'iterations': 854,
    'depth': 9,
    'learning_rate': 0.09593085470665648,
    'l2_leaf_reg': 1.9207743346721886,
    'verbose': 0,  # suppress per-iteration training output
    'random_state': 42,  # same seed the tuning cell uses, so the final fit matches what was evaluated
}
catboost_model = CatBoostRegressor(**cat_dict)

# Preprocessing and model are bundled so the saved artefact accepts raw feature frames.
full_pipeline = SkPipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", catboost_model)
])

# X must be the raw (unprocessed) feature frame holding both the numeric and the
# categorical columns; imputing, scaling and encoding happen inside the pipeline.
full_pipeline.fit(X, y)

# Persist the fitted pipeline; joblib.dump returns the list of files it wrote.
joblib.dump(full_pipeline, "workout_pred_pipe.pkl")



['catboost_full_pipeline.pkl']

In [5]:
import optuna
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.pipeline import Pipeline as SkPipeline
from catboost import CatBoostRegressor

SEED = 42
N_OUTER_SPLITS = 5

outer_cv = KFold(n_splits=N_OUTER_SPLITS, shuffle=True, random_state=SEED)
# scikit-learn scorers are "greater is better", so the MAE is negated.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)


def _build_model(params):
    """Return an unfitted preprocessing + CatBoost pipeline for `params`.

    The notebook's `preprocessor` is cloned so every fit starts from an unfitted
    transformer and imputation/scaling statistics are learned from the training
    rows of the current split only.
    """
    return SkPipeline(steps=[
        ("preprocessing", clone(preprocessor)),
        ("model", CatBoostRegressor(**params, random_state=SEED, verbose=0)),
    ])


def optimize_catboost(X_train, y_train):
    """Search CatBoost hyperparameters with Optuna, scored by 3-fold inner CV.

    Parameters
    ----------
    X_train : raw feature frame of the outer training fold (must contain the
        columns the notebook's `preprocessor` selects).
    y_train : target values aligned with `X_train`.

    Returns
    -------
    dict
        The hyperparameters of the best trial (`study.best_params`).
    """
    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        }
        model = _build_model(params)
        inner_cv = KFold(n_splits=3, shuffle=True, random_state=SEED)
        # error_score="raise": a failing fit should stop the search instead of
        # silently turning into a NaN score.
        score = cross_val_score(
            model, X_train, y_train,
            cv=inner_cv, scoring=scorer, n_jobs=-1, error_score="raise",
        )
        return np.mean(score)

    # The objective is the negated MAE, hence "maximize".
    study = optuna.create_study(direction="maximize")
    # The search stops after 30 trials or 600 seconds, whichever comes first.
    study.optimize(objective, n_trials=30, timeout=600)

    print("Best hyperparameters:", study.best_params)
    print("Best inner CV score (MAE):", -study.best_value)
    return study.best_params


# Outer loop for nested CV
outer_scores = []

for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), start=1):
    print(f"\n=== Outer Fold {fold}/{N_OUTER_SPLITS} ===")
    # KFold yields positional row indices. Plain X[train_idx] on a DataFrame would
    # look up column labels (KeyError), so rows are selected with .iloc.
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    # Tune model using inner CV
    best_params = optimize_catboost(X_train, y_train)

    # Train best model on the full outer-training fold
    model = _build_model(best_params)
    model.fit(X_train, y_train)

    # Evaluate on the held-out validation fold
    preds = model.predict(X_val)
    score = mean_absolute_error(y_val, preds)
    outer_scores.append(score)
    print(f"Fold MAE: {score:.4f}")

print(f"\nFinal MAE across folds: {np.mean(outer_scores):.4f} ± {np.std(outer_scores):.4f}")


[I 2025-07-20 17:20:05,343] A new study created in memory with name: no-name-16f95ebf-e43c-4198-90a7-5b84bdea7632



=== Outer Fold 1/5 ===


[I 2025-07-20 17:20:46,514] Trial 0 finished with value: -1.610768588082875 and parameters: {'iterations': 918, 'depth': 9, 'learning_rate': 0.23799920395962695, 'l2_leaf_reg': 0.3694222599375157}. Best is trial 0 with value: -1.610768588082875.
[I 2025-07-20 17:21:16,159] Trial 1 finished with value: -1.60742660824305 and parameters: {'iterations': 798, 'depth': 7, 'learning_rate': 0.11952681177354277, 'l2_leaf_reg': 0.00838214549462687}. Best is trial 1 with value: -1.60742660824305.
[I 2025-07-20 17:21:42,888] Trial 2 finished with value: -1.616397850554964 and parameters: {'iterations': 951, 'depth': 4, 'learning_rate': 0.07560948489133629, 'l2_leaf_reg': 0.23552202411654238}. Best is trial 1 with value: -1.60742660824305.
[I 2025-07-20 17:21:59,120] Trial 3 finished with value: -1.6107689186477014 and parameters: {'iterations': 433, 'depth': 7, 'learning_rate': 0.11784788290486062, 'l2_leaf_reg': 0.03494257140639944}. Best is trial 1 with value: -1.60742660824305.
[I 2025-07-20 17

Best hyperparameters: {'iterations': 872, 'depth': 9, 'learning_rate': 0.16115012512815505, 'l2_leaf_reg': 4.469648991948942}
Best inner CV score (MAE): 1.6052287294267422
Fold MAE: 1.5996

=== Outer Fold 2/5 ===


[I 2025-07-20 17:30:39,079] A new study created in memory with name: no-name-235aac0e-15e3-420b-ac4e-d1e516fde6a9
[I 2025-07-20 17:30:43,566] Trial 0 finished with value: -1.7345299760802302 and parameters: {'iterations': 153, 'depth': 3, 'learning_rate': 0.022728198821024732, 'l2_leaf_reg': 0.7602536549394665}. Best is trial 0 with value: -1.7345299760802302.
[I 2025-07-20 17:31:06,555] Trial 1 finished with value: -1.617727869806071 and parameters: {'iterations': 748, 'depth': 5, 'learning_rate': 0.05770014541038737, 'l2_leaf_reg': 0.47657668467452735}. Best is trial 1 with value: -1.617727869806071.
[I 2025-07-20 17:31:30,292] Trial 2 finished with value: -1.6248346670007117 and parameters: {'iterations': 802, 'depth': 5, 'learning_rate': 0.01891244086063168, 'l2_leaf_reg': 0.004265036198062904}. Best is trial 1 with value: -1.617727869806071.
[I 2025-07-20 17:31:44,791] Trial 3 finished with value: -1.6114761571528013 and parameters: {'iterations': 415, 'depth': 7, 'learning_rate':

Best hyperparameters: {'iterations': 874, 'depth': 10, 'learning_rate': 0.14210933067871875, 'l2_leaf_reg': 3.2281315401361868}
Best inner CV score (MAE): 1.6051870927205447


[I 2025-07-20 17:41:29,273] A new study created in memory with name: no-name-dd79fc0b-9db7-45ee-9870-ab68ee6c198e


Fold MAE: 1.5978

=== Outer Fold 3/5 ===


[I 2025-07-20 17:41:51,632] Trial 0 finished with value: -1.614084995918951 and parameters: {'iterations': 824, 'depth': 6, 'learning_rate': 0.0496370059299021, 'l2_leaf_reg': 0.10319461418840263}. Best is trial 0 with value: -1.614084995918951.
[I 2025-07-20 17:42:17,230] Trial 1 finished with value: -1.606902042036255 and parameters: {'iterations': 886, 'depth': 7, 'learning_rate': 0.12937393044826895, 'l2_leaf_reg': 0.14905293184556778}. Best is trial 1 with value: -1.606902042036255.
[I 2025-07-20 17:42:40,323] Trial 2 finished with value: -1.621695051347002 and parameters: {'iterations': 642, 'depth': 10, 'learning_rate': 0.013072467014170086, 'l2_leaf_reg': 1.199204138009261}. Best is trial 1 with value: -1.606902042036255.
[I 2025-07-20 17:43:02,752] Trial 3 finished with value: -1.626606596529685 and parameters: {'iterations': 815, 'depth': 6, 'learning_rate': 0.011874879529117258, 'l2_leaf_reg': 0.010729351842529957}. Best is trial 1 with value: -1.606902042036255.
[I 2025-07-

Best hyperparameters: {'iterations': 985, 'depth': 9, 'learning_rate': 0.07445657231527883, 'l2_leaf_reg': 0.02957894364083346}
Best inner CV score (MAE): 1.6051337652809572


[I 2025-07-20 17:51:51,233] A new study created in memory with name: no-name-2ea86869-da87-4af2-9ae0-d24a3e828a13


Fold MAE: 1.6030

=== Outer Fold 4/5 ===


[I 2025-07-20 17:52:21,847] Trial 0 finished with value: -1.6193549883601739 and parameters: {'iterations': 924, 'depth': 6, 'learning_rate': 0.020986833609293067, 'l2_leaf_reg': 0.0024980872241516425}. Best is trial 0 with value: -1.6193549883601739.
[I 2025-07-20 17:52:44,043] Trial 1 finished with value: -1.61210035562606 and parameters: {'iterations': 787, 'depth': 4, 'learning_rate': 0.16809417269871668, 'l2_leaf_reg': 1.5883643186326688}. Best is trial 1 with value: -1.61210035562606.
[I 2025-07-20 17:53:09,198] Trial 2 finished with value: -1.6054115247339968 and parameters: {'iterations': 593, 'depth': 9, 'learning_rate': 0.22763016493862873, 'l2_leaf_reg': 3.8145477276201687}. Best is trial 2 with value: -1.6054115247339968.
[I 2025-07-20 17:53:21,750] Trial 3 finished with value: -1.6104619338771744 and parameters: {'iterations': 285, 'depth': 9, 'learning_rate': 0.0981666022417392, 'l2_leaf_reg': 1.2034587435288178}. Best is trial 2 with value: -1.6054115247339968.
[I 2025-0

Best hyperparameters: {'iterations': 591, 'depth': 9, 'learning_rate': 0.15760279674282912, 'l2_leaf_reg': 2.9478989943307874}
Best inner CV score (MAE): 1.6046015366126785


[I 2025-07-20 18:02:13,499] A new study created in memory with name: no-name-56b98783-4fb7-4ab5-9c7a-925f13dd7f0d


Fold MAE: 1.6068

=== Outer Fold 5/5 ===


[I 2025-07-20 18:02:26,525] Trial 0 finished with value: -1.6137629761554653 and parameters: {'iterations': 506, 'depth': 4, 'learning_rate': 0.24470364218471438, 'l2_leaf_reg': 0.0011644715198410808}. Best is trial 0 with value: -1.6137629761554653.
[I 2025-07-20 18:02:35,515] Trial 1 finished with value: -1.6195518936847382 and parameters: {'iterations': 334, 'depth': 4, 'learning_rate': 0.1231402231885677, 'l2_leaf_reg': 0.016817748419875937}. Best is trial 0 with value: -1.6137629761554653.
[I 2025-07-20 18:02:41,798] Trial 2 finished with value: -1.6192534819834077 and parameters: {'iterations': 136, 'depth': 10, 'learning_rate': 0.07918298369893469, 'l2_leaf_reg': 0.04659818564350507}. Best is trial 0 with value: -1.6137629761554653.
[I 2025-07-20 18:03:10,027] Trial 3 finished with value: -1.6059452211442107 and parameters: {'iterations': 692, 'depth': 9, 'learning_rate': 0.13467084493719597, 'l2_leaf_reg': 0.13673245647281168}. Best is trial 3 with value: -1.6059452211442107.
[

Best hyperparameters: {'iterations': 854, 'depth': 9, 'learning_rate': 0.09593085470665648, 'l2_leaf_reg': 1.9207743346721886}
Best inner CV score (MAE): 1.6050746373359008
Fold MAE: 1.6021

Final MAE across folds: 1.6019 ± 0.0031
