# Single XGBoost model

In [18]:
import sys
import os

# Adjust the path to point to your project root (the folder containing 'src')
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_log_error
import optuna
import importlib
import src.preprocessing as prep
importlib.reload(prep)

<module 'src.preprocessing' from '/Users/wpw555/Documents/Kaggle/playground_s5e5/src/preprocessing.py'>

In [35]:
# Set paths
DATA_PATH = '../data/raw/train.csv'
PREPROCESSOR_PATH = '../src/preprocessor.pkl'

# Load the data
data = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {data.shape}")
data.head()

Dataset shape: (750000, 9)


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [36]:
data.columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')

In [37]:
# Clean data
cleaned_data, caps = prep.clean_train_data(data)

X = data.drop(columns=['id', 'Calories'])
X = prep.create_features(X)
y = data['Calories']

In [23]:
# Define the objective function for optimization
def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "random_state": 42,
        "objective": "reg:squarederror",
        "enable_categorical": True
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmsle_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # Build and fit preprocessor
        preprocessor = prep.build_preprocessor(X_train_fold)
        X_train_fold, X_val_fold = prep.fit_and_transform_preprocessor(preprocessor, X_train_fold, X_val_fold)

        model = xgb.XGBRegressor(**params)
        model.fit(X_train_fold, y_train_fold)

        preds = model.predict(X_val_fold)
        preds = np.maximum(preds, 0)

        score = np.sqrt(mean_squared_log_error(y_val_fold, preds))
        rmsle_scores.append(score)

    return np.mean(rmsle_scores)

In [24]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)  # try 20 different sets of hyperparameters

[I 2025-05-11 13:16:01,540] A new study created in memory with name: no-name-a8d7b1a5-c5ad-4c8a-b35e-2561d3087bcd
[I 2025-05-11 13:16:36,550] Trial 0 finished with value: 0.06833833677323572 and parameters: {'learning_rate': 0.05480679290129504, 'max_depth': 4, 'n_estimators': 633, 'subsample': 0.5275700943640604, 'colsample_bytree': 0.5133195306252352, 'gamma': 1.175836013243718}. Best is trial 0 with value: 0.06833833677323572.
[I 2025-05-11 13:16:57,999] Trial 1 finished with value: 0.0660003003400759 and parameters: {'learning_rate': 0.08654573334972851, 'max_depth': 5, 'n_estimators': 247, 'subsample': 0.5695343415909838, 'colsample_bytree': 0.7489823979953986, 'gamma': 1.2623936485977827}. Best is trial 1 with value: 0.0660003003400759.
[I 2025-05-11 13:17:38,851] Trial 2 finished with value: 0.06657474190873999 and parameters: {'learning_rate': 0.023205658396843125, 'max_depth': 5, 'n_estimators': 579, 'subsample': 0.5547527392178329, 'colsample_bytree': 0.862574881987568, 'gamm

In [25]:
print("Best trial:")
print("  RMSLE:", study.best_value)
print("  Params:", study.best_params)

Best trial:
  RMSLE: 0.0605498302987979
  Params: {'learning_rate': 0.021966655042749667, 'max_depth': 9, 'n_estimators': 986, 'subsample': 0.8838546077105147, 'colsample_bytree': 0.6610369452907909, 'gamma': 0.4694318343460074}


In [38]:
# Use full training set with best params
preprocessor = prep.build_preprocessor(X)
X_train = prep.fit_and_transform_full(preprocessor, X)

# Create and train the xgb model
best_model = xgb.XGBRegressor(**study.best_params, enable_categorical=True)
best_model.fit(X_train, y)

# Make predictions
y_train_pred = best_model.predict(X_train)

In [39]:
# Clip predictions to ensure they are non-negative
y_train_pred = np.clip(y_train_pred, 0, None)

In [40]:
# Evaluate the model
train_rmsle = np.sqrt(mean_squared_log_error(y, y_train_pred))
train_r2 = r2_score(y, y_train_pred)

print(f"Training RMSLE: {train_rmsle:.4f}")
print(f"Training R²: {train_r2:.4f}")

Training RMSLE: 0.0561
Training R²: 0.9977


In [31]:
# Save the model
joblib.dump(best_model, '../output/models/XGBoost_model_optuna4.pkl')
print("Model saved successfully!")

Model saved successfully!


In [41]:
# Load the test data
test_data = pd.read_csv('../data/raw/test.csv')

# Clean the test data
cleaned_test_data = prep.clean_test_data(test_data)

# Split into features and target
X_test = cleaned_test_data.drop(columns=['id'])
X_test = prep.create_features(X_test)

# Transform the test features using the fitted preprocessor
X_test_processed = preprocessor.transform(X_test)

# 2. Make predictions using the fitted model
y_test_pred = best_model.predict(X_test_processed)
y_test_pred = np.clip(y_test_pred, 0, None)

In [42]:
submission = pd.DataFrame({
    'id': test_data['id'],
    'Calories': y_test_pred
})
submission.to_csv('../output/submissions/XGBoost_optuna4.csv', index=False)