# Single XGBoost model

In [53]:
import sys
import os

# Make the project root (the parent of the notebook's working directory, i.e.
# the folder containing 'src') importable.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_log_error, mean_squared_error
import optuna
import importlib
import src.preprocessing as prep
# Re-import the already-loaded module so that edits made to src/preprocessing.py
# take effect without restarting the kernel.
importlib.reload(prep)

<module 'src.preprocessing' from '/Users/wpw555/Documents/Kaggle/playground_s5e5/src/preprocessing.py'>

In [87]:
# Input locations, relative to the notebook's working directory
PREPROCESSOR_PATH = '../src/preprocessor.pkl'
DATA_PATH = '../data/raw/train.csv'

# Read the raw training set, report its dimensions, then preview the first rows
data = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Dataset shape: {data.shape}")
data.head()

Dataset shape: (750000, 9)


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [88]:
# Display the column labels of the raw training frame
data.columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')

In [89]:
# Run the project's training-data cleaning step; it returns the cleaned frame
# together with a second value that is kept here as `caps`.
cleaned_data, caps = prep.clean_train_data(data)

# Features: everything except the target, passed through the shared
# feature-engineering helper.
X = prep.create_features(cleaned_data.drop(columns=['Calories']))

# Target: Calories in log1p space (predictions are mapped back with expm1
# before scoring).
y = np.log1p(cleaned_data['Calories'])

   Sex   Age  Height  Weight  Duration  Heart_Rate  Body_Temp  Calories
0    0  20.0   150.0    51.0      14.0        94.0       40.1      67.0
1    0  20.0   150.0    51.0       1.0        90.0       37.8       5.0
2    0  20.0   150.0    51.0      26.0       101.0       40.8     151.0
3    0  20.0   150.0    52.0      18.0       106.0       40.1     111.0
4    0  20.0   150.0    51.0       1.0        79.0       37.7       4.0


In [90]:
# Optuna objective: cross-validated RMSLE of an XGBoost regressor
def objective(trial):
    """Score one hyperparameter configuration with 5-fold cross-validation.

    Reads the module-level ``X`` (features) and ``y`` (log1p-transformed
    target). For every fold a fresh preprocessor is built from the training
    part only and applied to both parts, an ``XGBRegressor`` is fitted in log
    space, and its predictions are mapped back with ``expm1`` and clipped at
    zero before the RMSLE is computed on the original scale.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean RMSLE over the five validation folds (lower is better).
    """
    # Sampled hyperparameters (the order of the suggest_* calls is kept as is)
    searched = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }
    # Settings that stay constant across trials
    fixed = {
        "random_state": 42,
        "objective": "reg:squarederror",
        "enable_categorical": True,
    }
    model_kwargs = {**searched, **fixed}

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmsle = []

    for fit_rows, holdout_rows in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        # The preprocessor is built per fold from the training part only
        fold_preprocessor = prep.build_preprocessor(X_fit)
        X_fit, X_holdout = prep.fit_and_transform_preprocessor(
            fold_preprocessor, X_fit, X_holdout
        )

        regressor = xgb.XGBRegressor(**model_kwargs)
        regressor.fit(X_fit, y_fit)

        # Back to the original scale; clip because MSLE rejects negative values
        calories_pred = np.clip(np.expm1(regressor.predict(X_holdout)), 0, None)
        calories_true = np.expm1(y_holdout)

        fold_rmsle.append(
            np.sqrt(mean_squared_log_error(calories_true, calories_pred))
        )

    return np.mean(fold_rmsle)

In [91]:
# Seed the sampler so the hyperparameter search is reproducible when the
# notebook is re-run from a fresh kernel. TPESampler is the sampler Optuna
# uses by default for single-objective studies, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)  # try 20 different sets of hyperparameters

[I 2025-05-11 18:18:59,728] A new study created in memory with name: no-name-2ce29a54-106f-4a99-abf6-603bf944d0ad
[I 2025-05-11 18:19:29,275] Trial 0 finished with value: 0.06281651443553535 and parameters: {'learning_rate': 0.026248887009042387, 'max_depth': 5, 'n_estimators': 774, 'subsample': 0.5613003253209559, 'colsample_bytree': 0.516633691192101, 'gamma': 4.839353728428216}. Best is trial 0 with value: 0.06281651443553535.
[I 2025-05-11 18:19:58,172] Trial 1 finished with value: 0.060316312869442744 and parameters: {'learning_rate': 0.055310148523597835, 'max_depth': 8, 'n_estimators': 958, 'subsample': 0.8976199243661203, 'colsample_bytree': 0.8970844449290423, 'gamma': 2.8277293421782908}. Best is trial 1 with value: 0.060316312869442744.
[I 2025-05-11 18:20:13,803] Trial 2 finished with value: 0.06045493406218516 and parameters: {'learning_rate': 0.04795031614474571, 'max_depth': 5, 'n_estimators': 134, 'subsample': 0.6958263421652497, 'colsample_bytree': 0.92279071052324, 'g

In [92]:
# Summarise the best configuration found by the study
winning_trial = study.best_trial
print("Best trial:")
print("  RMSLE:", winning_trial.value)
print("  Params:", winning_trial.params)

Best trial:
  RMSLE: 0.058113290122172055
  Params: {'learning_rate': 0.0726557083093565, 'max_depth': 10, 'n_estimators': 832, 'subsample': 0.8107523161071091, 'colsample_bytree': 0.5572490823912128, 'gamma': 0.04707141251577207}


In [93]:
# Build and fit the preprocessor on the full training set
preprocessor = prep.build_preprocessor(X)
X_train = prep.fit_and_transform_full(preprocessor, X)

# study.best_params contains only the *searched* hyperparameters. Re-apply the
# fixed settings that `objective` used during cross-validation (seed and loss)
# so the final model matches the configuration that was actually tuned.
final_params = {
    **study.best_params,
    "random_state": 42,
    "objective": "reg:squarederror",
    "enable_categorical": True,
}

# Create and train the xgb model on the log1p-transformed target
best_model = xgb.XGBRegressor(**final_params)
best_model.fit(X_train, y)

# In-sample predictions, mapped back from log1p space to the original scale
y_train_pred = best_model.predict(X_train)
y_train_pred = np.expm1(y_train_pred)

In [94]:
# Floor the predictions at zero (no upper bound); the log-error metric used
# for evaluation does not accept negative values.
y_train_pred = np.clip(y_train_pred, a_min=0, a_max=None)

In [95]:
# Score the final model on the same data it was fitted on, using the original
# (expm1-restored) target scale. These are in-sample figures.
y_original = np.expm1(y)
train_r2 = r2_score(y_original, y_train_pred)
train_msle = mean_squared_log_error(y_original, y_train_pred)
train_rmsle = np.sqrt(train_msle)

print(f"Training RMSLE: {train_rmsle:.4f}")
print(f"Training R²: {train_r2:.4f}")

Training RMSLE: 0.0522
Training R²: 0.9973


In [96]:
# Persist the fitted regressor to disk with joblib
model_path = '../output/models/XGBoost_model_optuna6.pkl'
joblib.dump(best_model, model_path)
print("Model saved successfully!")

Model saved successfully!


In [97]:
# Read the raw test set and run the project's test-data cleaning step
test_data = pd.read_csv('../data/raw/test.csv')
cleaned_test_data = prep.clean_test_data(test_data)

# Drop the identifier column (the test set has no target) and build the
# engineered features with the same helper used for training
X_test = prep.create_features(cleaned_test_data.drop(columns=['id']))

# Apply the preprocessor that was fitted on the full training set
X_test_processed = preprocessor.transform(X_test)

# Predict in log1p space, map back to the original scale, and floor at zero
y_test_log_pred = best_model.predict(X_test_processed)
y_test_pred = np.clip(np.expm1(y_test_log_pred), 0, None)

In [98]:
# Pair every test id with its predicted Calories and write the submission CSV
submission = test_data[['id']].assign(Calories=y_test_pred)
submission.to_csv('../output/submissions/XGBoost_optuna6.csv', index=False)