# Calories Prediction - XGBoost Model

This notebook implements an XGBoost model with hyperparameter tuning to predict calories burned during workouts for the Kaggle Playground Series competition (Season 5, Episode 5). Models are evaluated with RMSLE (Root Mean Squared Logarithmic Error).


## 1. Import Libraries


In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

# Ignore warnings
# NOTE: this is a blanket filter with no category or module restriction, so it
# hides every warning raised for the rest of the session, not just noisy ones.
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds NumPy's global generator; the train/validation split, the
# randomized search and the XGBoost models below also pass random_state=42.
np.random.seed(42)


## 2. Load and Explore Data


In [None]:
# Read the competition's training and test CSVs into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'playground-series-s5e5/train.csv',
        'playground-series-s5e5/test.csv',
    )
)

# Report the size of the training set, then preview its first rows
print("Training data shape:", train_data.shape)
train_data.head()


In [None]:
# Print the per-column count of missing values for each dataset in turn
for heading, frame in (
    ("Missing values in training data:", train_data),
    ("\nMissing values in test data:", test_data),
):
    print(heading)
    print(frame.isnull().sum())


In [None]:
# Statistical summary of the training data
# (pandas' describe() reports count, mean, std, min, quartiles and max for the numeric columns)
train_data.describe()


## 3. Exploratory Data Analysis


In [None]:
# Draw the target's histogram (with KDE) twice: once on the raw scale and once
# after log1p, to check whether the log transform gives a more normal shape.
target_views = (
    (train_data['Calories'], 'Distribution of Calories', 'Calories'),
    (np.log1p(train_data['Calories']), 'Distribution of Log(Calories)', 'Log(Calories)'),
)

for values, title, x_label in target_views:
    plt.figure(figsize=(10, 6))
    sns.histplot(values, kde=True)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Pairwise correlations between the float64/int64 columns of the training data
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
correlation = train_data.loc[:, numeric_cols].corr()

# Render the matrix as an annotated heatmap (two decimals per cell)
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# One scatter panel per raw numeric feature, each plotted against the target
features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
axes = axes.flatten()

# Pair each flattened axis with the feature it should display
for panel, feature in zip(axes, features):
    sns.scatterplot(data=train_data, x=feature, y='Calories', ax=panel)
    panel.set_title(f'{feature} vs Calories')

plt.tight_layout()
plt.show()


## 4. Feature Engineering


In [None]:
def add_engineered_features(df, body_temp_mean, age_bins, age_labels):
    """Return a copy of ``df`` with the engineered model features appended.

    Using one function for both the training and the test frame guarantees
    that the two receive exactly the same transformations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'Weight', 'Height', 'Duration', 'Heart_Rate',
        'Body_Temp', 'Age' and 'Sex' columns.
    body_temp_mean : float
        Centering constant for the squared body-temperature term. Pass the
        training-set mean for both frames so the test set is transformed
        with training statistics only.
    age_bins : list
        Bucket edges passed to ``pd.cut``.
    age_labels : list
        One label per bucket; each becomes an ``Age_<label>`` dummy column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = df.copy()

    # Body Mass Index: weight divided by height in metres squared
    # (assumes Height is in centimetres and Weight in kilograms - TODO confirm)
    out['BMI'] = out['Weight'] / ((out['Height'] / 100) ** 2)

    # Interaction features
    out['Duration_HeartRate'] = out['Duration'] * out['Heart_Rate']
    out['Weight_Duration'] = out['Weight'] * out['Duration']

    # Squared, mean-centred body temperature to model curvature
    out['Body_Temp_Squared'] = (out['Body_Temp'] - body_temp_mean) ** 2

    # Age buckets to capture non-linear age effects. right=False makes the
    # intervals left-closed ([0, 25), [25, 35), ..., [55, inf)) so that each
    # label describes its bucket exactly: a 25-year-old is in '25-35', not
    # '<25', and a 55-year-old is in '55+'.
    out['Age_Bucket'] = pd.cut(out['Age'], bins=age_bins, labels=age_labels, right=False)

    # One-hot encode the buckets. Age_Bucket is categorical, so every label
    # yields a column even when a bucket is empty in this particular frame.
    age_dummies = pd.get_dummies(out['Age_Bucket'], prefix='Age')
    out = pd.concat([out, age_dummies], axis=1)

    # Convert Sex to numerical (0 for female, 1 for male); any other value maps to NaN
    out['Sex_num'] = out['Sex'].map({'female': 0, 'male': 1})

    return out


# Age bucket edges and labels. The last edge is open-ended so that ages of
# 100 and above still fall into '55+' instead of becoming NaN.
age_bins = [0, 25, 35, 45, 55, np.inf]
age_labels = ['<25', '25-35', '35-45', '45-55', '55+']

# Centre Body_Temp with the training mean for both datasets
body_temp_mean = train_data['Body_Temp'].mean()

# Build the feature-engineered copies; the raw frames stay untouched
train_fe = add_engineered_features(train_data, body_temp_mean, age_bins, age_labels)
test_fe = add_engineered_features(test_data, body_temp_mean, age_bins, age_labels)

# Display the new features
train_fe.head()


## 5. Prepare Data for Modeling


In [None]:
# Assemble the model's input columns: raw measurements, engineered terms,
# then the one-hot age buckets (the order here is the column order of X)
raw_columns = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Sex_num']
engineered_columns = ['BMI', 'Duration_HeartRate', 'Weight_Duration', 'Body_Temp_Squared']
age_bucket_columns = ['Age_<25', 'Age_25-35', 'Age_35-45', 'Age_45-55', 'Age_55+']
features = raw_columns + engineered_columns + age_bucket_columns

# Feature matrix and target vector
X = train_fe.loc[:, features]
y = train_fe['Calories']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


## 6. Train a Basic XGBoost Model


In [None]:
def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error between targets and predictions."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


def rmsle_xgb(y_pred, dtrain):
    """RMSLE as a (name, value) pair computed against the labels held by ``dtrain``.

    Predictions are floored at zero before scoring so the log error can be computed.
    """
    non_negative_pred = np.maximum(y_pred, 0)
    return 'RMSLE', rmsle(dtrain.get_label(), non_negative_pred)

# Baseline configuration: squared-error objective with fixed, untuned settings
basic_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
basic_xgb = xgb.XGBRegressor(**basic_params)
basic_xgb.fit(X_train, y_train)

# Predict on the validation split, flooring at zero so the log-based metric is defined
y_pred = np.maximum(basic_xgb.predict(X_val), 0)

# Validation metrics for the baseline
rmsle_score = rmsle(y_val, y_pred)
rmse_score = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print(f"Basic XGBoost Model Performance:")
print(f"RMSLE: {rmsle_score:.4f}")
print(f"RMSE: {rmse_score:.4f}")
print(f"R² Score: {r2:.4f}")


In [None]:
# Rank the baseline model's features from most to least important
feature_importance = (
    pd.DataFrame({'Feature': features, 'Importance': basic_xgb.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranking
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()


## 7. Hyperparameter Tuning with RandomizedSearchCV


In [None]:
# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

# Create a custom scorer for RMSLE
from sklearn.metrics import make_scorer


def rmsle_clipped(y_true, y_pred):
    """RMSLE with predictions floored at zero before the log is taken.

    A squared-error regressor can output negative values, and
    mean_squared_log_error raises on them. Inside the search that failure
    becomes a NaN score for the candidate (and the accompanying warning is
    suppressed in this notebook), which silently distorts the ranking.
    Clipping here matches how validation and test predictions are handled
    elsewhere in the notebook.
    """
    return rmsle(y_true, np.maximum(y_pred, 0))


# greater_is_better=False makes scikit-learn negate the value, so a higher
# (less negative) score means a lower RMSLE
rmsle_scorer = make_scorer(rmsle_clipped, greater_is_better=False)

# Initialize RandomizedSearchCV
xgb_random = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter settings sampled
    scoring=rmsle_scorer,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the randomized search
xgb_random.fit(X_train, y_train)

# Print the best parameters; best_score_ is the negated RMSLE, so flip the sign back
print("Best Parameters:")
print(xgb_random.best_params_)
print(f"Best RMSLE: {-xgb_random.best_score_:.4f}")


## 8. Train the Optimized XGBoost Model


In [None]:
# Refit a fresh regressor on the training split using the search's winning settings
best_xgb_model = xgb.XGBRegressor(random_state=42, **xgb_random.best_params_)
best_xgb_model.fit(X_train, y_train)

# Validation predictions, floored at zero so the log-based metric is defined
y_pred_best = np.maximum(best_xgb_model.predict(X_val), 0)

# Validation metrics for the tuned model
rmsle_score_best = rmsle(y_val, y_pred_best)
rmse_score_best = np.sqrt(mean_squared_error(y_val, y_pred_best))
r2_best = r2_score(y_val, y_pred_best)

print(f"Optimized XGBoost Model Performance:")
print(f"RMSLE: {rmsle_score_best:.4f}")
print(f"RMSE: {rmse_score_best:.4f}")
print(f"R² Score: {r2_best:.4f}")

# Absolute and relative RMSLE gain over the baseline (positive means the tuned model is better)
rmsle_gain = rmsle_score - rmsle_score_best
print("\nImprovement over basic model:")
print(f"RMSLE improvement: {rmsle_gain:.4f} ({rmsle_gain / rmsle_score * 100:.2f}%)")


In [None]:
# Rank the tuned model's features from most to least important
feature_importance_best = (
    pd.DataFrame({'Feature': features, 'Importance': best_xgb_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranking
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_best, x='Importance', y='Feature')
plt.title('Feature Importance (Optimized Model)')
plt.tight_layout()
plt.show()


## 9. Visualize Predictions vs Actual Values


In [None]:
# Pair each validation target with the tuned model's prediction
results_df = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred_best})

# Scatter of predicted against actual, with the y = x reference line in dashed red
diagonal = [min(y_val), max(y_val)]
plt.figure(figsize=(10, 8))
plt.scatter(results_df['Actual'], results_df['Predicted'], alpha=0.5)
plt.plot(diagonal, diagonal, 'r--')
plt.xlabel('Actual Calories')
plt.ylabel('Predicted Calories')
plt.title('Actual vs Predicted Calories')
plt.show()

# Residuals (actual minus predicted) and their distribution around zero
results_df['Residuals'] = results_df['Actual'] - results_df['Predicted']

plt.figure(figsize=(10, 6))
sns.histplot(results_df['Residuals'], kde=True)
plt.axvline(x=0, color='r', linestyle='--')
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.show()


## 10. Make Predictions on Test Data


In [None]:
# Select the same feature columns the model was trained on
X_test = test_fe.loc[:, features]

# Predict on the test set and floor the results at zero
test_predictions = np.maximum(best_xgb_model.predict(X_test), 0)

# Submission frame: the test ids next to their predicted calories
submission = pd.DataFrame({'id': test_data['id'], 'Calories': test_predictions})

# Preview the first few rows of the submission
submission.head()


In [None]:
# Save the submission file
# index=False keeps the DataFrame index out of the CSV, leaving only the 'id' and 'Calories' columns
submission.to_csv('xgboost_submission.csv', index=False)
print("Submission file saved successfully!")


## 11. Conclusion

In this notebook, we built an XGBoost model to predict calories burned during workouts. We performed the following steps:

1. Loaded and explored the dataset
2. Conducted exploratory data analysis to understand the relationships between features and the target variable
3. Performed feature engineering to create new features that might improve model performance
4. Trained a basic XGBoost model as a baseline
5. Used RandomizedSearchCV to tune the hyperparameters of the model
6. Trained an optimized XGBoost model with the best hyperparameters
7. Evaluated the model's performance using RMSLE (Root Mean Squared Logarithmic Error)
8. Generated predictions for the test set and created a submission file

The optimized XGBoost model is compared against the baseline on the validation set in Section 8 (RMSLE, RMSE, and R², plus the absolute and relative RMSLE improvement). The most important features for predicting calories burned are identified through each model's built-in feature importance (Sections 6 and 8).