In [None]:
# Import all required libraries for data analysis and modeling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import joblib  # For saving the trained model
import shap    # For model interpretation (though not used in this notebook)
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

In [None]:
# Load and merge the dataset
#
# Both CSV files are read from DATA_DIR; change this one constant to run the
# notebook somewhere other than the original '/content' location.
DATA_DIR = '/content'

calories = pd.read_csv(DATA_DIR + '/calories.csv')       # Load calories data
exercise_data = pd.read_csv(DATA_DIR + '/exercise.csv')  # Load exercise data

# The two frames are combined by row position (index), not by a key column.
# With different row counts pd.concat(axis=1) would not fail; it would pad the
# shorter side with NaN. Fail loudly here instead of passing NaN rows on.
if len(calories) != len(exercise_data):
    raise ValueError(
        "Row count mismatch: exercise.csv has {} rows but calories.csv has {}; "
        "the files cannot be combined by position.".format(
            len(exercise_data), len(calories)
        )
    )

# Combine exercise data with calories column
calories_data = pd.concat([exercise_data, calories['Calories']], axis=1)

In [None]:
# Data preprocessing and feature engineering
#
# Written as a single non-mutating chain: `calories_data` is rebound to a new
# frame instead of being edited in place, and the User_ID drop tolerates the
# column already being absent. Re-running this cell therefore reproduces the
# same frame instead of raising a KeyError on the second execution.
calories_data = (
    calories_data
    # Convert gender to numerical values (0 for male, 1 for female)
    .replace({"Gender": {'male': 0, 'female': 1}})
    # Drop User_ID as it's not relevant for modeling
    .drop(columns=['User_ID'], errors='ignore')
    .assign(
        # BMI (Body Mass Index) = weight(kg) / height(m)^2.
        # Height is divided by 100 to convert cm to meters inline, so no
        # temporary column has to be created and removed again.
        BMI=lambda df: df['Weight'] / ((df['Height'] / 100) ** 2),
        # Age groups as categorical bins: 0-18 (0), 19-35 (1), 36-50 (2), 51-80 (3).
        # pd.cut intervals are right-closed by default; an Age outside (0, 80]
        # gets NaN as its group.
        Age_Group=lambda df: pd.cut(df['Age'], bins=[0, 18, 35, 50, 80],
                                    labels=[0, 1, 2, 3]),
        # Interaction features that might help the model
        # Age multiplied by exercise duration
        Age_x_Duration=lambda df: df['Age'] * df['Duration'],
        # Weight multiplied by exercise duration
        Weight_x_Duration=lambda df: df['Weight'] * df['Duration'],
        # Heart rate multiplied by body temperature
        HeartRate_x_BodyTemp=lambda df: df['Heart_Rate'] * df['Body_Temp'],
    )
)

In [None]:
# Outlier filter: discard every row holding a value that lies 3 or more
# standard deviations from its column mean in any numeric column.
from scipy.stats import zscore

# Only numeric dtypes take part (the categorical Age_Group is skipped).
# NOTE(review): 'Calories' is still in the frame at this point, so if it is
# numeric the target itself influences which rows are dropped - confirm that
# this is intended.
numeric_part = calories_data.select_dtypes(include=[np.number])

# Absolute z-score of every value, computed column by column
z_scores = np.abs(zscore(numeric_part))

# A row survives only when all of its z-scores are below 3
rows_to_keep = (z_scores < 3).all(axis=1)
calories_data = calories_data[rows_to_keep]

In [None]:
# Prepare data for modeling

# Target (Y) is the Calories column; the features are every other column
Y = calories_data['Calories']
feature_frame = calories_data.drop(columns=['Calories'])

# One-hot encode the Age_Group categories into dummy columns.
# drop_first=True leaves out the first level to avoid the dummy variable trap.
X = pd.get_dummies(feature_frame, columns=['Age_Group'], drop_first=True)

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state=2 fixes the shuffle so the split is reproducible.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Create and evaluate machine learning pipeline

# Pipeline steps:
# 1. StandardScaler: Standardize features (mean=0, std=1)
# 2. RandomForestRegressor: Ensemble of decision trees for regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Perform 5-fold cross-validation to evaluate model performance,
# using R^2 score as the evaluation metric.
# The folds are drawn from the training split only: X_test / Y_test are the
# hold-out set used for the final evaluation, so they must not also feed the
# cross-validation estimate. cross_val_score fits clones of `pipeline`,
# leaving `pipeline` itself unfitted until the explicit fit below.
cv_scores = cross_val_score(pipeline, X_train, Y_train, cv=5, scoring='r2')
print("Cross-Validation R^2 Scores:", cv_scores)
print("Average CV R^2 Score:", np.mean(cv_scores))

# Train the model on the full training set
pipeline.fit(X_train, Y_train)

In [None]:
# Score the fitted pipeline on the held-out test set

# Predicted calories for the test features
predictions = pipeline.predict(X_test)

# Error metrics comparing the true test targets with the predictions
mae = mean_absolute_error(Y_test, predictions)   # average absolute error
mse = mean_squared_error(Y_test, predictions)    # average squared error
rmse = np.sqrt(mse)                              # square root of the MSE
r2 = r2_score(Y_test, predictions)               # coefficient of determination

# Report the metrics one per line (MSE is only an intermediate, not printed)
print("Model Performance:")
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R^2 Score:", r2),
):
    print(metric_label, metric_value)

In [None]:
# Plot predicted against actual calories for the test set

# One 8x6 inch figure with a single axes, drawn through the explicit
# figure/axes interface
fig, ax = plt.subplots(figsize=(8, 6))

# Each point is one test row: x = actual value, y = model prediction.
# alpha=0.5 keeps overlapping points distinguishable.
ax.scatter(Y_test, predictions, alpha=0.5)
ax.set_xlabel("Actual Calories")
ax.set_ylabel("Predicted Calories")
ax.set_title("Actual vs Predicted Calories")
ax.grid(True)
plt.show()

In [None]:
# Persist the fitted pipeline (scaler + random forest) to disk so it can be
# reloaded later with joblib.load instead of being retrained
model_path = "calories_predictor_model.pkl"
joblib.dump(pipeline, model_path)