In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as mae
import shap

# Suppress all Python warnings to keep the notebook output compact.
# NOTE(review): a blanket 'ignore' may also hide deprecation or convergence
# warnings that are worth seeing — consider narrowing the filter.
warnings.filterwarnings('ignore')

## Load the dataset and perform initial exploration to understand its structure and characteristics.

In [None]:
# Read the calories dataset from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another machine.
csv_path = 'E:/data mining/project/dataset/calories.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Count the missing entries per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# User_ID is a unique identifier and does not contribute to the regression task.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns='User_ID', errors='ignore')
df.head()

## Data visualisation


In [None]:
# Height against weight for every record; the figure is saved, then shown.
sns.scatterplot(data=df, x='Height', y='Weight')
plt.savefig('Height and Weight scatter_plots.png')
plt.show()

In [None]:
# Scatter plots of Calories against 'Age', 'Height', 'Weight' and 'Duration'.
features = ['Age', 'Height', 'Weight', 'Duration']

# Draw a single seeded sample so all four panels show the same rows and the
# figure is reproducible; cap the size so small datasets do not raise.
sample_df = df.sample(n=min(1000, len(df)), random_state=42)

# Create the 2x2 grid in one call. Mixing plt.subplots() with plt.subplot()
# leaves an extra full-figure axes behind the panels.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, col in zip(axes.ravel(), features):
    sns.scatterplot(x=col, y='Calories', data=sample_df, ax=ax)
    ax.set_title(f'Scatter plot of {col} vs Calories')
    ax.set_xlabel(col)

# Apply the layout before saving so the PNG matches what is displayed.
fig.tight_layout()
fig.savefig('scatter_plots with Calories.png')
plt.show()

In [None]:
# Plotting histograms containing kernel density estimation (KDE) curves
features = df.select_dtypes(include='float').columns

# Size the grid from the number of float columns (three per row) instead of a
# fixed 2x3 layout, which raises as soon as a seventh float column appears.
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)

panels = axes.ravel()
for ax, col in zip(panels, features):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)

# Hide any unused panels in the last row.
for ax in panels[len(features):]:
    ax.set_visible(False)

# Apply the layout before saving so the PNG matches what is displayed.
fig.tight_layout()
fig.savefig('hist.png')
plt.show()

In [None]:
# Encode Gender numerically (male -> 0, female -> 1).
# Guarded so that re-running the cell does not turn the already-encoded 0/1
# values into NaN (Series.map yields NaN for keys missing from the dict).
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'male': 0, 'female': 1})
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()

plt.figure(figsize=(8, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap")
plt.savefig('Correlation Heatmap.png')
plt.show()

In [None]:
# Divide the dataset
X = df.drop(['Calories'], axis=1)
y = df['Calories']

# Split the raw features first, then fit the scaler on the training rows only,
# so that test-set statistics never leak into the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training scale (the name X_scaled is kept so
# later references keep working).
X_scaled = scaler.transform(X)
X_train.shape, X_test.shape, X_scaled.shape, y.shape

In [None]:
# Normalizing the features for stable and fast training.
# The scaler is fit on the training rows only and applied to both splits.
# The result is written back to X_train: storing it only in X_scaled would
# leave X_train and X_test on different transforms when the models are fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep the X_scaled name pointing at the scaled training matrix.
X_scaled = X_train

## Model building, hyperparameter tuning, and cross-validation

In [None]:
# Candidate regressors, keyed by the display name used in tables and plots.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Neural Network': MLPRegressor(max_iter=2000, random_state=42)
}

# Hyperparameter grids; a model without an entry is fit with its defaults.
param_grids = {
    'Ridge Regression': {'alpha': [0.1, 1.0, 10.0]},
    'Lasso': {'alpha': [0.1, 1.0, 10.0]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5], 'learning_rate': [0.01, 0.1, 0.3]},
    'Neural Network': {'hidden_layer_sizes': [(64, 32), (128, 64), (100,)], 'learning_rate_init': [0.001, 0.01]}
}

# fitted_models stores each trained estimator for the plotting cells;
# results collects the test-set and cross-validation metrics per model.
fitted_models = {}
results = {}
for name, model in models.items():
    if name in param_grids:
        # 5-fold grid search on the training split, selecting by MSE.
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"{name} Best Parameters: {best_params}")
    else:
        best_model = model
        best_model.fit(X_train, y_train)
    # Store the fitted model
    fitted_models[name] = best_model

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation scores on the training split. The target must be
    # y_train for both metrics; passing X_train as the target scores the model
    # on reproducing its own inputs, which makes the CV MSE meaningless.
    # The scorer returns negated MSE, hence the sign flip.
    cv_mse = -cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    cv_r2 = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2').mean()

    results[name] = {'MSE': mse, 'RMSE': rmse, 'R²': r2, 'CV MSE': cv_mse, 'CV R²': cv_r2}

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\nModel Performance Comparison:")
print(results_df)

## Model comparison and visualisation of feature importance

In [None]:
# Plot actual vs. predicted values for each model
# The figures are written to ./figures; create the directory up front because
# plt.savefig does not create missing directories.
os.makedirs('figures', exist_ok=True)

for name, model in fitted_models.items():
    # Get predictions
    y_pred = model.predict(X_test)

    # Create scatter plot
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.5, label='Actual vs. Predicted')

    # Add diagonal line (perfect prediction line: y=x) spanning the combined
    # range of the actual and predicted values.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Prediction (y=x)')

    plt.xlabel('Actual Calories')
    plt.ylabel('Predicted Calories')
    plt.title(f'Actual vs. Predicted Calories: {name}')
    plt.legend()
    plt.grid(True)
    # File name is the model name lower-cased with spaces replaced by underscores.
    plt.savefig(f'figures/actual_vs_predicted_{name.lower().replace(" ", "_")}.png')
    plt.show()

Neural networks and random forests significantly outperform the other models; the linear models perform less well.

In [None]:
# Bar chart comparing the test-set RMSE of every model.
results_df.index = results_df.index.astype(str)
model_names = results_df.index
rmse_values = results_df['RMSE']

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=rmse_values)
plt.ylabel('Root Mean Squared Error')
plt.xticks(rotation=45, ha='right')
plt.title('Model Comparison: RMSE')
# bbox_inches='tight' keeps the rotated tick labels inside the saved image.
plt.savefig('Model_Comparison_RMSE.png', bbox_inches='tight')
plt.show()

XGBoost has a relatively high RMSE.  
Possible causes:  
Learning rate too high or too low: an improperly set `learning_rate` may prevent gradient descent from finding an optimal solution of the loss function. For example, SGDRegressor defaults to a learning rate of 0.01, which may not be appropriate for this data.  
Insufficient iterations: if `max_iter` (the maximum number of iterations) is set too low, the model may not converge.  
Data not properly normalised: gradient descent is sensitive to feature scale. If the features are not correctly normalised (even though StandardScaler is used in the code), optimisation may become unstable.

In [None]:
# Bar chart comparing the test-set R² of every model.
plt.figure(figsize=(10, 6))
sns.barplot(x=results_df.index, y=results_df['R²'])
plt.title('Model Comparison: R² Score')
# Anchor the rotated labels on their right edge so each one lines up with its
# bar, matching the RMSE chart.
plt.xticks(rotation=45, ha='right')
plt.ylabel('R² Score')
# bbox_inches='tight' keeps the rotated tick labels from being clipped in the PNG.
plt.savefig('Model_Comparison_R² Score.png', bbox_inches='tight')
plt.show()

Neural networks, random forests, and XGBoost all achieve a higher $R^2$ than the linear models.

In [None]:
# Visualisation of feature importance
# Read the importances from the tuned estimators in fitted_models rather than
# refitting the untuned defaults in `models`, so the chart describes the same
# models whose scores are reported in results_df.
feature_names = X.columns
for name in ['Random Forest', 'XGBoost']:
    model = fitted_models[name]
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    # Most important feature first, so it is drawn at the top of the chart.
    importance_df = importance_df.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance: {name}')
    plt.savefig(f'Feature_Importance_{name}.png')
    plt.show()

Duration and Heart_Rate contribute the most to the prediction.