<a href="https://colab.research.google.com/github/9mithun9/Calorie-Prediction-Model/blob/main/Calorie_Prediction_Model_ensemble_(V2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installation and Dataset Download**


*   Installed the necessary libraries
*   Downloaded the data from a Kaggle competition



In [4]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [5]:
import opendatasets as od

In [6]:
# Kaggle competition data page that opendatasets downloads from.
dataset_url = "https://www.kaggle.com/competitions/playground-series-s5e5/data"

In [7]:
# Fetch and unpack the competition archive. Per the recorded output this prompts
# for Kaggle credentials and extracts into ./playground-series-s5e5.
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: mehedferewrfs
Your Kaggle Key: ··········
Extracting archive ./playground-series-s5e5/playground-series-s5e5.zip to ./playground-series-s5e5


In [8]:
# Folder (relative to the working directory) that holds the extracted CSV files.
data_dir = "playground-series-s5e5"

In [9]:
import pandas as pd

In [10]:
# Load the three competition files from the extracted download folder.
train_df = pd.read_csv(f'{data_dir}/train.csv')
test_df = pd.read_csv(f'{data_dir}/test.csv')
submission_df = pd.read_csv(f'{data_dir}/sample_submission.csv')

**Feature Engineering**

*   Encoded Sex as a binary column (male = 1, female = 0)
*   Added a BMI column computed from weight and height
*   Added a popular calorie-prediction formula based on heart rate, weight, age and duration, with separate coefficients for each sex



In [11]:
# Encode Sex numerically (male -> 1, female -> 0); any other label becomes NaN.
# NOTE: this overwrites the column, so re-running the cell on already-encoded
# frames maps the 1/0 codes to NaN — reload the CSVs before running it again.
sex_codes = {'male': 1, 'female': 0}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
test_df['Sex'] = test_df['Sex'].map(sex_codes)

In [12]:
# BMI = Weight / (Height / 100)^2; the division by 100 presumably converts
# centimetres to metres (TODO confirm the dataset's height unit).
train_df['BMI'] = train_df['Weight'].div(train_df['Height'].div(100).pow(2))
test_df['BMI'] = test_df['Weight'].div(test_df['Height'].div(100).pow(2))

In [13]:
import numpy as np

In [14]:
# Heart-rate-based energy estimate stored in the 'ACSM' column of the training frame.
# Each sex has its own linear combination of heart rate, weight and age; the sum is
# divided by 4.184 (presumably a kJ -> kcal conversion) and scaled by Duration.
hr, weight, age, duration = (
    train_df[col] for col in ('Heart_Rate', 'Weight', 'Age', 'Duration')
)
male_estimate = ((-55.0969 + 0.6309 * hr + 0.1988 * weight + 0.2017 * age) / 4.184) * duration
female_estimate = ((-20.4022 + 0.4472 * hr - 0.1263 * weight + 0.074 * age) / 4.184) * duration

# Rows with Sex == 1 (male) take the first estimate; every other row takes the second.
train_df['ACSM'] = np.where(train_df['Sex'] == 1, male_estimate, female_estimate)

In [15]:
# Same heart-rate-based energy estimate as for the training frame, applied to the
# test frame so both share an identically defined 'ACSM' feature.
hr, weight, age, duration = (
    test_df[col] for col in ('Heart_Rate', 'Weight', 'Age', 'Duration')
)
male_estimate = ((-55.0969 + 0.6309 * hr + 0.1988 * weight + 0.2017 * age) / 4.184) * duration
female_estimate = ((-20.4022 + 0.4472 * hr - 0.1263 * weight + 0.074 * age) / 4.184) * duration

# Rows with Sex == 1 (male) take the first estimate; every other row takes the second.
test_df['ACSM'] = np.where(test_df['Sex'] == 1, male_estimate, female_estimate)

In [16]:
# Preview the first rows to confirm the encoded Sex column and the new BMI / ACSM columns.
train_df.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,BMI,ACSM
0,0,1,36,189.0,82.0,26.0,101.0,41.0,150.0,22.955684,200.013576
1,1,0,64,163.0,60.0,8.0,85.0,39.7,34.0,22.582709,28.236711
2,2,0,51,161.0,64.0,7.0,84.0,39.8,29.0,24.690405,21.504254
3,3,1,20,192.0,90.0,25.0,105.0,40.7,140.0,24.414062,197.619503
4,4,0,38,166.0,61.0,25.0,102.0,40.6,146.0,22.13674,121.414316


**Splitting Data**


*   Separated 75% of the data for training
*   Held out the remaining 25% for validation



In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
# The whole frame (target column included) is split here, so x_train / x_val still
# contain 'Calories'.
x_train, x_val, y_train, y_val = train_test_split(
    train_df,
    train_df['Calories'],
    test_size=0.25,
    random_state=42,
)

**Columns**

In [19]:
# Model inputs: the raw numeric measurements followed by the two engineered columns.
numerical_cols = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
    'BMI',
    'ACSM',
]
# Name of the regression target (a single column name, despite the plural variable name).
target_cols = 'Calories'

In [20]:
# Feature matrix and target vector for each side of the split.
train_inputs, train_targets = x_train[numerical_cols], x_train[target_cols]
val_inputs, val_targets = x_val[numerical_cols], x_val[target_cols]

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer

**Log Error Function**


*   Errors are measured with RMSLE (root mean squared logarithmic error), as required by the competition rules



In [22]:
from sklearn.metrics import mean_squared_log_error
import numpy as np

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as the square root of scikit-learn's ``mean_squared_log_error``
    for the given true and predicted values.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [23]:
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False marks it as a loss,
# so scikit-learn sign-flips the value and model selection still maximises the score.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [24]:
def rmsle_2(y_true, y_pred):
    """Root mean squared logarithmic error computed directly with NumPy.

    Both inputs are converted to float arrays and negative entries are clipped
    to 0 so that ``log1p`` is always defined (``log1p(0) == 0``).

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))`` as a NumPy float.
    """
    # Clip negatives to 0: log1p is undefined below -1 and RMSLE assumes
    # non-negative values.
    y_true = np.clip(np.asarray(y_true, dtype=float), a_min=0, a_max=None)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), a_min=0, a_max=None)

    # log1p(x) = log(1 + x)
    log_diff = np.log1p(y_true) - np.log1p(y_pred)

    # The difference must be squared before averaging. Without the square,
    # positive and negative errors cancel and the mean can go negative,
    # which makes the square root NaN.
    mean_sq_log_err = np.mean(log_diff ** 2)
    return np.sqrt(mean_sq_log_err)

In [25]:
def predict_and_submit(model, fname, test_inputs):
  """Predict on the test features and write a submission CSV.

  Args:
    model: Fitted estimator exposing ``predict``.
    fname: Destination path of the CSV file.
    test_inputs: Feature rows to predict on. They must be row-aligned with the
      notebook-level ``test_df``, which supplies the ``id`` column.

  Returns:
    The DataFrame that was written, with columns ``id`` and ``Calories``.
  """
  test_preds = model.predict(test_inputs)
  # The prediction column is spelled 'Calories' to match the target column of
  # train.csv; a lowercase 'calories' header would not match the competition's
  # target name. NOTE(review): confirm against sample_submission.csv.
  submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_preds
  })
  submission.to_csv(fname, index=False)
  return submission

In [32]:
def evaluate(model):
  """Score a fitted model on the notebook's training and validation splits.

  Predicts on the notebook-level ``train_inputs`` and ``val_inputs``, computes
  RMSLE against ``train_targets`` and ``val_targets``, prints both scores and
  returns ``(train_preds, val_preds, train_rmsle, val_rmsle)``.
  """
  fitted_preds = model.predict(train_inputs)
  holdout_preds = model.predict(val_inputs)

  fitted_score = rmsle(train_targets, fitted_preds)
  holdout_score = rmsle(val_targets, holdout_preds)

  print(f'Train RMSLE: {fitted_score:.5f}, Val RMSLE: {holdout_score:.5f}')
  return fitted_preds, holdout_preds, fitted_score, holdout_score

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Random forest model: fit on the training split, then score on both splits.
rf_model = RandomForestRegressor(
    n_estimators=400,
    max_depth=17,
    min_samples_split=15,
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(train_inputs, train_targets)

train_preds_rf, val_preds_rf, train_rmsle_rf, val_rmsle_rf = evaluate(rf_model)

Train RMSLE: 0.04911, Val RMSLE: 0.06160


In [29]:
!pip install xgboost



In [34]:
import xgboost as xgb

# Gradient-boosted trees: fit on the training split, then score on both splits.
xgb_model = xgb.XGBRegressor(
    n_estimators=700,
    learning_rate=0.08,
    max_depth=6,
    n_jobs=-1,
    random_state=42,
)
xgb_model.fit(train_inputs, train_targets)

train_preds_xgb, val_preds_xgb, train_rmsle_xgb, val_rmsle_xgb = evaluate(xgb_model)

Train RMSLE: 0.05917, Val RMSLE: 0.06298


In [39]:
import pandas as pd
# Weighted blend of the two fitted models; the weights sum to 1 and the random
# forest gets the larger share.
weight_rf = 0.6
weight_xgb = 0.4

# Score the blend on the validation split using the predictions already computed.
ensemble_val_preds = (weight_rf * val_preds_rf) + (weight_xgb * val_preds_xgb)
ensemble_val_rmsle = rmsle(val_targets, ensemble_val_preds)

print(f"Ensemble Validation RMSLE: {ensemble_val_rmsle:.5f}")

# Predict on test data using the ensemble
test_inputs = test_df[numerical_cols]
ensemble_test_preds = (weight_rf * rf_model.predict(test_inputs)) + (weight_xgb * xgb_model.predict(test_inputs))

# The prediction column is spelled 'Calories' to match the target column of
# train.csv; a lowercase 'calories' header would not match the competition's
# target name. NOTE(review): confirm against sample_submission.csv.
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Calories': ensemble_test_preds
})

Ensemble Validation RMSLE: 0.06099


In [40]:
# prompt: convert submission_df into a CSV file named submission_ensemble

submission_df.to_csv('submission_ensemble.csv', index=False)
