In [1]:
try:
    from google.colab import drive

    !gdown "1Mgb_IhQHroQDuG6lQ039F6NF8Yeuy6QN"
    
    nutrition_json = './nutrition_data.json'
except ImportError:
    nutrition_json = '../data/nutrition_data.json'

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
df_nutrition = pd.read_json(nutrition_json)
df_nutrition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181440 entries, 0 to 181439
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Age                      181440 non-null  int64 
 1   Weight                   181440 non-null  int64 
 2   Gender                   181440 non-null  object
 3   Height                   181440 non-null  int64 
 4   Activity_Level           181440 non-null  object
 5   Goal                     181440 non-null  object
 6   Estimated_Calories       181440 non-null  int64 
 7   Estimated_Carbohydrates  181440 non-null  int64 
 8   Estimated_Protein_Min    181440 non-null  int64 
 9   Estimated_Protein_Max    181440 non-null  int64 
 10  Estimated_Fat            181440 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 15.2+ MB


# EDA

In [5]:
print(
    '\n\n'.join(
        i + '\t' + str(df_nutrition[i].unique()[:10]) for i in df_nutrition.columns
    )
)

Age	[18 19 20 21 22 23 24 25 26 27]

Weight	[45 46 47 48 49 50 51 52 53 54]

Gender	['m' 'f']

Height	[150 151 152 153 154 155 156 157 158 159]

Activity_Level	['Sedentary' 'Light' 'Moderate' 'Active' 'Very Active' 'Extra Active']

Goal	['Maintain Weight' 'Mild Weight Loss' 'Weight Loss' 'Extreme Weight Loss']

Estimated_Calories	[1572 1414 1241  927 1801 1621 1423 1062 1919 1727]

Estimated_Carbohydrates	[235 269 287 304 338 373 236 270 288 305]

Estimated_Protein_Min	[39 45 47 50 56 62 48 51 57 63]

Estimated_Protein_Max	[137 157 167 177 197 217 138 158 168 178]

Estimated_Fat	[26 23 20 15 30 27 17 31 28 25]


# Feature Engineering

In [7]:
df_nutrition['Activity_Level'].replace({
        'Sedentary|Light': 'Beginner',
        'Moderate|Active': 'Intermediate',
        'Very Active|Extra Active': 'Expert'
    },
    regex=True,
    inplace=True
)

df_nutrition.head()

Unnamed: 0,Age,Weight,Gender,Height,Activity_Level,Goal,Estimated_Calories,Estimated_Carbohydrates,Estimated_Protein_Min,Estimated_Protein_Max,Estimated_Fat
0,18,45,m,150,Beginner,Maintain Weight,1572,235,39,137,26
1,18,45,m,150,Beginner,Mild Weight Loss,1414,235,39,137,23
2,18,45,m,150,Beginner,Weight Loss,1241,235,39,137,20
3,18,45,m,150,Beginner,Extreme Weight Loss,927,235,39,137,15
4,18,45,m,150,Beginner,Maintain Weight,1801,269,45,157,30


# Split data into features and target variable

In [65]:
FEATURES = ['Age', 'Weight' , 'Height', 'Activity_Level', 'Goal']
TARGET = ['Estimated_Calories']

X = df_nutrition[FEATURES]
y = df_nutrition[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Preprocessing for categorical variables

In [67]:
categorical_cols = ['Activity_Level', 'Goal']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


# Combine preprocessing steps


In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create pipelines for both models with preprocessing

In [69]:
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train and evaluate Linear Regression model

In [70]:
linear_pipeline.fit(X_train, y_train)
linear_y_pred = linear_pipeline.predict(X_test)
linear_mse = mean_squared_error(y_test, linear_y_pred)
print(f"Linear Regression Mean Squared Error: {linear_mse}")

Linear Regression Mean Squared Error: 25661.96853585435



# Train and evaluate Random Forest Regressor model

In [71]:
forest_pipeline.fit(X_train, y_train)
forest_y_pred = forest_pipeline.predict(X_test)
forest_mse = mean_squared_error(y_test, forest_y_pred)
print(f"Random Forest Regressor Mean Squared Error: {forest_mse}")

Random Forest Regressor Mean Squared Error: 23362.40246864075
