In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Read the source CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='sri_lankan_nutrition_dataset_1000.csv')  # Replace with your file path

In [4]:
# Coerce the measurement columns to numeric dtypes. Values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
for numeric_col in ('Height(cm)', 'Weight(kg)', 'Age',
                    'Dietary_Goals(kg)', 'Physical_Activity_Level'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Sanity report: overall size and per-column missing-value counts
print("Dataset shape:", df.shape)
print("\nMissing values:\n", df.isna().sum())

Dataset shape: (1000, 12)

Missing values:
 District                           0
Age                                0
Gender                             0
Height(cm)                         0
Weight(kg)                         0
Physical_Activity_Level            0
Medical_Conditions               634
Allergies_Intolerances           691
Dietary_Goals(kg)                  0
Restrictions                     624
Cultural_Seasonal_Preferences      0
Cuisine                            0
dtype: int64


In [5]:
# 1. CALCULATE REQUIRED PARAMETERS
def calculate_bmr(weight_kg, height_cm, age, gender):
    """Estimate basal metabolic rate (kcal/day) with the Mifflin-St Jeor equation.

    Args:
        weight_kg: Body weight in kilograms.
        height_cm: Height in centimetres.
        age: Age in years.
        gender: Gender label. Matched case-insensitively and with surrounding
            whitespace ignored. 'male' uses the +5 offset, 'female' uses -161,
            and any other label uses the average of the two (-78).

    Returns:
        The estimated BMR, or NaN when ``gender`` is not a string (for
        example a missing value), so that one incomplete row does not abort
        a row-wise ``DataFrame.apply``.
    """
    if not isinstance(gender, str):
        # Missing gender (NaN/None) has no .lower(); report BMR as unknown.
        return float('nan')

    # Part of the equation shared by every gender.
    base = 10 * weight_kg + 6.25 * height_cm - 5 * age

    gender_offsets = {'male': 5, 'female': -161}
    # Average of the male and female offsets: (5 + -161) / 2 = -78.
    average_offset = -78
    return base + gender_offsets.get(gender.strip().lower(), average_offset)

In [6]:
# Body Mass Index: weight (kg) divided by height (m) squared.
# Height is first converted from centimetres to metres.
df['Height_m'] = df['Height(cm)'].div(100)
df['BMI'] = df['Weight(kg)'].div(df['Height_m'].pow(2))

In [7]:
# Per-row BMR via calculate_bmr, then TEE as BMR multiplied by the activity level
def _bmr_for_row(row):
    """Call calculate_bmr with one row's weight, height, age and gender."""
    return calculate_bmr(row['Weight(kg)'], row['Height(cm)'], row['Age'], row['Gender'])

df['BMR'] = df.apply(_bmr_for_row, axis=1)
df['TEE'] = df['BMR'].mul(df['Physical_Activity_Level'])

In [8]:
# Goal value minus current weight (positive = gain, negative = loss)
df['Weight_Goal_Change_kg'] = df['Dietary_Goals(kg)'].sub(df['Weight(kg)'])

In [9]:
# BMI Category
def bmi_category(bmi):
    """Classify a BMI value into a weight-status label.

    Args:
        bmi: Body Mass Index as a scalar number; may be missing (NaN/None).

    Returns:
        'Underweight' for bmi < 18.5, 'Normal' for 18.5 <= bmi < 25,
        'Overweight' for 25 <= bmi < 30, 'Obese' for bmi >= 30, and NaN
        when ``bmi`` is missing.
    """
    if pd.isna(bmi):
        # NaN fails every `<` comparison and would otherwise fall through
        # to 'Obese'; report the category as missing instead.
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Label each row's BMI with its weight-status category
df['BMI_Category'] = df['BMI'].map(bmi_category)

In [10]:
# Daily Vegetable Servings Target (based on guidelines):
# Normal -> 5, Overweight -> 6, every other category -> 4
bmi_class = df['BMI_Category']
df['Veg_Servings_Target'] = np.select(
    [bmi_class == 'Normal', bmi_class == 'Overweight'],
    [5, 6],
    default=4,
)

# Preview the derived columns for the first few rows
print("\nNew calculated features:")
preview_cols = ['BMI', 'BMR', 'TEE', 'Weight_Goal_Change_kg', 'BMI_Category']
print(df[preview_cols].head())


New calculated features:
         BMI       BMR          TEE  Weight_Goal_Change_kg BMI_Category
0  24.847825  1658.000  2860.050000                   -7.5       Normal
1  24.095300  1124.750  1349.700000                   -8.4       Normal
2  22.866137  1166.375  1399.650000                   -6.5       Normal
3  22.519777  1517.625  2617.903125                    2.4       Normal
4  25.520030  1460.375  2774.712500                   -7.7   Overweight


In [11]:

# 2. DATA CLEANING
# Keep only rows that have all core fields and fall inside plausible ranges.
# NOTE(review): 'Dietary_Goals(kg)' and 'Physical_Activity_Level' are not part
# of the required-field check, so rows with NaN there are kept -- confirm intended.
core_cols = ['Height(cm)', 'Weight(kg)', 'Age', 'Gender']
has_core_fields = df[core_cols].notna().all(axis=1)
age_in_range = df['Age'].between(18, 80)                            # inclusive bounds
weight_in_range = (df['Weight(kg)'] > 30) & (df['Weight(kg)'] < 150)  # exclusive bounds
height_in_range = df['Height(cm)'].between(140, 210)                # inclusive bounds

df_clean = df[has_core_fields & age_in_range & weight_in_range & height_in_range]

print(f"\nClean dataset shape: {df_clean.shape}")


Clean dataset shape: (999, 19)


In [12]:
# 3. FEATURE ENCODING PREPARATION
# Column groupings for the later encoding/scaling steps.

# Numerical features for scaling
num_features = [
    'Age',
    'Height(cm)',
    'Weight(kg)',
    'BMI',
    'BMR',
    'TEE',
    'Physical_Activity_Level',
    'Dietary_Goals(kg)',
    'Weight_Goal_Change_kg',
    'Veg_Servings_Target',
]

# Categorical features for encoding
cat_features = [
    'Gender',
    'Medical_Conditions',
    'Allergies_Intolerances',
    'Restrictions',
    'Cuisine',
    'BMI_Category',
]

# Text features to process separately (preferences)
text_features = [
    'Cultural_Seasonal_Preferences',
    'District',
]

# Persist the cleaned, feature-enriched profiles (row index is not written)
df_clean.to_csv('user_profiles_enhanced.csv', index=False)

# Summary of the derived columns, emitted as one newline-joined message
print("\n".join([
    "\nPreprocessing complete! Key calculated parameters:",
    "- BMI: Body Mass Index",
    "- BMR: Basal Metabolic Rate (kcal/day)",
    "- TEE: Total Energy Expenditure (kcal/day)",
    "- Weight_Goal_Change_kg: Target weight difference",
    "- BMI_Category: Weight status classification",
    "- Veg_Servings_Target: Daily vegetable servings recommendation",
]))

print(f"\nReady for USDA vegetable matching. Dataset saved as 'user_profiles_enhanced.csv'")


Preprocessing complete! Key calculated parameters:
- BMI: Body Mass Index
- BMR: Basal Metabolic Rate (kcal/day)
- TEE: Total Energy Expenditure (kcal/day)
- Weight_Goal_Change_kg: Target weight difference
- BMI_Category: Weight status classification
- Veg_Servings_Target: Daily vegetable servings recommendation

Ready for USDA vegetable matching. Dataset saved as 'user_profiles_enhanced.csv'
