# Step 0: Load and summarize dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [2]:
# Read the two pre-processed MyFitnessPal CSV exports into dataframes:
# one for the users table and one for the foods table.
users_csv_path = "data/myfitnesspal/myfitnesspal_users.csv"
foods_csv_path = "data/myfitnesspal/myfitnesspal_foods.csv"
raw_users = pd.read_csv(users_csv_path)
raw_foods = pd.read_csv(foods_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data/myfitnesspal/myfitnesspal_users.csv'

In [None]:
# Print headline counts for the dataset and derive the table of distinct foods
print("Dataset contains:")
unique_user_ids = raw_users.user_id.unique()
print("   -", len(unique_user_ids), "unique users with a total of", len(raw_users), "daily entries")
print("   -", len(raw_foods), "total food entries")

# Drop the per-entry bookkeeping columns so only food-describing columns remain
bookkeeping_columns = ['user_id', 'date', 'meal_name', 'meal_idx']
just_food = raw_foods.drop(columns=bookkeeping_columns)


def _join_name_parts(row):
    # Every part is cast to str first, then the parts are joined with "_"
    return '_'.join(row.values.astype(str))


# Composite key built from food_name, brand and flavor (in that order)
name_parts = just_food[['food_name', 'brand', 'flavor']]
just_food['full_name'] = name_parts.apply(_join_name_parts, axis=1)

# Collapse rows that are identical in every remaining column
just_food = just_food.drop_duplicates()
print("   -", len(just_food), "unique food entries (different name, serving size and/or nutrients)")

unique_full_names = just_food['full_name'].unique()
print("   -", len(unique_full_names), "unique full food names (including brand and flavor)")

# Save the name columns of the de-duplicated table for later inspection
name_columns = ['full_name', 'food_name', 'brand', 'flavor']
just_food[name_columns].to_csv("data/myfitnesspal/unique_foods.csv")

Dataset contains:
   - 9896 unique users with a total of 587187 daily entries
   - 6502747 total food entries
   - 2133574 unique food entries (different name, serving size and/or nutrients)
   - 644887 unique full food names (including brand and flavor)


# Step 1: Baseline unique foods from myfitnesspal
- Create a dataframe with one row per unique combination of food_name, brand, and flavor; entries that share all three should have the exact same nutritional profile per amount of food
- Since nutritional values of these identical foods can still differ slightly after normalization, we take the most frequently entered quantity as the ground truth for the nutritional profile for that unique food
- Finally, we scale the nutritional profile of the ground truth to 100 calories, so that all food nutritional profiles are normalized to nutrients per 100 calories to make the clustering in the next step more effective

In [None]:
# Express every nutrient as "amount per 100 calories" for each food entry
calories_per_entry = just_food['calories']
norm_food = just_food[['full_name', 'calories']].copy()

# source nutrient column -> normalized column name
per_100cal_columns = [
    ('carbs', 'carbs/100cal'),
    ('fat', 'fat/100cal'),
    ('protein', 'protein/100cal'),
    ('sodium', 'sodium/100cal'),
    ('sugar', 'sugar/100cal'),
]
for source_col, target_col in per_100cal_columns:
    norm_food[target_col] = 100 * just_food[source_col] / calories_per_entry

# Mean of every column over all entries that share a full_name,
# with full_name restored as a regular column afterwards
mean_by_name = norm_food.groupby('full_name').mean()
avg_nuts = mean_by_name.reset_index()


In [None]:
def standardize_nutrients(avg_nuts, grouped_norm_food, just_food, chunk_size=1000):
    """Build one reference nutrient profile per unique ``full_name``.

    For every group in ``grouped_norm_food`` one representative row of
    per-100-calorie nutrients is selected: the first candidate row when all
    candidates agree on every nutrient, otherwise the candidate with the
    smallest summed relative deviation from the group average stored in
    ``avg_nuts``.  The selected profile is then multiplied by
    ``calories / 100``, where ``calories`` is the most frequent calorie value
    logged for that name in ``just_food``.

    Implementation notes:
      * Group averages and calorie modes are looked up through indexes that
        are built once, instead of masking whole dataframes for every name.
      * Rows containing NaN/inf nutrients (e.g. produced by a division by
        zero calories) are excluded from the candidates whenever the group
        has at least one fully finite row.  NaN terms are skipped when the
        deviations are summed, so such rows would otherwise score a
        deviation of 0 and be chosen as the "closest" row.
      * A name without any non-null calorie value yields NaN nutrients and is
        removed by the final ``dropna`` instead of raising.

    Args:
        avg_nuts: DataFrame with a ``full_name`` column and the five
            ``*/100cal`` average columns.  Modified in place: the columns
            ``serving_size``, ``food_name``, ``brand``, ``flavor`` and the
            five ``best*/100cal`` columns are added to it.
        grouped_norm_food: groupby object over the per-100-calorie table,
            keyed by ``full_name`` (must provide ``.groups`` and
            ``.get_group``).
        just_food: DataFrame with ``full_name``, ``calories``,
            ``serving_size``, ``food_name``, ``brand`` and ``flavor``.
        chunk_size: number of names processed per progress-bar step; it does
            not influence the result.

    Returns:
        A new DataFrame derived from ``avg_nuts`` without the five
        ``*/100cal`` average columns, with rows containing any missing value
        dropped and the ``best*`` columns renamed to ``carbs``, ``fat``,
        ``protein``, ``sodium`` and ``sugar``.

    Raises:
        KeyError: if a group name has no row in ``avg_nuts``.
    """
    nutrients = ['carbs/100cal', 'fat/100cal', 'protein/100cal', 'sodium/100cal', 'sugar/100cal']
    best_cols = [f'best{nut}' for nut in nutrients]

    def safe_mode(x):
        # Most frequent non-null value, or NaN when there is none.
        counts = x.value_counts()
        return counts.index[0] if not counts.empty else np.nan

    print("Precomputing value counts...")
    food_by_name = just_food.groupby('full_name')
    value_counts = {}
    for col in tqdm(['serving_size', 'food_name', 'brand', 'flavor'], desc="Columns"):
        value_counts[col] = food_by_name[col].apply(safe_mode)
    # Most common calorie amount per name, computed once for all names.
    calorie_mode = food_by_name['calories'].apply(safe_mode).to_dict()

    print("Adding precomputed values to avg_nuts...")
    for col, counts in value_counts.items():
        avg_nuts[col] = avg_nuts['full_name'].map(counts)

    print("Initializing columns for best nutrients...")
    for col in best_cols:
        avg_nuts[col] = np.nan

    # Numeric average profile per name; the first row wins if a name repeats.
    avg_lookup = (
        avg_nuts.drop_duplicates(subset='full_name')
        .set_index('full_name')[nutrients]
        .astype(float)
    )

    def process_group(group, name, avg_row):
        values = group[nutrients]

        # Prefer rows whose nutrients are all finite; keep every row only
        # when the group has no such row at all.
        finite_rows = np.isfinite(values.to_numpy(dtype=float)).all(axis=1)
        if finite_rows.any():
            values = values.loc[finite_rows]

        if values.nunique().eq(1).all():
            food_ref = values.iloc[0]
        else:
            # Summed relative deviation from the average; NaN terms
            # (e.g. 0/0 for an all-zero nutrient) are skipped by sum().
            dists = ((values - avg_row) / avg_row).abs().sum(axis=1)
            # Positional pick of the first minimum, safe with repeated labels.
            food_ref = values.iloc[int(dists.to_numpy().argmin())]

        calories = calorie_mode.get(name, np.nan)
        return (food_ref * (calories / 100)).to_numpy(dtype=float)

    print("Processing groups in chunks...")
    full_names = list(grouped_norm_food.groups.keys())
    best_rows = []

    for i in tqdm(range(0, len(full_names), chunk_size), desc="Chunks"):
        for name in full_names[i:i + chunk_size]:
            group = grouped_norm_food.get_group(name)
            best_rows.append(process_group(group, name, avg_lookup.loc[name]))

    # Write all results back in one aligned assignment; names that were not
    # processed keep NaN in their best* columns.
    best = pd.DataFrame(
        np.array(best_rows, dtype=float).reshape(-1, len(nutrients)),
        index=full_names,
        columns=best_cols,
    )
    avg_nuts[best_cols] = best.reindex(avg_nuts['full_name'].to_numpy()).to_numpy()

    print("Finalizing best_nuts dataframe...")
    food_ref = avg_nuts.drop(columns=nutrients).dropna(how='any')
    food_ref = food_ref.rename(columns={
        'bestcarbs/100cal': 'carbs',
        'bestfat/100cal': 'fat',
        'bestprotein/100cal': 'protein',
        'bestsodium/100cal': 'sodium',
        'bestsugar/100cal': 'sugar'
    })

    return food_ref


In [3]:
# Build the standardized nutrient reference table, then persist it as CSV and pickle
tqdm.pandas(desc="Processing rows")  # register tqdm's pandas integration
entries_by_name = norm_food.groupby('full_name')
food_ref = standardize_nutrients(avg_nuts, entries_by_name, just_food)

csv_target = "data/nutrient_reference.csv"
pickle_target = "data/nutrient_reference.pkl"
food_ref.to_csv(csv_target)
food_ref.to_pickle(pickle_target)

NameError: name 'standardize_nutrients' is not defined

# Step 2: Mapping to USDA FNDDS categories
- Food and Nutrient Database for Dietary Studies (FNDDS) matches NHANES data categorization and includes 5,624 food types: https://fdc.nal.usda.gov/fdc-app.html#/food-search?type=Survey%20(FNDDS)&query=

# Step 3: Code for pre-processing daily entries to match standardized food reference