# Step 0: Load and summarize dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [2]:
# Read the two pre-processed MyFitnessPal CSV exports (users table, foods table)
raw_users, raw_foods = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/myfitnesspal/myfitnesspal_users.csv",
        "data/myfitnesspal/myfitnesspal_foods.csv",
    )
)

In [4]:
# Summarize size of dataset
print("Dataset contains:")
print("   -", len(raw_users.user_id.unique()), "unique users with a total of", len(raw_users), "daily entries")
print("   -", len(raw_foods), "total food entries")

# Keep only the food-describing columns and deduplicate BEFORE building full_name.
# full_name is a pure function of (food_name, brand, flavor), so adding it cannot
# change which rows are duplicates; building it afterwards avoids doing the string
# join on every logged entry.
just_food = raw_foods.drop(columns=['user_id', 'date', 'meal_name', 'meal_idx']).drop_duplicates()

# Join name parts with '_' (missing parts become the text 'nan', as str(NaN) does).
# Iterating zipped columns replaces the much slower row-wise DataFrame.apply(axis=1).
# .assign returns a new frame, so no column is set on a filtered copy.
just_food = just_food.assign(full_name=[
    '_'.join(map(str, parts))
    for parts in zip(just_food['food_name'], just_food['brand'], just_food['flavor'])
])

print("   -", len(just_food), "unique food entries (different name, serving size and/or nutrients)")
print("   -", len(just_food['full_name'].unique()), "unique full food names (including brand and flavor)")

# NOTE(review): this writes one row per unique food *entry* (not per unique name)
# and also writes the DataFrame index as an extra unnamed column.
just_food[['full_name', 'food_name', 'brand', 'flavor']].to_csv("data/myfitnesspal/unique_foods.csv")

Dataset contains:
   - 9896 unique users with a total of 587187 daily entries
   - 6502747 total food entries


   - 2133574 unique food entries (different name, serving size and/or nutrients)
   - 644887 unique full food names (including brand and flavor)


# Step 1: Baseline unique foods from myfitnesspal
- Create a dataframe of foods with a unique combination of food_name, brand, and flavor; entries sharing that combination should have the exact same nutritional profile per amount of food
- Since the nutritional values of these identical foods can still differ slightly after normalization to nutrients per 100 calories, we take the entry whose normalized profile is closest to the average of its group as the ground truth for the nutritional profile of that unique food
- Finally, we scale the ground-truth profile from 100 calories to the most frequently entered calorie amount for that food; normalizing to nutrients per 100 calories first puts all entries on a common scale, which makes this comparison and the clustering in the next step more effective

In [3]:
def standardize_nutrients(just_food, chunk_size=1000):
    """Build one reference nutrient profile per unique ``full_name``.

    Every entry is first normalized to nutrients per 100 calories. Within each
    ``full_name`` group the reference entry is the first row if all normalized
    values agree, otherwise the row with the smallest summed relative deviation
    from the group mean. The reference is then rescaled from 100 calories to the
    most frequently entered calorie amount of that food.

    Parameters
    ----------
    just_food : pandas.DataFrame
        Must contain the columns ``full_name``, ``food_name``, ``brand``,
        ``flavor``, ``serving_size``, ``calories``, ``carbs``, ``fat``,
        ``protein``, ``sodium`` and ``sugar``.
    chunk_size : int, default 1000
        Number of food groups handled per progress-bar step. It only controls
        progress reporting and does not affect the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``full_name, food_name, brand, flavor, serving_size, calories,
        carbs, fat, protein, sodium, sugar``; one row per food, ordered by
        ``full_name``. ``food_name``/``brand``/``flavor``/``serving_size`` and
        ``calories`` hold the most frequent value of the group. Foods whose
        calories or any nutrient end up NaN are dropped; a missing brand,
        flavor or serving size does NOT drop the food.

    Notes
    -----
    Entries with zero or missing calories cannot be normalized (division by
    ``calories``) and yield inf/NaN values for their group.
    """
    # Local stdlib import so the notebook's import cell does not need to change.
    from itertools import islice

    raw_cols = ['carbs', 'fat', 'protein', 'sodium', 'sugar']
    nutrients = [f'{col}/100cal' for col in raw_cols]

    def safe_mode(values):
        """Most frequent non-null value of a Series, or NaN when there is none."""
        counts = values.value_counts()
        return counts.index[0] if not counts.empty else np.nan

    # Create a database of nutrients normalized per 100 calories
    calories = just_food['calories']
    norm_food = pd.DataFrame({
        'full_name': just_food['full_name'],
        'calories': calories,
        **{nut: 100 * just_food[raw] / calories for raw, nut in zip(raw_cols, nutrients)},
    })

    # Group once and reuse; the group means are the yardstick for picking the
    # most representative entry of each food.
    by_name = norm_food.groupby('full_name')
    avg_nuts = by_name[nutrients].mean()

    print("Precomputing value counts...")
    modes = {}
    for col in tqdm(['serving_size', 'food_name', 'brand', 'flavor'], desc="Columns"):
        modes[col] = just_food.groupby('full_name')[col].apply(safe_mode)
    # The modal calorie amount is both the rescaling factor and the reported
    # 'calories' value, so the returned nutrients and calories stay consistent.
    # safe_mode yields NaN (instead of raising) when a food has no calorie value.
    modes['calories'] = by_name['calories'].apply(safe_mode)

    print("Processing groups in chunks...")
    group_iter = iter(by_name)  # yields (full_name, rows) in sorted name order
    best_rows = []
    for _ in tqdm(range(0, by_name.ngroups, chunk_size), desc="Chunks"):
        for name, group in islice(group_iter, chunk_size):
            values = group[nutrients]
            if values.nunique().eq(1).all():
                # All entries agree after normalization: any row will do
                reference = values.iloc[0]
            else:
                # Summed relative deviation from the group mean; NaN terms
                # (e.g. a zero mean) are skipped by the row sum
                mean_row = avg_nuts.loc[name]
                dists = ((values - mean_row) / mean_row).abs().sum(axis=1)
                reference = values.iloc[int(np.argmin(dists.to_numpy()))]
            # Rescale from "per 100 calories" to the modal calorie amount
            best_rows.append((reference * (modes['calories'].loc[name] / 100)).to_numpy())

    print("Finalizing best_nuts dataframe...", end=" ")
    names = avg_nuts.index  # sorted unique full_name values, same order as best_rows
    described = pd.DataFrame({'full_name': names.to_numpy()})
    for col in ['food_name', 'brand', 'flavor', 'serving_size', 'calories']:
        described[col] = modes[col].reindex(names).to_numpy()
    best = pd.DataFrame(best_rows, columns=raw_cols, dtype=float)
    food_ref = pd.concat([described, best], axis=1)

    # Only the numeric profile is required; descriptive fields may be missing
    return food_ref.dropna(subset=['calories'] + raw_cols)


In [5]:
# Order rows by full_name so that all entries of one food sit next to each other
# when the table is sliced into batches below
just_food = just_food.sort_values("full_name")

In [6]:
# Number of rows in just_food (upper bound for the batch offsets used below)
print(just_food.shape[0])

2133574


In [6]:
# Process and save standardized nutrient references in batches
start=2000000; stop=len(just_food)
tqdm.pandas(desc="Processing rows")

# just_food is sorted by full_name, so every food is one contiguous run of rows.
# A raw row offset can fall in the middle of such a run, which would give the
# same food a partial (and conflicting) reference in two different batches.
# Both boundaries are therefore moved forward to the next row where full_name
# changes; applying the same rule to every batch keeps the batches disjoint
# and complete. File names keep the nominal start/stop offsets.
_sorted_names = just_food['full_name'].to_numpy()

def _snap_to_group_start(pos):
    """Move a row offset forward to the first row of the next full_name run."""
    while 0 < pos < len(_sorted_names) and _sorted_names[pos] == _sorted_names[pos - 1]:
        pos += 1
    return pos

food_ref = standardize_nutrients(
    just_food.iloc[_snap_to_group_start(start):_snap_to_group_start(stop)]
)
food_ref.to_csv("data/ref/nutrient_reference"+str(start)+"-"+str(stop)+".csv")
food_ref.to_pickle("data/ref/nutrient_reference"+str(start)+"-"+str(stop)+".pkl")
print("Done.")

Precomputing value counts...


Columns:   0%|          | 0/4 [00:00<?, ?it/s]

Columns: 100%|██████████| 4/4 [00:10<00:00,  2.72s/it]


Adding precomputed values to avg_nuts...
Initializing columns for best nutrients...
Processing groups in chunks...


Chunks: 100%|██████████| 48/48 [42:36<00:00, 53.26s/it]

Finalizing best_nuts dataframe... Done.





# Step 2: Mapping to USDA FNDDS categories
- Food and Nutrient Database for Dietary Studies (FNDDS) matches NHANES data categorization and includes 5,624 food types: https://fdc.nal.usda.gov/fdc-app.html#/food-search?type=Survey%20(FNDDS)&query=

In [2]:
# Load the branded-food table, then report its row count and preview the first rows.
# NOTE(review): the recorded output shows a warning pointing at this read_csv call,
# presumably a DtypeWarning about mixed column types; confirm and consider
# passing explicit dtypes.
data = pd.read_csv("data/FNNDS/branded_food.csv")
print(data.shape[0])
display(data.head(n=5))

  data = pd.read_csv("data/FNNDS/branded_food.csv")


1958978


Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,,,,
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,,,,
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,,,,


In [12]:
# Distinct values of the branded_food_category column, in order of first appearance
print(pd.unique(data['branded_food_category']))

['Oils Edible' 'Herbs/Spices/Extracts' 'Prepared Soups'
 'Sauces/Spreads/Dips/Condiments' 'Dough Based Products / Meals'
 'Vegetables  Prepared/Processed' 'Bread' 'Biscuits/Cookies'
 'Sweet Bakery Products' 'Savoury Bakery Products'
 'Non Alcoholic Beverages  Ready to Drink'
 'Meat/Poultry/Other Animals  Unprepared/Unprocessed'
 'Meat/Poultry/Other Animals  Prepared/Processed'
 'Fruit  Prepared/Processed' 'Cookies & Biscuits'
 'Frozen Fruit & Fruit Juice Concentrates'
 'Popcorn, Peanuts, Seeds & Related Snacks'
 'Croissants, Sweet Rolls, Muffins & Other Pastries'
 "Frozen Appetizers & Hors D'oeuvres" 'Wholesome Snacks'
 'Nut & Seed Butters' 'Chips, Pretzels & Snacks' 'Cheese' 'Rice'
 'Sausages, Hotdogs & Brats' 'Canned Fruit' 'Frozen Vegetables'
 'Crackers & Biscotti' 'Cooked & Prepared' 'Frozen Dinners & Entrees'
 'Other Frozen Desserts' 'Snack, Energy & Granola Bars'
 'Oriental, Mexican & Ethnic Sauces' 'Breads & Buns'
 'Seasoning Mixes, Salts, Marinades & Tenderizers'
 'Pastry Shell

In [3]:
# Load the food table, count the rows whose data_type is 'branded_food',
# and preview the first of those rows
data = pd.read_csv("data/FNNDS/food.csv")
print(int(data["data_type"].eq('branded_food').sum()))
display(data.query("data_type == 'branded_food'").head(n=5))

1958978


Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,1105904,branded_food,WESSON Vegetable Oil 1 GAL,,2020-11-13
1,1105905,branded_food,SWANSON BROTH BEEF,,2020-11-13
2,1105906,branded_food,CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER,,2020-11-13
3,1105907,branded_food,CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI,,2020-11-13
9,1105908,branded_food,SWANSON BROTH CHICKEN,,2020-11-13


In [8]:
# Branded foods that have a description.
# NOTE(review): relies on `data` currently holding the food.csv table; confirm cell order.
branded_food = (
    data.loc[data["data_type"].eq('branded_food')]
    .dropna(subset=['description'])
)
# Rows whose description contains the (case-sensitive) text "Bar"
subset = branded_food.loc[branded_food['description'].str.contains("Bar")]
print(subset.shape[0])
display(subset.head(n=5))


3614


Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
56133,344652,branded_food,Kellogg's Cereal Breakfast Bar Chocolate 1.34oz,,2019-04-01
56411,344930,branded_food,Bear Naked Bars Double Chocolate 1.41oz,,2019-04-01
56433,344952,branded_food,Kashi Chewy Bars Cherry Dark Chocolate 1.2oz,,2019-04-01
56434,344953,branded_food,Kashi Chewy Bars Trail Mix 1.2oz,,2019-04-01
56435,344954,branded_food,Kashi Crunchy Bars Chocolate Chip 1.4oz,,2019-04-01


In [18]:
# Distinct values of the data_type column, in order of first appearance
print(pd.unique(data['data_type']))

['branded_food' 'experimental_food' 'sr_legacy_food' 'sample_food'
 'market_acquistion' 'sub_sample_food' 'foundation_food'
 'agricultural_acquisition' 'survey_fndds_food']


In [8]:
# Load the food-nutrient table, then report its row count and preview the first rows.
# NOTE(review): the recorded output shows a warning pointing at this read_csv call,
# presumably a DtypeWarning about mixed column types; confirm and consider
# passing explicit dtypes.
data = pd.read_csv("data/FNNDS/food_nutrient.csv")
print(data.shape[0])
display(data.head(n=5))

  data = pd.read_csv("data/FNNDS/food_nutrient.csv")


26455322


Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,loq,footnote,min_year_acquired,percent_daily_value
0,13706927,1105904,1257,0.0,,71.0,,,,,,,
1,13706930,1105904,1293,53.33,,71.0,,,,,,,0.0
2,13706926,1105904,1253,0.0,,75.0,,,,,,,0.0
3,13706921,1105904,1092,0.0,,75.0,,,,,,,0.0
4,13706916,1105904,1008,867.0,,71.0,,,,,,,


In [15]:
# Load the survey (FNDDS) food table, count its distinct food codes,
# and preview the first rows
data = pd.read_csv("data/FNNDS/survey_fndds_food.csv")
print(data['food_code'].nunique(dropna=False))
display(data.head(n=5))

5624


Unnamed: 0,fdc_id,food_code,wweia_category_code,start_date,end_date
0,2340760,11000000,9602,2019-01-01,2020-12-31
1,2340761,11100000,1004,2019-01-01,2020-12-31
2,2340762,11111000,1002,2019-01-01,2020-12-31
3,2340763,11112110,1004,2019-01-01,2020-12-31
4,2340764,11112210,1006,2019-01-01,2020-12-31


In [20]:
# Reload the food-name table written in Step 0 and preview it.
# The index saved with that file comes back as an extra 'Unnamed: 0' column.
mfp = pd.read_csv("data/myfitnesspal/unique_foods.csv")
display(mfp.head(n=5))

Unnamed: 0.1,Unnamed: 0,full_name,food_name,brand,flavor
0,0,McDonalds Espresso Pronto® Flat White_my_nan,McDonalds Espresso Pronto® Flat White,my,
1,1,Banana Nut Muffin Natural Protein Bar_Quest Ba...,Banana Nut Muffin Natural Protein Bar,Quest Bar,
2,2,Vita Brits_Uncle Tobys Australia_nan,Vita Brits,Uncle Tobys Australia,
3,3,Smarter White Milk_Pauls_nan,Smarter White Milk,Pauls,
4,4,Cookies and Cream_Quest Bar_nan,Cookies and Cream,Quest Bar,


# Step 3: Code for pre-processing daily entries to match standardized food reference