In [1]:
import random
import shutil
from collections import Counter, defaultdict
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd

## Analyze common ingredients from recipe5k for each 101 categories

In [2]:
cleaned_metadata = pd.read_csv(
    "../final-dataset/metadata/recipes5k_metadata.csv", sep="\t"
)

In [3]:
all_categories = cleaned_metadata["Category"].unique()

In [4]:
category_ingredients_tally = defaultdict()
for category in all_categories:
    all_rows = cleaned_metadata[cleaned_metadata["Category"] == category]
    ingredients_tally = Counter()
    average_ingredients = 0
    for ingredients in all_rows["Ingredients"]:
        ingredients_list = ingredients.split(",")
        average_ingredients += len(ingredients_list)
        ingredients_tally.update(ingredients_list)
    average_ingredients = average_ingredients // len(all_rows)
    print(f"Category : {category}, Average Ingredients : {average_ingredients}")
    category_ingredients_tally[category] = [average_ingredients, ingredients_tally]

Category : apple_pie, Average Ingredients : 8
Category : croque_madame, Average Ingredients : 10
Category : paella, Average Ingredients : 16
Category : gyoza, Average Ingredients : 12
Category : crab_cakes, Average Ingredients : 11
Category : carrot_cake, Average Ingredients : 13
Category : chicken_curry, Average Ingredients : 12
Category : steak, Average Ingredients : 6
Category : cannoli, Average Ingredients : 10
Category : ceviche, Average Ingredients : 10
Category : shrimp_and_grits, Average Ingredients : 13
Category : hummus, Average Ingredients : 8
Category : hot_and_sour_soup, Average Ingredients : 15
Category : nachos, Average Ingredients : 11
Category : creme_brulee, Average Ingredients : 4
Category : hamburger, Average Ingredients : 9
Category : beef_tartare, Average Ingredients : 12
Category : fish_and_chips, Average Ingredients : 10
Category : falafel, Average Ingredients : 13
Category : beet_salad, Average Ingredients : 8
Category : samosa, Average Ingredients : 14
Categor

In [60]:
# Manually fixed erros below
# Food 101 directory name : tacos , Recipes5k directory name : beef_tacos
# Food 101 directory name : cup_cakes, Recipes5k directory name : cupcakes
# Food 101 directory name : ice_cream, Recipes5k directory name : chocolate_ice_cream
# Correct the directory name of Food101 to ensure smooth integration

## Build food101 dataset 

In [5]:
ingredients_nutrition_table = pd.read_csv(
    "../final-dataset/metadata/recipes5k_ingredients_nutrition_table.csv", sep="\t"
)

In [6]:
food101_images_dir = Path("./images")
categories_dir = [x for x in food101_images_dir.iterdir()]

In [7]:
def move_image(src_path):
    category = src_path.parent.name
    parent_dest_dir = Path("../final-dataset/images")
    dir_name = category
    # check if dir containing file exists in destination
    dest_dir = parent_dest_dir / dir_name
    if not dest_dir.exists():
        dest_dir.mkdir()
    file_name = "f101_" + src_path.name
    dest_path = dest_dir / file_name
    try :
        assert dest_path.exists() == False, "{path} already exists".format(
            path=dest_path.as_posix()
        )
        shutil.copy(src_path, dest_path)
    except :
        pass
    return (dir_name, file_name)


def get_ingredients(
    category, category_ingredients_tally=category_ingredients_tally, top_n_percent=0.8
):
    average_ingredients_per_recipe, ingredients_counter = category_ingredients_tally[
        category
    ]
    ingredients_counter = ingredients_counter.most_common()
    top_n = int(average_ingredients_per_recipe * top_n_percent)
    random_n = average_ingredients_per_recipe - top_n
    top_n_ingredients = ingredients_counter[:top_n]
    random_n_ingredients = random.choices(ingredients_counter[top_n:], k=random_n)
    top_n_ingredients.extend(random_n_ingredients)
    ingredients_list = [a for a, b in top_n_ingredients]
    return ingredients_list


def get_dish_nutrition(ingredients_list, nutrition_table=ingredients_nutrition_table):
    total_carbs = 0
    total_protein = 0
    total_fat = 0
    total_calorie = 0
    total_ingredient = 0
    for ingredient in ingredients_list:
        row_in_nutrition_table = nutrition_table[
            nutrition_table["Ingredient"] == ingredient
        ]
        if len(row_in_nutrition_table) == 0:
            continue
        total_ingredient += 1
        total_calorie += row_in_nutrition_table["Calorie (kcal)"].values[0]
        total_carbs += row_in_nutrition_table["Carbs (g)"].values[0]
        total_protein += row_in_nutrition_table["Protein (g)"].values[0]
        total_fat += row_in_nutrition_table["Fat (g)"].values[0]
    return [
        total_calorie / total_ingredient,
        total_carbs / total_ingredient,
        total_protein / total_ingredient,
        total_fat / total_ingredient,
        ",".join(ingredients_list),
    ]


def preprocess(all_categories_dir):
    dataframes_each_category = []
    for category_dir in all_categories_dir:
        rows = []
        random.seed(999)  # for reproducibility
        for file in category_dir.iterdir():
            category, filename = move_image(file)
            ingredients_list = get_ingredients(category)
            nutrition = get_dish_nutrition(ingredients_list)
            row = [filename, category]
            row.extend(nutrition)
            rows.append(row)
        dataframes_each_category.append(
            pd.DataFrame(
                rows,
                columns=[
                    "ID/File Name",
                    "Category",
                    "Calorie(kcal)",
                    "Carbohydrate(g)",
                    "Protein(g)",
                    "Fat(g)",
                    "Ingredients",
                ],
            )
        )
    return pd.concat(dataframes_each_category)

In [12]:
metadata = preprocess(categories_dir)
display(metadata)

Unnamed: 0,ID/File Name,Category,Calorie(kcal),Carbohydrate(g),Protein(g),Fat(g),Ingredients
0,f101_1005649.jpg,apple_pie,2.794333,0.318833,0.039667,0.154500,"apple,cinnamon,flour,sugar,butter,salt,puff sh..."
1,f101_1011328.jpg,apple_pie,3.042200,0.380200,0.022600,0.164400,"apple,cinnamon,flour,sugar,butter,salt,arrowro..."
2,f101_101251.jpg,apple_pie,2.230143,0.284857,0.019000,0.117714,"apple,cinnamon,flour,sugar,butter,salt,pepper,..."
3,f101_1014775.jpg,apple_pie,3.161833,0.438500,0.038833,0.148167,"apple,cinnamon,flour,sugar,butter,salt,cornfla..."
4,f101_1026328.jpg,apple_pie,2.968500,0.425167,0.019000,0.137167,"apple,cinnamon,flour,sugar,butter,salt,puff sh..."
...,...,...,...,...,...,...,...
995,f101_981485.jpg,waffles,2.165750,0.245375,0.034625,0.118375,"egg,flour,salt,sugar,butter,milk,berries,vinegar"
996,f101_98238.jpg,waffles,2.668250,0.247000,0.065875,0.159625,"egg,flour,salt,sugar,butter,milk,berries,cheese"
997,f101_982668.jpg,waffles,3.665000,0.327000,0.042500,0.245250,"egg,flour,salt,sugar,butter,milk,oil,cornmeal"
998,f101_995085.jpg,waffles,2.723714,0.260429,0.038571,0.134857,"egg,flour,salt,sugar,butter,milk,gin,tartar"


In [19]:
# write to csv
metadata.to_csv(
    "../final-dataset/metadata/food101_metadata.csv",
    sep="\t",
    index=False,
    float_format="%.4f",
)

## Testing

In [8]:
category_ingredients_tally["apple_pie"][1].most_common()[:10]

[('apple', 53),
 ('cinnamon', 49),
 ('flour', 40),
 ('sugar', 38),
 ('butter', 30),
 ('salt', 24),
 ('pie', 23),
 ('nut', 22),
 ('lemon', 21),
 ('egg', 13)]

In [9]:
get_ingredients("apple_pie")

['apple', 'cinnamon', 'flour', 'sugar', 'butter', 'salt', 'date', 'maple']

In [19]:
demo_ingredients = get_ingredients("apple_pie")
get_dish_nutrition(demo_ingredients, ingredients_nutrition_table)

[2.9062, 0.6426000000000001, 0.05500000000000001, 0.0164]