In [1]:
import sys

sys.path.insert(0, "../../Ingredient Embeddings")

In [2]:
import shutil
import uuid
from pathlib import Path
from pprint import pprint

import pandas as pd
from ipynb.fs.full.ingredient_embeddings_similarity import exported as Ingredients
from IPython.display import Image

Module ingredient_embeddings_similarity.ipynb is loaded


In [3]:
# Load the three raw annotation files. Each variable holds the file's lines
# in order, with the trailing newline characters still attached.
with open("./annotations/train_images.txt", "r") as file:
    train_images_txt = list(file)

with open("./annotations/train_labels.txt", "r") as file:
    train_labels_txt = list(file)

with open("./annotations/ingredients_simplified_Recipes5k.txt", "r") as file:
    ingredients_simplified_txt = list(file)

In [4]:
# Simplified ingredient names that are discarded outright:
# preprocess_ingredients returns None for any name found in this set.
# Every entry needs its own trailing comma; adjacent string literals without
# one are silently concatenated into a single (wrong) entry.
filtered = {
    "& half",
    "asian",
    "baking",
    "balls",
    "barbecue",
    "bbq",
    "blackening",
    "bulb",
    "crisps",
    "chop",
    "coarse",
    "concentrate",
    "double",
    "dogs",
    "dress russian",
    "fat",
    "fri",
    "fry",
    "heart",
    "italian",
    "jack",
    "kahl\\u00faa",
    "ling",
    "meal",
    "mexican",
    "mie",
    "min",
    "nonstick spray",
    "non stick spray",
    "organic",
    "pan drippings",
    "p\\u00e2t\\u00e9",
    "port",
    "protein",
    "preserves",
    "regular",
    "riso",
    "rocket",
    "rounds",
    "rub",
    "squash",
    "smoke",
    "spring",
    "sponge",
    "squirt",
    "stew",
    "stout",
    "v8",
    "vitamin",
    "well",
    "whipped",
    "whipped topping",
    "whipping",
}
mapped = {
    "abura age": "tofu",
    "aburage": "tofu",
    "ahi": "ahi tuna",
    "aioli": "garlic",
    "angel hair": "spaghetti",
    "allspice": "pepper",
    "asiago": "asiago cheese",
    "ancho": "anchovies",
    "anchovy": "anchovies",
    "bay": "bay leaf",
    "ball park franks": "frankfurters",
    "bawang goreng": "onion",
    "beet": "beets",
    "beans": "bean",
    "biscuit": "biscuits",
    "brewed espresso": "espresso",
    "bottom round": "bottom round steak",
    "bulk italian sausag": "italian sausag",
    "ch\\u00e8vre": "chevre cheese",
    "chevre": "chevre cheese",
    "chilli": "chili",
    "chuck": "chuck roast",
    "clam": "clams",
    "clove": "cloves",
    "cola": "coke",
    "country crock\\u00ae spread": "butter",
    "cr\\u00e8me de menthe": "creme de menthe",
    "cr\\u00e8me fra\\u00eeche": "creme fraiche",
    "crackers": "crackers",
    "cracker meal": "crackers",
    "corn-on-the-cob": "corncobs",
    "cornflake": "cornflakes",
    "cold water": "water",
    "dijon": "dijon mustard",
    "deveined shrimp": "shrimp",
    "dutch process cocoa": "cocoa",
    "dutch-processed cocoa": "cocoa",
    "fettucine": "fettuccine",
    "filo dough": "phyllo dough",
    "fillo dough": "phyllo dough",
    "flank": "flank steak",
    "flax meal": "flaxseed",
    "flax": "flaxseed",
    "frankfurter": "frankfurters",
    "gew\\u00fcrztraminer": "gewurztraminer",
    "gelatine": "gelatin",
    "green chile": "green chilli",
    "green chiles": "green chilli",
    "green chilies": "green chilli",
    "grit": "grits",
    "gyoza skins": "gyoza wrappers",
    "hellmann' best food mayonnais": "mayonnaise",
    "hellmann' best food real mayonnais": "mayonnaise",
    "hoagi rolls": "hoagi roll",
    "kampyo": "dried gourd strips",
    "kanpyo": "dried gourd strips",
    "kecap manis": "indonesian sweet soy sauce",
    "kernel corn": "corn kernel",
    "kim chee": "kimchi",
    "lady fingers": "ladyfingers",
    "liqueur": "liquor",
    "luke warm water": "water",
    "mccormick\\u00ae vanilla": "vanilla extract",
    "mirin": "rice wine",
    "mission\\u00ae gluten tortillas": "tortillas",
    "neufch\\u00e2tel": "neufchatel cheese",
    "mussels": "mussel",
    "old el paso\\u2122 green chiles": "green chiles",
    "oreo\\u00ae cookies": "oreo cookies",
    "pillsbury\\u2122 crescent dinner rolls": "dinner rolls",
    "phyllo": "phyllo dough",
    "poblano": "poblano chiles",
    "pretzels": "pretzel",
    "quickcooking grits": "grits",
    "rose water": "rosewater",
    "reese's": "reese's peanut butter cups",
    "sandwiches": "sandwich",
    "sushi grade tuna": "ahi tuna",
    "serrano chilies": "serrano chile",
    "shell": "shells",
    "shell-on shrimp": "shrimp",
    "snickers": "snickers bars",
    "stouffer''s lasagna": "lasagna",
    "store-bought ravioli": "ravioli",
    "tonkatsu": "pork cutlet",
    "top ramen": "ramen",
    "thai chile": "thai chili",
    "turbinado": "brown sugar",
    "uncook shrimp": "shrimp",
    "waffles": "waffle",
    "warm water": "water",
    "won ton wrappers": "wonton wrappers",
    "wonton skins": "wonton wrappers",
    "yoghurt": "yogurt",
    "wish chunki blue chees dress": "blue cheese dressing",
    "yellow food coloring": "food colouring",
}


def get_unique_ingredients(ingredient_list):
    """Collect the distinct ingredient names found in ``ingredient_list``.

    Parameters
    ----------
    ingredient_list : iterable of str
        One entry per recipe; each entry is a comma-separated string of
        ingredient names. Surrounding whitespace (including the trailing
        newline) is stripped before splitting.

    Returns
    -------
    list of str
        The unique names, sorted so the result is identical on every run
        (iteration order of a set of strings is not stable across
        interpreter restarts).
    """
    unique_ingredient = set()
    # Iterate the argument that was passed in, not a notebook-level global.
    for line in ingredient_list:
        unique_ingredient.update(line.strip().split(","))
    return sorted(unique_ingredient)


def preprocess_ingredients(ingredient, filtered=filtered, mapped=mapped):
    """Normalise one ingredient name.

    Returns ``None`` when ``ingredient`` is a member of ``filtered``.
    Otherwise returns the replacement stored under that name in ``mapped``,
    or the name unchanged when ``mapped`` has no (non-``None``) entry for it.
    """
    if ingredient not in filtered:
        replacement = mapped.get(ingredient)
        return ingredient if replacement is None else replacement
    return None


def preprocess_all_ingredients(ingredients_list):
    """Normalise every name in ``ingredients_list`` and de-duplicate the result.

    Each name is passed through ``preprocess_ingredients``; names for which
    it returns ``None`` are dropped.

    Returns
    -------
    list of str
        The distinct normalised names, sorted so the order is reproducible
        across runs (set iteration order for strings is not).
    """
    unique_ingredient = set()
    for ingredient in ingredients_list:
        new_name = preprocess_ingredients(ingredient)
        # Identity check: None is the "drop this ingredient" sentinel.
        if new_name is not None:
            unique_ingredient.add(new_name)
    return sorted(unique_ingredient)


def build_nutrition_table(ingredients_list, similarity_threshold=0.7):
    """Build a per-ingredient nutrition table from the closest reference match.

    For every ingredient an embedding is computed with ``Ingredients.embed``
    and the most similar entry is looked up in both the FNDDS and the
    Nutrition5k references. The FNDDS entry is used when its similarity
    exceeds ``similarity_threshold`` and is strictly greater than the
    Nutrition5k similarity; otherwise the Nutrition5k entry is used when its
    similarity exceeds the threshold; otherwise the nutrient cells are ``None``.

    Parameters
    ----------
    ingredients_list : iterable of str
        Ingredient names to look up.
    similarity_threshold : float, default 0.7
        Minimum (exclusive) similarity a match needs to be accepted.

    Returns
    -------
    pandas.DataFrame
        One row per input ingredient with the columns ``Ingredient``,
        ``Calorie (kcal)``, ``Carbs (g)``, ``Protein (g)`` and ``Fat (g)``.
    """
    columns = ["Ingredient", "Calorie (kcal)", "Carbs (g)", "Protein (g)", "Fat (g)"]
    rows = []
    for ingredient in ingredients_list:
        embedding = Ingredients.embed([ingredient])
        (
            fndds_similarity,
            fndds_index,
            fndds_category,
        ) = Ingredients.get_most_similar_from_fndds(embedding)
        (
            nutrition5k_similarity,
            nutrition5k_index,
        ) = Ingredients.get_most_similar_from_nutrition5k(embedding)

        if (
            fndds_similarity > similarity_threshold
            and fndds_similarity > nutrition5k_similarity
        ):
            ingredient_nutrition = Ingredients.get_ingredient_nutrient_from_fndds(
                fndds_category, fndds_index
            )
            # NOTE(review): the FNDDS fields and the Nutrition5k fields
            # ("cal/g", ...) land in the same columns; confirm both sources
            # use the same basis (per gram vs. per serving/100 g).
            row = [
                ingredient,
                ingredient_nutrition["Energy (kcal)"],
                ingredient_nutrition["Carbohydrate (g)"],
                ingredient_nutrition["Protein (g)"],
                ingredient_nutrition["Total Fat (g)"],
            ]
        elif nutrition5k_similarity > similarity_threshold:
            ingredient_nutrition = Ingredients.get_ingredient_nutrient_from_nutrition5k(
                nutrition5k_index
            )
            row = [
                ingredient,
                ingredient_nutrition["cal/g"],
                ingredient_nutrition["carb(g)"],
                ingredient_nutrition["protein(g)"],
                ingredient_nutrition["fat(g)"],
            ]
        else:
            # No sufficiently similar reference entry: keep the ingredient
            # but leave its nutrient values empty.
            row = [ingredient, None, None, None, None]
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)

In [5]:
# Distinct ingredient names across all recipes, after normalisation
# (filtered names removed, mapped names replaced).
unique_ingredients = preprocess_all_ingredients(
    get_unique_ingredients(ingredients_simplified_txt)
)

In [7]:
# Look up nutrition values for every entry of `unique_ingredients`; rows
# without an accepted match carry None in the nutrient columns.
nutrition_table = build_nutrition_table(unique_ingredients)

In [13]:
# Keep only the ingredients for which a calorie value was found.
notnull_mask = nutrition_table["Calorie (kcal)"].notna()
nutrition_table = nutrition_table.loc[notnull_mask]

In [16]:
# Display the nutrition table after rows without calorie values were removed.
nutrition_table

Unnamed: 0,Ingredient,Calorie (kcal),Carbs (g),Protein (g),Fat (g)
3,agave nectar,3.100,0.760,0.001,0.005
4,ahi tuna,1.300,0.000,0.290,0.006
6,albacore,1.300,0.000,0.290,0.006
7,albacore tuna water,1.300,0.000,0.290,0.006
8,ale,0.430,0.036,0.005,0.000
...,...,...,...,...,...
882,yellow cornmeal,3.700,0.793,0.070,0.018
883,yellowfin tuna,1.300,0.000,0.290,0.006
885,yogurt,0.589,0.036,0.100,0.004
886,yolk,3.228,0.036,0.159,0.267


In [34]:
def move_image(path, src_dir="./images", parent_dest_dir="../final-dataset/images"):
    """Copy one image into the final dataset under a freshly generated name.

    Despite the name, the source file is copied (``shutil.copy``), not moved.
    The copy is placed in ``parent_dest_dir/<source folder name>/`` and keeps
    the original file extension; its stem is replaced by the decimal integer
    value of a random UUID.

    Parameters
    ----------
    path : str
        Image path relative to ``src_dir``; surrounding whitespace (such as a
        trailing newline from a text file) is stripped.
    src_dir : str or Path, default "./images"
        Directory the relative ``path`` is resolved against.
    parent_dest_dir : str or Path, default "../final-dataset/images"
        Root directory of the destination dataset.

    Returns
    -------
    str
        The generated file stem (without extension).
    """
    src_path = Path(src_dir) / path.strip()
    file_name = str(uuid.uuid4().int)

    # Mirror the source folder name in the destination. `parents=True` also
    # creates a missing destination root, and `exist_ok=True` removes the
    # separate exists-check (and the race that came with it).
    dest_dir = Path(parent_dest_dir) / src_path.parent.name
    dest_dir.mkdir(parents=True, exist_ok=True)

    dest_path = (dest_dir / src_path.name).with_stem(file_name)
    shutil.copy(src_path, dest_path)
    return file_name


def get_dish_nutrition(index, ingredients_lookup):
    """Summarise the nutrition of the recipe stored at ``ingredients_lookup[index]``.

    The entry is a comma-separated ingredient string. Every ingredient is
    normalised with ``preprocess_ingredients`` (dropped when that returns
    ``None``) and then looked up by name in the notebook-level
    ``nutrition_table``. Nutrient values are averaged over the ingredients
    that were found in the table; ingredients without a table row are still
    listed in the returned name string but do not contribute to the averages.

    Parameters
    ----------
    index : int
        Position of the recipe in ``ingredients_lookup``.
    ingredients_lookup : sequence of str
        One comma-separated ingredient string per recipe.

    Returns
    -------
    list
        ``[names, calorie, carbs, protein, fat]`` where ``names`` is the
        comma-joined list of kept ingredient names and the other four items
        are per-ingredient means. When none of the ingredients is present in
        ``nutrition_table`` the four nutrient items are ``None`` (instead of
        raising ``ZeroDivisionError``).
    """
    nutrient_columns = ("Calorie (kcal)", "Carbs (g)", "Protein (g)", "Fat (g)")
    totals = [0, 0, 0, 0]
    total_ingredient = 0
    final_ingredients = []

    for ingredient in ingredients_lookup[index].strip().split(","):
        ingredient_name = preprocess_ingredients(ingredient)
        if ingredient_name is None:
            continue
        final_ingredients.append(ingredient_name)
        row_in_nutrition_table = nutrition_table[
            nutrition_table["Ingredient"] == ingredient_name
        ]
        if len(row_in_nutrition_table) == 0:
            continue
        total_ingredient += 1
        # Only the first matching row is used if the name occurs repeatedly.
        for position, column in enumerate(nutrient_columns):
            totals[position] += row_in_nutrition_table[column].values[0]

    names = ",".join(final_ingredients)
    if total_ingredient == 0:
        # Nothing to average over: report missing values rather than divide by 0.
        return [names, None, None, None, None]
    return [names, *(total / total_ingredient for total in totals)]


def preprocess(image_path, labels):
    """Unfinished dataset-building step.

    Currently only passes the first entry of ``image_path`` to ``move_image``.
    ``labels`` and the local ``rows`` list are not used yet, and nothing is
    returned.
    """
    rows = []
    first_entry_only = image_path[:1]
    for relative_path in first_entry_only:
        move_image(relative_path)

        # getting ingredients and calculate the nutrient

In [36]:
# Spot-check: ingredient names and averaged nutrient values for the recipe at index 2.
get_dish_nutrition(2, ingredients_simplified_txt)

['onion,flour,egg,bread,oil', 3.4228, 0.257, 0.0746, 0.23159999999999997]