In [1]:
import os
import pickle
import pandas as pd
import json
import pprint

from calc_recipe_ingredient_info_distances import collectSomeRecipeRecommendations, get_all_comments, get_all_mutual_info, evalRecommendations, getRecommendationsBasedOnMutualInformationRole, get_graph_nodes, get_all_gt_recipes, get_recipes_per_ingredient, get_recipes_per_ingredient_pairs, get_all_frequencies,getNaiveBayesRecommendations

from recipe_clustering_helpers import (
    create_one_hot_ingredients_per_df, getRecipeIdsForSubTuples, get_hamming_distances
)

In [2]:
ORDERED_RECIPE_IDS_PATH = os.path.abspath("./outputs/sorted_recipe_ids_list.pkl")
TRAIN_COMMENTS_PATH = os.path.abspath("./inputs/train_comments_subs.pkl") # train recipes with substitutions
TEST_COMMENTS_PATH = os.path.abspath("./inputs/test_comments_subs.pkl") # test recipes with substitutions
VAL_COMMENTS_PATH = os.path.abspath("./inputs/val_comments_subs.pkl") # validation recipes with substitutions
GRAPH_NODES_PATH = os.path.abspath("./inputs/graph/nodes_191120.csv")

MUTUAL_INFO_DICT_PATH = os.path.abspath("./outputs/mutual_info_dict_with_self_info.pkl")
RECIPES_PER_INGREDIENT_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_small.pkl"
)
RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_pairs_small.pkl"
)
PROCESSED_RECIPES_PATH = os.path.abspath("./outputs/processed_recipes.pkl")
PATH_ONE_HOT_RECIPE_INGREDIENTS = os.path.abspath("./outputs/one_hot_recipe_ingredients.pkl")

EXTENDED_RECIPES_PATH = os.path.abspath("./inputs/extended_recipes_with_instructions_and_titles.json")



In [3]:

if os.path.isfile(ORDERED_RECIPE_IDS_PATH):
    with open(ORDERED_RECIPE_IDS_PATH, "rb") as file:
        ordered_recipe_ids = pickle.load(file)
recipe_ids_with_ranks = ordered_recipe_ids
ordered_recipe_ids = [recipe[1] for recipe in recipe_ids_with_ranks]

# recipes with comments, lists, names, and gt_truths
with open(EXTENDED_RECIPES_PATH, 'r') as recipe_extended_with_original_info:
    extended_recipes = json.load(recipe_extended_with_original_info)
# extended_recipes = get_all_comments(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH, VAL_COMMENTS_PATH, EXTENDED_RECIPES_PATH)
recipes_extended_dict = {recipe["id"]:recipe for recipe in extended_recipes}


# recipe ingredient df one hot
# INGREDIENT_RECIPE_MATRIX_PATH = os.path.abspath("./outputs/ingredient_recipe_matrix.pkl")
# recipe_ingredient_df = get_recipe_ingredient_df( ingredients, recipes, recipes_per_ingredient, INGREDIENT_RECIPE_MATRIX_PATH)
# recipe_ingredient_df_bool = recipe_ingredient_df.astype(bool)

with open(MUTUAL_INFO_DICT_PATH, "rb") as mutual_info_dict_file:
            mutual_info_dict = pickle.load(mutual_info_dict_file)

ingredients = get_graph_nodes(GRAPH_NODES_PATH)

recipes = get_all_gt_recipes(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH,
                                VAL_COMMENTS_PATH, PROCESSED_RECIPES_PATH)
recipes_per_ingredient = get_recipes_per_ingredient(
    ingredients, recipes, RECIPES_PER_INGREDIENT_SMALL_PATH)

recipes_per_ingredient_pairs = get_recipes_per_ingredient_pairs(
    recipes_per_ingredient, RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH)
recipe_ingredient_counts, recipe_ingredient_pair_counts = get_all_frequencies(recipes_per_ingredient,
                                        recipes_per_ingredient_pairs)


# Checking user comments GT substitution pairs

## Which GT tuple ingredients are not represented as nodes?

In [4]:
all_gt_substitution_tuples = [recipe['subs'] for recipe in extended_recipes]
all_gt_substitution_tuples = sorted(all_gt_substitution_tuples, key=lambda x: x[0])
all_gt_substitution_tuples
print(f"number of GT substitution tuples from user comments: {len(all_gt_substitution_tuples)}")
ingredients_that_are_not_nodes = []
for sub_tuple in all_gt_substitution_tuples:
    if sub_tuple[0] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[0])
    if sub_tuple[1] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[1])
print(f"number of ingredients that are not nodes: {len(ingredients_that_are_not_nodes)}")
print(f"list of ingredients that are not nodes: {list(set(ingredients_that_are_not_nodes))}")

number of GT substitution tuples from user comments: 70520
number of ingredients that are not nodes: 2583
list of ingredients that are not nodes: ['yogurt_covered_raisins', 'mustard_greens', 'roasted_sunflower_seeds', 'apricot_pineapple_preserves', 'margarine_spread', 'spinach_leaves', 'cilantro_leaves', 'whole_grain_rice', 'soy_protein', 'refrigerated_crescent_dinner_rolls', 'chicken_breast_fillet', 'jerk_sauce', 'lemon_marmalade', 'greens', 'peach_baby_food', 'pineapple_juice_concentrate', 'minute_white_rice', 'powdered_turmeric', 'liquid_shortening', 'sweet_almond_oil', 'hot_ketchup', 'whole_wheat_orzo', 'nestle_milk_chocolate_and_peanut_butter_swirled_morsels', 'chocolate_chip_ice_cream', 'dry_crushed_red_pepper', 'saltines', 'ginger_puree', 'frozen_tater_tots', 'soy_sour_cream', 'reduced_fat_cream_cheese', 'pork_gravy_mix', 'red_pepper_pod', 'chocolate_malt_powder', 'lemon_infused_olive_oil', 'green_chartreuse_liqueur', 'popcorn_shrimp', 'graham_cracker_sticks', 'beef_ravioli', 'b

## How often do ground truth substitution tuples appear in recipes?

In [5]:
ground_truth_tuple_recipes = getRecipeIdsForSubTuples(extended_recipes)
ground_truth_tuple_counts = {gt_sub: len(recipe_list) for gt_sub, recipe_list in ground_truth_tuple_recipes.items()}
ground_truth_tuple_counts = dict(sorted(ground_truth_tuple_counts.items(), key=lambda item: item[1], reverse=True))


In [6]:
print(list(ground_truth_tuple_counts.items())[:5])
print(list(ground_truth_tuple_counts.items())[-5:])
#('ground_beef', 'ground_turkey'): 253,
#('vegetable_oil', 'applesauce'): 240,
#('heavy_cream', 'milk'): 219,
#('plain_yogurt', 'sour_cream'): 168,
#('butter', 'applesauce'): 133, # could be interesting because it will not always work
#('parsley', 'cilantro'): 132, # this too
#('salt', 'garlic_salt'): 115,

#('beef', 'boneless_skinless_chicken_breast'): 8,
#('beer', 'beef_broth'): 8,
#('frozen_spinach', 'broccoli'): 8,
#('red_wine', 'red_wine_vinegar'): 8,
#('lean_ground_beef', 'veggie_crumbles'): 8,
#('chicken', 'ground_turkey'): 2, # rare but should often fit, except the preparation could make it different?
#('carrot', 'green_bell_pepper'): 8,
#('mushroom', 'red_bell_pepper'): 7,


[(('butter', 'olive_oil'), 547), (('water', 'chicken_broth'), 412), (('walnut', 'pecan'), 392), (('sugar', 'honey'), 373), (('margarine', 'butter'), 349)]
[(('dried_peppermint', 'dried_marjoram'), 1), (('unsweetened_applesauce', 'low_fat_plain_yogurt'), 1), (('vegetable_stock', 'green_bean'), 1), (('diced_tomato', 'potato'), 1), (('dried_thyme', 'mixed_herb'), 1)]


In [7]:
print(ground_truth_tuple_recipes[('egg_noodle', 'macaroni_noodle')])
print(ground_truth_tuple_counts[('egg_noodle', 'macaroni_noodle')])

['32ac082c31', 'dd43625438', '0fe9a1105b', 'beb22afa59', '99ae52fa77', '5c7aeac637', 'f472881b2e', '4f73ae2f24']
8


In [8]:
one_hot_recipe_ingredients, failed_recipes = create_one_hot_ingredients_per_df(extended_recipes, ingredients, PATH_ONE_HOT_RECIPE_INGREDIENTS)
failed_recipes

[]

In [9]:
recipe_id = "ffb1c2b5ad"
distances_hamming = get_hamming_distances(recipe_id, one_hot_recipe_ingredients)


In [10]:
distances_hamming = list(set(distances_hamming))
distances_hamming = sorted(distances_hamming, key=lambda x: x[1])
distances_hamming

[('999d8c4b0d', 7),
 ('4b6a3ba513', 7),
 ('3ce8bc4f90', 8),
 ('b23374a4b9', 8),
 ('b26281c482', 8),
 ('1602aa6218', 8),
 ('d500143819', 8),
 ('814df4eedd', 8),
 ('3abd84b687', 8),
 ('45a362206d', 8),
 ('703bafb843', 8),
 ('48dc15499a', 8),
 ('06f24af2ac', 8),
 ('9297c2871e', 8),
 ('e0702bd6b5', 9),
 ('76c50a9298', 9),
 ('788aaab39a', 9),
 ('0cb46cd1b6', 9),
 ('f1d1c4dd5a', 9),
 ('8f55c987f3', 9),
 ('ac88507fc0', 9),
 ('669a68b20b', 9),
 ('1bfacdd84b', 9),
 ('b4d4181e59', 9),
 ('9186f9c766', 9),
 ('836b5e02d8', 9),
 ('bef14a917a', 9),
 ('44353457b0', 9),
 ('10c95542ba', 9),
 ('0f7fa9360a', 9),
 ('e84d6756ee', 9),
 ('ae4f8b7915', 9),
 ('2196780e17', 9),
 ('80c61e32a8', 9),
 ('a5118277b6', 9),
 ('e76ad6f09a', 9),
 ('bfa5a44e0a', 9),
 ('74c68c90da', 9),
 ('5ac33cedde', 9),
 ('5274a67915', 9),
 ('3b779cd38b', 9),
 ('1ed61bae3f', 9),
 ('81efe2efbd', 9),
 ('b17964c27e', 9),
 ('a8588618df', 9),
 ('4b004edfad', 9),
 ('a3d78df746', 9),
 ('af0f5ab387', 9),
 ('e1015b6199', 9),
 ('6438193959', 9),


In [11]:
def getRecipeFromComments(recipeId, allComments):
    recipe = None
    for commentRecipe in allComments:
        gt_ingredient_found = False
        comment_recipe_id = commentRecipe['id']
        if recipeId == comment_recipe_id:
            if recipe is None:
                recipe = {}
                recipe["id"] = comment_recipe_id
                recipe["subs"] = []
                recipe["ingredients"] = commentRecipe["ingredients"]

            if recipe is not None:
                recipe["subs"].append(commentRecipe["subs"])

    return recipe

# def getRecipeIdsForGTSub(gt_sub, extended_recipes):
#     recipe_ids = []
#     for recipe in extended_recipes:
#         recipe_id = recipe["id"]
#         subs = recipe["subs"]
#         if isinstance(subs, list) and gt_sub in subs:
#             recipe_ids.append(recipe_id)
#         elif gt_sub == subs:
#             recipe_ids.append(recipe_id)
#     return recipe_ids

In [12]:
# sample_gt_sub = ('chicken', 'ground_turkey')
# sample_recipe_ids = getRecipeIdsForGTSub(sample_gt_sub, extended_recipes)
# sample_recipe_ids

In [13]:
# r1 = getRecipeFromComments("ffb1c2b5ad", extended_recipes)
# r2 = getRecipeFromComments("999d8c4b0d", extended_recipes)
# r3 = getRecipeFromComments("4b6a3ba513", extended_recipes)

print(json.dumps(recipes_extended_dict["ffb1c2b5ad"], indent=4))

{
    "id": "ffb1c2b5ad",
    "ingredients": [
        [
            "flour"
        ],
        [
            "puff_pastry_sheet",
            "puff_pastry_sheets"
        ],
        [
            "tuna"
        ],
        [
            "onion",
            "onions",
            "round_onion"
        ],
        [
            "celery"
        ],
        [
            "mayonnaise",
            "lemon_mayonnaise",
            "wasabi_mayonnaise",
            "canola_oil_mayonnaise"
        ],
        [
            "salt_and_pepper"
        ],
        [
            "american_cheese"
        ],
        [
            "egg_wash"
        ]
    ],
    "subs": [
        "onion",
        "scallion"
    ],
    "instructions": [
        "Preheat the oven to 375F",
        "On a floured surface, roll out puff pastry sheets into 12- by 18-inch rectangles.",
        "Slide them onto a baking pan and place in the refrigerator.",
        "Let them settle about one hour, until firm.",
        "While the 

## Check for some Common side ingredients

In [68]:
PASTA = ["pasta", "noodle"]
FLOUR = ["flour"]
RICE = ["rice"]
BREAD = ["bread"]
POTATO = ["potato"]
OIL = ["oil"]
OIL = ["oil", "butter", "cream"]
SPICES = ["salt", "sugar", "celery"] #maybe pepper? maybe mustard? but they could miss many interesting recipes but we can remove pepper for pepper for sure
MISC = ["water", "egg", "baking_powder", "baking_soda", "juice", "extract", "seed", "mix", "syrup", "milk"]

BLACKLIST_PHRASES = [*PASTA, *FLOUR, *RICE, *BREAD, *POTATO, *OIL, *SPICES, *MISC]
print(f"blacklist phrases: {BLACKLIST_PHRASES}")

def getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, common_side_ingredients, exclude_keywords):
    matched_ingredients = []
    for ingredient in ingredients:
        for side_ingredient in common_side_ingredients:
            if len(exclude_keywords) < 1:
                if side_ingredient in ingredient and ingredient not in matched_ingredients:
                    matched_ingredients.append(ingredient)
            else:
                for exclude_keyword in exclude_keywords:
                    if side_ingredient in ingredient and ingredient not in matched_ingredients and exclude_keyword not in ingredient:
                        matched_ingredients.append(ingredient)
    return matched_ingredients

def checkIngredientContainsBlacklistPhrase(ingredient, blacklist_phrases, whitelist_phrases = []):
    for blacklist_phrase in blacklist_phrases:
        if blacklist_phrase in ingredient:
            for whitelist_phrase in whitelist_phrases:
                if whitelist_phrase in ingredient:
                    return False
            return True
    return False

checked_for_sugga = checkIngredientContainsBlacklistPhrase("sugar, granulated", BLACKLIST_PHRASES)
print(f"Did granulated sugar get blacklisted? {checked_for_sugga}")
checked_for_steve = checkIngredientContainsBlacklistPhrase("jobs, steve", BLACKLIST_PHRASES)
print(f"Did steve jobs get blacklisted? {checked_for_steve}")

pasta_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, PASTA, ["sauce"])
flour_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, FLOUR, [])
rice_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, RICE, [])
bread_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, BREAD, [])
potato_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, POTATO, [])
oil_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, OIL, [])
spices_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, SPICES, [])
misc_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, MISC, [])


print(len(pasta_matched_ingredients))
print(len(flour_matched_ingredients))
print(len(rice_matched_ingredients))
print(len(bread_matched_ingredients))
print(len(potato_matched_ingredients))
print(len(oil_matched_ingredients))
print(len(spices_matched_ingredients))
print(len(misc_matched_ingredients))

blacklist phrases: ['pasta', 'noodle', 'flour', 'rice', 'bread', 'potato', 'oil', 'butter', 'cream', 'salt', 'sugar', 'celery', 'water', 'egg', 'baking_powder', 'baking_soda', 'juice', 'extract', 'seed', 'mix', 'syrup', 'milk']
Did granulated sugar get blacklisted? True
Did steve jobs get blacklisted? False
106
95
106
128
64
349
198
691


# Generating Recipe Set
1. get extended recipes
1. Filter GT samples according to frequency of pairs, ingredients, baseline model prediction quality
1. Fitler recipes based on if the gt source is in the recipe (? This is more relevant for the extension of the recipe)
1. Filter based on trivial substitutions (but how?)
1. Filter based on assumed importance of the ingredient to the recipe (but how?)
1. Filter according to additional features, like substituting one food class with another or with the same food class


In [15]:
# from the list of the most common sub tuples, remove the ones with the most common ingredients and most common combined ingredient counts (for the least common tuples vice versa)

ingredient_frequency_theshold = 3

recipes_per_ingredient_filtered = {ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)>3}
filtered_out_uncommon_ingredients = list({ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)<=3}.keys())
recipe_ingredient_counts_filtered = {recipe_id: len(recipes) for recipe_id, recipes in recipes_per_ingredient_filtered.items()}
recipe_ingredient_counts_filtered = dict(sorted(recipe_ingredient_counts_filtered.items(), key=lambda item: item[1], reverse=True))
total_ingredient_count_filtered = len(recipe_ingredient_counts_filtered)
print(total_ingredient_count_filtered)

very_most_common_ingrs = list(recipe_ingredient_counts_filtered.keys())[:38]
print(f"Top most common ingredients: {very_most_common_ingrs}")
print(f"Filtered out uncommon ingredients: {filtered_out_uncommon_ingredients}")

3842
Top most common ingredients: ['salt', 'butter', 'onion', 'egg', 'sugar', 'garlic_clove', 'water', 'olive_oil', 'flour', 'milk', 'pepper', 'baking_powder', 'brown_sugar', 'baking_soda', 'all_purpose_flour', 'vegetable_oil', 'carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley']
Filtered out uncommon ingredients: ['100%_bran', '12_inch_pizza_crust', '2%_buttermilk', '2%_evaporated_milk', '2%_mozzarella_cheese', '6_inch_corn_tortilla', '85%_lean_ground_beef', '8_inch_flour_tortilla', '9"_pastry_pie_shell', '9"_unbaked_pie_shell', '96%_lean_ground_beef', '9_inch_baked_pie_crust', '9_inch_graham_cracker_crust', 'a.1._original_sauce', 'abalone', 'absinthe', 'absolut_citron_vodka', 'achiote', 'achiote_oil', 'achiote_paste', 'achiote_powder', 'acor

In [16]:
print("Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) \n")
print(len(ground_truth_tuple_counts.keys()), "\n")

trivial_subs = []
def checkTrivialSubs(sub_tuple):
    ingr_1, ingr_2 = sub_tuple
    if ingr_1 in ingr_2 or ingr_2 in ingr_1:
        trivial_subs.append(sub_tuple)
        return True
    else:
        return False

unimportant_subs = []
def checkUnimportantSource(sub_tuple, very_most_common_ingrs):
    ingr_1, ingr_2 = sub_tuple
    for very_common_ingr in very_most_common_ingrs:
        if very_common_ingr in ingr_1:
            unimportant_subs.append(sub_tuple)
            return True
        else:
            return False

esoteric_subs = []
def check_esotericSource(sub_tuple, filtered_out_uncommon_ingredients):
    ingr_1, ingr_2 = sub_tuple
    for esoteric_ingr in filtered_out_uncommon_ingredients:
        if esoteric_ingr in ingr_1:
            esoteric_subs.append(sub_tuple)
            return True
        else:
            return False

### we could also remove subst that partially overlap, but that potentially removes many interesting substitutions (like "beef roast - venison roast" maybe) but we could try

ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkTrivialSubs(gt_tuple)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkUnimportantSource(gt_tuple, very_most_common_ingrs)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not check_esotericSource(gt_tuple, filtered_out_uncommon_ingredients)}

print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in pasta_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in flour_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in rice_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in bread_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in potato_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in oil_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in spices_matched_ingredients}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in spices_matched_ingredients} # throw away spicy tail
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in misc_matched_ingredients}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in misc_matched_ingredients} # throw away miscelanious tail
print(len(ground_truth_tuple_counts), "\n")

total_nr_sub_tuples = len(ground_truth_tuple_counts.keys())
print(f"Total number of remainingsubstitution tuples: {total_nr_sub_tuples}")
# print(f"nr of trivial substitutions {len(trivial_subs)}")
# print(f"nr of unimportant substitutions {len(unimportant_subs)}")

most_common_sub_tuples = list(ground_truth_tuple_counts.keys())[:total_nr_sub_tuples // 4]
least_common_sub_tuples = list(ground_truth_tuple_counts.keys())[total_nr_sub_tuples // 4 * 3:]
# print(len(most_common_sub_tuples))
# print(len(least_common_sub_tuples))

Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) 

30832 

29410 

29090 

28393 



27996 

27391 

27044 

26270 

24642 

21115 

Total number of remainingsubstitution tuples: 21115


## get most and least common ingredients

In [17]:


most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[:total_ingredient_count_filtered // 4]
least_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[total_ingredient_count_filtered // 4 * 3:]

# make upper ban list until index 16
most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[16:total_ingredient_count_filtered // 4]
# limit occurence of lower edge ingredients to being required to occur in 10(?) recipes?
print(f"The most common remaining ingredients: {most_common_ingredients}")
print("\n")
print(f"The least common remaining ingredients{least_common_ingredients}")

The most common remaining ingredients: ['carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley', 'paprika', 'cornstarch', 'ground_beef', 'chili_powder', 'worcestershire_sauce', 'unsalted_butter', 'fresh_ground_black_pepper', 'walnut', 'parsley', 'ground_cinnamon', 'nutmeg', 'cayenne_pepper', 'lemon', 'margarine', 'bacon', 'extra_virgin_olive_oil', 'green_pepper', 'red_onion', 'ground_cumin', 'cheddar_cheese', 'granulated_sugar', 'bay_leaf', 'ground_black_pepper', 'raisin', 'pecan', 'boneless_skinless_chicken_breast', 'tomato_sauce', 'heavy_cream', 'banana', 'red_bell_pepper', 'zucchini', 'dried_oregano', 'cumin', 'mozzarella_cheese', 'canola_oil', 'dijon_mustard', 'kosher_salt', 'mushroom', 'tomato_paste', 'diced_tomato', 'chicken_stock', 'dry_par

In [18]:
# print(len(most_common_sub_tuples))
top_tuples_top_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in most_common_ingredients]
top_tuples_bot_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in least_common_ingredients]

bot_tuples_top_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in most_common_ingredients]
bot_tuples_bot_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in least_common_ingredients]

print(f"top tuples - top sources {len(top_tuples_top_sources)}")
print(f"top tuples - bot sources {len(top_tuples_bot_sources)}")
print(f"bot tuples - tot sources {len(bot_tuples_top_sources)}")
print(f"bot tuples - bot sources {len(bot_tuples_bot_sources)}")

top tuples - top sources 4187
top tuples - bot sources 46
bot tuples - tot sources 3261
bot tuples - bot sources 208


In [19]:
bot_tuples_bot_sources[:5]

[('bacon_piece', 'bacon_bit'),
 ('passata', 'diced_tomato'),
 ('veg_all', 'carrot'),
 ('hot_pork_sausage', 'turkey_sausage'),
 ('sweet_baking_chocolate', 'canola_oil')]

In [20]:
bot_tuples_top_sources[:5]

[('crabmeat', 'baked_tofu'),
 ('dry_sherry', 'cooking_wine'),
 ('cucumber', 'brine'),
 ('raisin', 'coconut_flake'),
 ('semi_sweet_chocolate_chip', 'pumpkin_pie_spice')]

In [21]:
top_tuples_bot_sources[:5]

[('plum_jam', 'apricot_jam'),
 ('vegemite', 'marmite'),
 ('pepsi', 'coke'),
 ('lean_ground_chicken', 'ground_turkey'),
 ('yellow_hominy', 'corn')]

In [22]:
recipe_ingredient_pair = ('hot_pork_sausage', 'turkey_sausage')
recipe_ids = recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')]
print(recipe_ids)

[]


In [23]:
print(recipes_per_ingredient_pairs[('1%_fat_buttermilk', '1%_fat_cottage_cheese')])
print(recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')])
print(recipes_per_ingredient_pairs[('cooked_ham', 'pork')])
# print(recipes_per_ingredient_pairs[('pork', 'cooked_ham')])
# I should check that... if, for a recipe, there are gt subst for which not both ingredients are contained in the recipe ingredient set, then this must be filtered (and can also help maybe understanding some of the performance of the gismo model)

['d7c2a81e73', '8886e63259']
[]
[]


# Get additional Recipes with nutritional information

In [24]:
with open("C:/UM/Master/FoodRecommendations/datasources/Recipe1M/recipes_with_nutritional_info.json", 'r') as recipes_original_with_nutr_info_path:
    recipes_original_with_nutritional_info = json.load(recipes_original_with_nutr_info_path)

recipes_orig_w_nutr_info_dict = {recipe["id"]: recipe for recipe in recipes_original_with_nutritional_info}

for recipe_id, recipe in recipes_orig_w_nutr_info_dict.items():
    recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"] = [ingr["text"] for ingr in recipe["ingredients"]]


def add_most_frequent_ingredient_info_to_orig_w_nutr_info(orig_recipes_w_nutr):
    for recipe_id, recipe in orig_recipes_w_nutr.items():
        ingredients_n_weight = {ingredient_name: ingredient_weight for ingredient_name, ingredient_weight in list(zip(recipe["ingredients"], recipe["weight_per_ingr"]))}
        ingredients_n_weight = dict(sorted(ingredients_n_weight.items(), key=lambda item: item[1], reverse=True))
        orig_recipes_w_nutr[recipe_id]["weight_per_ingr"] = ingredients_n_weight
        # then sort is
    return orig_recipes_w_nutr

recipes_orig_w_nutr_info_dict = add_most_frequent_ingredient_info_to_orig_w_nutr_info(recipes_orig_w_nutr_info_dict)

for i in range(5):
    print(list(recipes_orig_w_nutr_info_dict.items())[i])

('000095fc1d', {'fsa_lights_per100g': {'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}, 'id': '000095fc1d', 'ingredients': ['yogurt, greek, plain, nonfat', 'strawberries, raw', 'cereals ready-to-eat, granola, homemade'], 'instructions': [{'text': 'Layer all ingredients in a serving dish.'}], 'nutr_per_ingredient': [{'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}, {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}, {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}], 'nutr_values_per100g': {'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}, 'partition': 'train', 'quantity': [{'text': '8'}, {'text': '1'}, {'text': '1/4'}], 'title': 'Yogurt Parfaits', 'unit': [{'text': 'ounce'}, {'text': 'cup'},

In [25]:

matching_recipe_ids_from_gismo_and_nutriinfo = list(set(recipes_extended_dict.keys()) & set(recipes_orig_w_nutr_info_dict.keys()))

In [26]:
print(len(matching_recipe_ids_from_gismo_and_nutriinfo))

2498


## Verify that matching recipes are indeed the same

In [27]:
recipe_id = matching_recipe_ids_from_gismo_and_nutriinfo[273]
print(f"{recipes_orig_w_nutr_info_dict[recipe_id]['ingredients']}\n")
print(f"{recipes_extended_dict[recipe_id]}")

['oil, canola', 'cream, sour, cultured', 'cream, fluid, heavy whipping', 'vinegar, cider', 'salt, table', 'spices, garlic powder']

{'id': '6895cdce96', 'ingredients': [['canola_oil', 'polyunsaturated_oil'], ['sour_cream', 'soy_sour_cream'], ['heavy_cream'], ['cider_vinegar', 'fruit_vinegar'], ['salt', 'vegetable_salt', 'low_sodium_salt'], ['garlic', 'garlic_sprouts']], 'subs': ['garlic', 'garlic_paste'], 'instructions': ['Mix all together in jar with tight fitting lid.', 'Shake until mixed well.', 'Chill.'], 'title': 'Creamy Garlic Dressing', 'original_ingredients': ['1 cup canola oil', '12 cup sour cream', '14 cup heavy cream', '14 cup cider vinegar', '1 teaspoon salt', '2 tablespoons minced garlic']}


## Build map between gismo/flavorgraph ingredients and recipe1m with nutri info ingredients

Actually the recipes lists seem to be in the same order in both sources, but I wanted to try how good the matching via these other techniques is

In [28]:
def get_gizmo_name_from_reipce1mnutri_ingredient(ingredient_name, recipe_gismo, recipe_1mnutri):
    for gismo_ingr_name, recipe1mnutri_ingr_name in zip(recipe_gismo["ingredients"], recipe_1mnutri["ingredients"]):
        if recipe1mnutri_ingr_name == ingredient_name:
            return gismo_ingr_name
    raise Exception("ingredient could not be matched for {ingredient_name} in recipe {recipe['id']}")

def get_relative_weight_rank_of_gt_ingredient(gt_ingredient_gismo, recipe_gismo, recipe_1mnutri):
    ingredient_nutri = None
    for _ingr_gismo, _ingr_1mnutri in zip(recipe_gismo["ingredients"], recipe_1mnutri["ingredients"]):
        if isinstance(_ingr_gismo, list):
            for _ingr_var_gismo in _ingr_gismo:
                if _ingr_var_gismo == gt_ingredient_gismo:
                    ingredient_nutri = _ingr_1mnutri
        else:
            if _ingr_gismo == gt_ingredient_gismo:
                ingredient_nutri = _ingr_1mnutri
    if ingredient_nutri is None:
        return -1

    for rank, _ingr_name in enumerate(list(recipe_1mnutri["weight_per_ingr"].keys())):
        if ingredient_nutri == _ingr_name:
            return rank / len(recipe_1mnutri["weight_per_ingr"].keys())


# if relative rank <= 0.25:
#  add the gismo recipe and the gt sample to the list of candidate recipe substitutions


In [85]:
candidate_substitution_recipes_from_1mnutri = {} #key = substitution pair, value = recipe

in_bounds_counter = 0
out_of_bounds_counter = 0
for recipe_id in matching_recipe_ids_from_gismo_and_nutriinfo:
    recipe_gismo = recipes_extended_dict[recipe_id]
    recipe_1mnutri = recipes_orig_w_nutr_info_dict[recipe_id]
    gt_subs = recipe_gismo["subs"]

    if isinstance(gt_subs[0], list):
        for gt_sub in gt_subs:
            source_ingr = gt_sub[0]
            source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient(source_ingr, recipe_gismo, recipe_1mnutri)
            if source_relative_weight_rank <= 0.1:
                if not gt_sub in list(candidate_substitution_recipes_from_1mnutri.keys()):
                    candidate_substitution_recipes_from_1mnutri[gt_sub] = []
                candidate_substitution_recipes_from_1mnutri[gt_sub].append(recipe_gismo)
                in_bounds_counter += 1
            else:
                out_of_bounds_counter += 1
    else:
        gt_sub = gt_subs
        source_ingr = gt_sub[0]
        target_ingr = gt_sub[1]
        sub_tuple = (source_ingr, target_ingr)
        source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient(source_ingr, recipe_gismo, recipe_1mnutri)
        if source_relative_weight_rank <= 0.1:
            if not sub_tuple in list(candidate_substitution_recipes_from_1mnutri.keys()):
                candidate_substitution_recipes_from_1mnutri[sub_tuple] = []
            candidate_substitution_recipes_from_1mnutri[sub_tuple].append(recipe_gismo)
            in_bounds_counter += 1
        else:
            out_of_bounds_counter += 1

print(in_bounds_counter)
print(out_of_bounds_counter)

    # for gt_sub in gt_subs:
    #     source_ingr = gt_sub[0]
    #     source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient(source_ingr, recipe_gismo, recipe_1mnutri)
    #     if source_relative_weight_rank <= 0.25:
    #         if not gt_sub in list(candidate_substitution_recipes_from_1mnutri.keys()):
    #             candidate_substitution_recipes_from_1mnutri[gt_sub] = []
    #         candidate_substitution_recipes_from_1mnutri[gt_sub].append(recipe_gismo))

574
1924


In [88]:

## filter the remaining recipes with nutri info in which the gt source plays a majo role
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not checkIngredientContainsBlacklistPhrase(pair[0], BLACKLIST_PHRASES)}

## and filter out the very most common ingredients from the gismo dataset
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not pair[0] in very_most_common_ingrs}

print(f"number of substitution pairs for recipes in which the source is a major component: {len(candidate_substitution_recipes_from_1mnutri)}")
print(f"substitution pairs {list(candidate_substitution_recipes_from_1mnutri.keys())}")

number of substitution pairs for recipes in which the source is a major component: 184
substitution pairs [('tomato_paste', 'ketchup'), ('vegetable_stock', 'water'), ('vodka', 'white_wine'), ('cocoa', 'syrup'), ('rib', 'pork'), ('breakfast_cereal', 'graham_cracker'), ('peanut', 'walnut'), ('baby_carrot', 'dark_muscovado_sugar'), ('dried_cranberry', 'apricot'), ('blue_cheese', 'vinaigrette'), ('dried_cranberry', 'raisin'), ('sweet_gherkin', 'sweet_pickle_relish'), ('light_margarine', 'butter'), ('pork_roast', 'pork_loin_roast'), ('nonfat_plain_yogurt', 'buttermilk'), ('crushed_pineapple', 'fruit'), ('white_wine_vinegar', 'balsamic_vinegar'), ('country_style_pork_rib', 'marinade'), ('lemon_yogurt', 'vanilla_yogurt'), ('lean_ground_beef', 'ground_turkey'), ('turkey_sausage', 'turkey_pepperoni'), ('cornstarch', 'arrowroot'), ('slivered_almond', 'onion'), ('mango_chutney', 'peach_chutney'), ('unsweetened_applesauce', 'pumpkin'), ('raspberry_vinegar', 'red_wine_vinegar'), ('walnut', 'ground_

In [87]:
# try filtering out rare ingredients
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not pair[0] in least_common_ingredients}

print(f"number of substitution pairs for recipes in which the source is a major component: {len(candidate_substitution_recipes_from_1mnutri)}")
print(f"substitution pairs {list(candidate_substitution_recipes_from_1mnutri.keys())}")

number of substitution pairs for recipes in which the source is a major component: 184
substitution pairs [('tomato_paste', 'ketchup'), ('vegetable_stock', 'water'), ('vodka', 'white_wine'), ('cocoa', 'syrup'), ('rib', 'pork'), ('breakfast_cereal', 'graham_cracker'), ('peanut', 'walnut'), ('baby_carrot', 'dark_muscovado_sugar'), ('dried_cranberry', 'apricot'), ('blue_cheese', 'vinaigrette'), ('dried_cranberry', 'raisin'), ('sweet_gherkin', 'sweet_pickle_relish'), ('light_margarine', 'butter'), ('pork_roast', 'pork_loin_roast'), ('nonfat_plain_yogurt', 'buttermilk'), ('crushed_pineapple', 'fruit'), ('white_wine_vinegar', 'balsamic_vinegar'), ('country_style_pork_rib', 'marinade'), ('lemon_yogurt', 'vanilla_yogurt'), ('lean_ground_beef', 'ground_turkey'), ('turkey_sausage', 'turkey_pepperoni'), ('cornstarch', 'arrowroot'), ('slivered_almond', 'onion'), ('mango_chutney', 'peach_chutney'), ('unsweetened_applesauce', 'pumpkin'), ('raspberry_vinegar', 'red_wine_vinegar'), ('walnut', 'ground_

## Inspecting some of these recipes:

In [96]:
# good are [0, 122, 183]
# bad is [12, 144]
test_idx = 144
test_sub_pair = list(candidate_substitution_recipes_from_1mnutri.keys())[test_idx]
print(f"test sub pair: {test_sub_pair}")
test_recipe = candidate_substitution_recipes_from_1mnutri[test_sub_pair][0]
print(f"test recipe id {test_recipe}")

test sub pair: ('liquid_honey', 'maple_syrup')
test recipe id {'id': '4452e199a9', 'ingredients': [['peach', 'peaches', 'fresh_peach'], ['lemon_juice'], ['liquid_honey'], ['all_purpose_flour', 'unbleached_all_purpose_flour'], ['rolled_oat', 'rolled_oats'], ['brown_sugar'], ['cinnamon'], ['nutmeg'], ['butter', 'molly_mcbutter']], 'subs': ['liquid_honey', 'maple_syrup'], 'instructions': ['Place peaches in a 9 inch square pan.', 'Sprinkle with lemon juice and then drizzle with honey.', 'In medium bowl, mix together remaining ingredients using a pastry blender or fork.', 'Sprinkle mixture evenly over peaches.', 'Bake in preheated 375 degree oven for 35-40 minutes or until top is golden brown.'], 'title': 'Peach Crisp', 'original_ingredients': ['6 cups peaches, peeled and sliced (or canned peaches can be used)', '1 tablespoon lemon juice', '14 cup liquid honey', '1 cup all-purpose flour', '1 cup rolled oats', '13 cup firmly packed brown sugar', '12 teaspoon cinnamon', '14 teaspoon nutmeg', 

# Prepare the rest of the GISMO recipes to get weight, then calculate the most fitting recipes accordingly

# Some more ingredient matching functions (Should not be needed due to each ingredient lists ordering)

In [30]:
import difflib

gismo_to_recipe1mnutri = {}
recipe1mnutri_to_gismo = {}

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ing = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}
for item1 in ingredients1:
    best_match = difflib.get_close_matches(item1, ingredients2, n=1, cutoff=0.1)
    if best_match:
        matching_items[item1] = best_match[0]

    # for item2 in recipes_extended_dict[recipe_id]["ingredients"]:
    #     if isinstance(item2, list):
    #         item2 = item2[0]
    #     item2 = item2.replace('_', ' ')
    #     best_match = difflib.get_close_matches(item1, item2, n=1, cutoff=0.1)
    #     if best_match:
    #         matching_items[item1] = best_match[0]

for item, match in matching_items.items():
    print(f"{item} matches {match}")

brussels sprouts, raw matches fresh_brussels_sprout
vinegar, red wine matches sherry_wine_vinegar
mustard, prepared, yellow matches mustard
syrup, maple, canadian matches maple_syrup
oil, olive, salad or cooking matches sherry_wine_vinegar
spices, nutmeg, ground matches fresh_nutmeg
nuts, walnuts, english matches walnut_oil


In [31]:
import spacy
import datetime

# timestamp1 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp1}")
nlp = spacy.load("en_core_web_sm")

# timestamp2 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp2}")


ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ing = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}

for item1 in ingredients1:

    item1_tokens = set(token.text for token in nlp(item1))

    # timestamp3 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # print(f"timestamp1 {timestamp3}")

    for item2 in ingredients2:
        item2_tokens = set(token for token in item2)
        common_tokens = item1_tokens.intersection(item2_tokens)
        if common_tokens:
            matching_items[item1] = item2
            break

# timestamp4 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp4}")

for item, match in matching_items.items():
    print(f"{item} matches {match}")

In [32]:
from fuzzywuzzy import fuzz, process

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ingr = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items_list1 = {}
matching_items_list2 = {}

# Calculate all similarity scores and store the best match for each item in both lists
for item1 in ingredients1:
    best_match, best_score = None, 0

    for item2 in ingredients2:
        score = fuzz.token_sort_ratio(item1, item2)

        if score > best_score:
            best_match = item2
            best_score = score

    if best_score >= 10:  # You can adjust the threshold as needed
        matching_items_list1[item1] = best_match
        matching_items_list2[best_match] = item1

# Print the matched items for list1
print("Matches for list1:")
for item1, item2 in matching_items_list1.items():
    print(f"{item1} matches {item2}")

# Print the matched items for list2
print("\nMatches for list2:")
for item2, item1 in matching_items_list2.items():
    print(f"{item2} matches {item1}")

Matches for list1:
brussels sprouts, raw matches fresh brussels sprout
vinegar, red wine matches sherry wine vinegar
mustard, prepared, yellow matches mustard
syrup, maple, canadian matches maple syrup
oil, olive, salad or cooking matches walnut oil
spices, nutmeg, ground matches fresh nutmeg
nuts, walnuts, english matches walnut oil

Matches for list2:
fresh brussels sprout matches brussels sprouts, raw
sherry wine vinegar matches vinegar, red wine
mustard matches mustard, prepared, yellow
maple syrup matches syrup, maple, canadian
walnut oil matches nuts, walnuts, english
fresh nutmeg matches spices, nutmeg, ground


In [33]:
# try matching based only over word overlap

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ingr = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}

# Calculate all similarity scores and store the best match for each item in both lists
for item1 in ingredients1:
    best_match, best_score = None, 0

    for item2 in ingredients2:
        words = item2.split(" ")
        for word in words:
            if word in item1:
                #increase counter
        # add the match which the highest counter


SyntaxError: incomplete input (3765032402.py, line 23)