In [2]:
import os
import pickle
import pandas as pd
import json
import pprint

from calc_recipe_ingredient_info_distances import collectSomeRecipeRecommendations, get_all_comments, get_all_mutual_info, evalRecommendations, getRecommendationsBasedOnMutualInformationRole, get_graph_nodes, get_all_gt_recipes, get_recipes_per_ingredient, get_recipes_per_ingredient_pairs, get_all_frequencies,getNaiveBayesRecommendations

from recipe_clustering_helpers import (
    create_one_hot_ingredients_per_df, getRecipeIdsForSubTuples, get_hamming_distances
)

In [3]:
ORDERED_RECIPE_IDS_PATH = os.path.abspath("./outputs/sorted_recipe_ids_list.pkl")
TRAIN_COMMENTS_PATH = os.path.abspath("./inputs/train_comments_subs.pkl") # train recipes with substitutions
TEST_COMMENTS_PATH = os.path.abspath("./inputs/test_comments_subs.pkl") # test recipes with substitutions
VAL_COMMENTS_PATH = os.path.abspath("./inputs/val_comments_subs.pkl") # validation recipes with substitutions
GRAPH_NODES_PATH = os.path.abspath("./inputs/graph/nodes_191120.csv")

MUTUAL_INFO_DICT_PATH = os.path.abspath("./outputs/mutual_info_dict_with_self_info.pkl")
RECIPES_PER_INGREDIENT_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_small.pkl"
)
RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_pairs_small.pkl"
)
PROCESSED_RECIPES_PATH = os.path.abspath("./outputs/processed_recipes.pkl")
PATH_ONE_HOT_RECIPE_INGREDIENTS = os.path.abspath("./outputs/one_hot_recipe_ingredients.pkl")

EXTENDED_RECIPES_PATH = os.path.abspath("./inputs/extended_recipes_with_instructions_and_titles.json")



In [4]:

if os.path.isfile(ORDERED_RECIPE_IDS_PATH):
    with open(ORDERED_RECIPE_IDS_PATH, "rb") as file:
        ordered_recipe_ids = pickle.load(file)
recipe_ids_with_ranks = ordered_recipe_ids
ordered_recipe_ids = [recipe[1] for recipe in recipe_ids_with_ranks]

# recipes with comments, lists, names, and gt_truths
with open(EXTENDED_RECIPES_PATH, 'r') as recipe_extended_with_original_info:
    extended_recipes = json.load(recipe_extended_with_original_info)
# extended_recipes = get_all_comments(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH, VAL_COMMENTS_PATH, EXTENDED_RECIPES_PATH)
recipes_extended_dict = {recipe["id"]:recipe for recipe in extended_recipes}


# recipe ingredient df one hot
# INGREDIENT_RECIPE_MATRIX_PATH = os.path.abspath("./outputs/ingredient_recipe_matrix.pkl")
# recipe_ingredient_df = get_recipe_ingredient_df( ingredients, recipes, recipes_per_ingredient, INGREDIENT_RECIPE_MATRIX_PATH)
# recipe_ingredient_df_bool = recipe_ingredient_df.astype(bool)

with open(MUTUAL_INFO_DICT_PATH, "rb") as mutual_info_dict_file:
            mutual_info_dict = pickle.load(mutual_info_dict_file)

ingredients = get_graph_nodes(GRAPH_NODES_PATH)

recipes = get_all_gt_recipes(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH,
                                VAL_COMMENTS_PATH, PROCESSED_RECIPES_PATH)
recipes_per_ingredient = get_recipes_per_ingredient(
    ingredients, recipes, RECIPES_PER_INGREDIENT_SMALL_PATH)

recipes_per_ingredient_pairs = get_recipes_per_ingredient_pairs(
    recipes_per_ingredient, RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH)
recipe_ingredient_counts, recipe_ingredient_pair_counts = get_all_frequencies(recipes_per_ingredient,
                                        recipes_per_ingredient_pairs)


# Checking user comments GT substitution pairs

## Which GT tuple ingredients are not represented as nodes?

In [5]:
all_gt_substitution_tuples = [recipe['subs'] for recipe in extended_recipes]
all_gt_substitution_tuples = sorted(all_gt_substitution_tuples, key=lambda x: x[0])
all_gt_substitution_tuples
print(f"number of GT substitution tuples from user comments: {len(all_gt_substitution_tuples)}")
ingredients_that_are_not_nodes = []
for sub_tuple in all_gt_substitution_tuples:
    if sub_tuple[0] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[0])
    if sub_tuple[1] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[1])
print(f"number of ingredients that are not nodes: {len(ingredients_that_are_not_nodes)}")
print(f"list of ingredients that are not nodes: {list(set(ingredients_that_are_not_nodes))}")

number of GT substitution tuples from user comments: 70520
number of ingredients that are not nodes: 2583
list of ingredients that are not nodes: ['chocolate_sandwich_style_cookies', 'chickpeas', 'pancakes', 'contadina_tomato_sauce', 'unbleached_flour', 'instant_chocolate_fudge_pudding', 'lemon_salt', 'multigrain_cereal', 'onion_dip_mix', 'saffron_rice_mix', 'fresh_bay_leaves', 'raclette_cheese', 'crushed_pineapple_in_juice', 'sugar_free_apricot_preserves', 'bittersweet_dark_chocolate', 'cinnamon_ice_cream', 'frozen_squash', 'frozen_broccoli', 'coarse_sea_salt', 'manioc_flour', 'cherry_preserves', 'coarse_salt', 'powdered_turmeric', 'celery_seeds', 'canned_pink_salmon', 'spinach_leaves', 'mint_syrup', 'green_onions', 'lingonberry_preserves', 'thyme_leaves', 'dry_crushed_red_pepper', 'canned_whole_cranberry_sauce', 'frozen_concentrated_lemonade', 'reduced_fat_mayonnaise', 'dried_mexican_oregano', 'amaretti_cookie', 'vegetable_salt', 'pecans', 'soy_sausage', 'castor_oil', 'coffee_syrup',

## How often do ground truth substitution tuples appear in recipes?

In [6]:
ground_truth_tuple_recipes = getRecipeIdsForSubTuples(extended_recipes)
ground_truth_tuple_counts = {gt_sub: len(recipe_list) for gt_sub, recipe_list in ground_truth_tuple_recipes.items()}
ground_truth_tuple_counts = dict(sorted(ground_truth_tuple_counts.items(), key=lambda item: item[1], reverse=True))


In [7]:
print(list(ground_truth_tuple_counts.items())[:5])
print(list(ground_truth_tuple_counts.items())[-5:])
#('ground_beef', 'ground_turkey'): 253,
#('vegetable_oil', 'applesauce'): 240,
#('heavy_cream', 'milk'): 219,
#('plain_yogurt', 'sour_cream'): 168,
#('butter', 'applesauce'): 133, # could be interesting because it will not always work
#('parsley', 'cilantro'): 132, # this too
#('salt', 'garlic_salt'): 115,

#('beef', 'boneless_skinless_chicken_breast'): 8,
#('beer', 'beef_broth'): 8,
#('frozen_spinach', 'broccoli'): 8,
#('red_wine', 'red_wine_vinegar'): 8,
#('lean_ground_beef', 'veggie_crumbles'): 8,
#('chicken', 'ground_turkey'): 2, # rare but should often fit, except the preparation could make it different?
#('carrot', 'green_bell_pepper'): 8,
#('mushroom', 'red_bell_pepper'): 7,


[(('butter', 'olive_oil'), 547), (('water', 'chicken_broth'), 412), (('walnut', 'pecan'), 392), (('sugar', 'honey'), 373), (('margarine', 'butter'), 349)]
[(('dried_peppermint', 'dried_marjoram'), 1), (('unsweetened_applesauce', 'low_fat_plain_yogurt'), 1), (('vegetable_stock', 'green_bean'), 1), (('diced_tomato', 'potato'), 1), (('dried_thyme', 'mixed_herb'), 1)]


In [8]:
print(ground_truth_tuple_recipes[('egg_noodle', 'macaroni_noodle')])
print(ground_truth_tuple_counts[('egg_noodle', 'macaroni_noodle')])

['32ac082c31', 'dd43625438', '0fe9a1105b', 'beb22afa59', '99ae52fa77', '5c7aeac637', 'f472881b2e', '4f73ae2f24']
8


In [9]:
one_hot_recipe_ingredients, failed_recipes = create_one_hot_ingredients_per_df(extended_recipes, ingredients, PATH_ONE_HOT_RECIPE_INGREDIENTS)
failed_recipes

AttributeError: Can't get attribute '_unpickle_block' on <module 'pandas._libs.internals' from 'c:\\Users\\homie\\Anaconda3\\lib\\site-packages\\pandas\\_libs\\internals.cp37-win_amd64.pyd'>

In [None]:
recipe_id = "ffb1c2b5ad"
distances_hamming = get_hamming_distances(recipe_id, one_hot_recipe_ingredients)


In [None]:
distances_hamming = list(set(distances_hamming))
distances_hamming = sorted(distances_hamming, key=lambda x: x[1])
distances_hamming

[('999d8c4b0d', 7),
 ('4b6a3ba513', 7),
 ('703bafb843', 8),
 ('3abd84b687', 8),
 ('3ce8bc4f90', 8),
 ('814df4eedd', 8),
 ('45a362206d', 8),
 ('1602aa6218', 8),
 ('b23374a4b9', 8),
 ('b26281c482', 8),
 ('48dc15499a', 8),
 ('9297c2871e', 8),
 ('d500143819', 8),
 ('06f24af2ac', 8),
 ('1bfacdd84b', 9),
 ('e1015b6199', 9),
 ('74c68c90da', 9),
 ('a5118277b6', 9),
 ('a8588618df', 9),
 ('79008d067d', 9),
 ('bef14a917a', 9),
 ('a3d78df746', 9),
 ('788aaab39a', 9),
 ('5ac33cedde', 9),
 ('b4d4181e59', 9),
 ('e84d6756ee', 9),
 ('6438193959', 9),
 ('2196780e17', 9),
 ('76c50a9298', 9),
 ('af0f5ab387', 9),
 ('80c61e32a8', 9),
 ('e0702bd6b5', 9),
 ('81efe2efbd', 9),
 ('8f55c987f3', 9),
 ('669a68b20b', 9),
 ('b17964c27e', 9),
 ('4b004edfad', 9),
 ('ac88507fc0', 9),
 ('0cb46cd1b6', 9),
 ('bfa5a44e0a', 9),
 ('10c95542ba', 9),
 ('5274a67915', 9),
 ('1ed61bae3f', 9),
 ('836b5e02d8', 9),
 ('ae4f8b7915', 9),
 ('f1d1c4dd5a', 9),
 ('e76ad6f09a', 9),
 ('0f7fa9360a', 9),
 ('1ac9b1f6fa', 9),
 ('9186f9c766', 9),


In [None]:
def getRecipeFromComments(recipeId, allComments):
    recipe = None
    for commentRecipe in allComments:
        gt_ingredient_found = False
        comment_recipe_id = commentRecipe['id']
        if recipeId == comment_recipe_id:
            if recipe is None:
                recipe = {}
                recipe["id"] = comment_recipe_id
                recipe["subs"] = []
                recipe["ingredients"] = commentRecipe["ingredients"]

            if recipe is not None:
                recipe["subs"].append(commentRecipe["subs"])

    return recipe

# def getRecipeIdsForGTSub(gt_sub, extended_recipes):
#     recipe_ids = []
#     for recipe in extended_recipes:
#         recipe_id = recipe["id"]
#         subs = recipe["subs"]
#         if isinstance(subs, list) and gt_sub in subs:
#             recipe_ids.append(recipe_id)
#         elif gt_sub == subs:
#             recipe_ids.append(recipe_id)
#     return recipe_ids

In [None]:
# sample_gt_sub = ('chicken', 'ground_turkey')
# sample_recipe_ids = getRecipeIdsForGTSub(sample_gt_sub, extended_recipes)
# sample_recipe_ids

In [None]:
# r1 = getRecipeFromComments("ffb1c2b5ad", extended_recipes)
# r2 = getRecipeFromComments("999d8c4b0d", extended_recipes)
# r3 = getRecipeFromComments("4b6a3ba513", extended_recipes)

print(json.dumps(recipes_extended_dict["ffb1c2b5ad"], indent=4))

{
    "id": "ffb1c2b5ad",
    "ingredients": [
        [
            "flour"
        ],
        [
            "puff_pastry_sheet",
            "puff_pastry_sheets"
        ],
        [
            "tuna"
        ],
        [
            "onion",
            "onions",
            "round_onion"
        ],
        [
            "celery"
        ],
        [
            "mayonnaise",
            "lemon_mayonnaise",
            "wasabi_mayonnaise",
            "canola_oil_mayonnaise"
        ],
        [
            "salt_and_pepper"
        ],
        [
            "american_cheese"
        ],
        [
            "egg_wash"
        ]
    ],
    "subs": [
        "onion",
        "scallion"
    ],
    "instructions": [
        "Preheat the oven to 375F",
        "On a floured surface, roll out puff pastry sheets into 12- by 18-inch rectangles.",
        "Slide them onto a baking pan and place in the refrigerator.",
        "Let them settle about one hour, until firm.",
        "While the 

## Check for some Common side ingredients

In [None]:
PASTA = ["pasta", "noodle"]
FLOUR = ["flour"]
RICE = ["rice"]
BREAD = ["bread"]
POTATO = ["potato"]
OIL = ["oil"]
SPICES = ["salt", "sugar", "celery"] #maybe pepper? maybe mustard? but they could miss many interesting recipes but we can remove pepper for pepper for sure
MISC = ["water", "egg", "baking_powder", "baking_soda", "juice", "extract", "seed", "mix", "syrup"]

def getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, common_side_ingredients, exclude_keywords, do_remove_if_tail_matches):
    matched_ingredients = []
    for ingredient in ingredients:
        for side_ingredient in common_side_ingredients:
            if len(exclude_keywords) < 1:
                if side_ingredient in ingredient and ingredient not in matched_ingredients:
                    matched_ingredients.append(ingredient)
            else:
                for exclude_keyword in exclude_keywords:
                    if side_ingredient in ingredient and ingredient not in matched_ingredients and exclude_keyword not in ingredient:
                        matched_ingredients.append(ingredient)
    return matched_ingredients

pasta_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, PASTA, ["sauce"])
flour_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, FLOUR, [])
rice_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, RICE, [])
bread_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, BREAD, [])
potato_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, POTATO, [])
oil_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, OIL, [])
spices_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, SPICES, [])
misc_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, MISC, [])


print(len(pasta_matched_ingredients))
print(len(flour_matched_ingredients))
print(len(rice_matched_ingredients))
print(len(bread_matched_ingredients))
print(len(potato_matched_ingredients))
print(len(oil_matched_ingredients))
print(len(spices_matched_ingredients))
print(len(misc_matched_ingredients))

106
95
106
128
64
105
198
317


# Generating Recipe Set
1. get extended recipes
1. Filter GT samples according to frequency of pairs, ingredients, baseline model prediction quality
1. Fitler recipes based on if the gt source is in the recipe (? This is more relevant for the extension of the recipe)
1. Filter based on trivial substitutions (but how?)
1. Filter based on assumed importance of the ingredient to the recipe (but how?)
1. Filter according to additional features, like substituting one food class with another or with the same food class


In [None]:
# from the list of the most common sub tuples, remove the ones with the most common ingredients and most common combined ingredient counts (for the least common tuples vice versa)

ingredient_frequency_theshold = 3

recipes_per_ingredient_filtered = {ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)>3}
filtered_out_uncommon_ingredients = list({ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)<=3}.keys())
recipe_ingredient_counts_filtered = {recipe_id: len(recipes) for recipe_id, recipes in recipes_per_ingredient_filtered.items()}
recipe_ingredient_counts_filtered = dict(sorted(recipe_ingredient_counts_filtered.items(), key=lambda item: item[1], reverse=True))
total_ingredient_count_filtered = len(recipe_ingredient_counts_filtered)
print(total_ingredient_count_filtered)

very_most_common_ingrs = list(recipe_ingredient_counts_filtered.keys())[:38]
print(f"Top most common ingredients: {very_most_common_ingrs}")
print(f"Filtered out uncommon ingredients: {filtered_out_uncommon_ingredients}")

3842
Top most common ingredients: ['salt', 'butter', 'onion', 'egg', 'sugar', 'garlic_clove', 'water', 'olive_oil', 'flour', 'milk', 'pepper', 'baking_powder', 'brown_sugar', 'baking_soda', 'all_purpose_flour', 'vegetable_oil', 'carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley']
Filtered out uncommon ingredients: ['100%_bran', '12_inch_pizza_crust', '2%_buttermilk', '2%_evaporated_milk', '2%_mozzarella_cheese', '6_inch_corn_tortilla', '85%_lean_ground_beef', '8_inch_flour_tortilla', '9"_pastry_pie_shell', '9"_unbaked_pie_shell', '96%_lean_ground_beef', '9_inch_baked_pie_crust', '9_inch_graham_cracker_crust', 'a.1._original_sauce', 'abalone', 'absinthe', 'absolut_citron_vodka', 'achiote', 'achiote_oil', 'achiote_paste', 'achiote_powder', 'acor

In [None]:
print("Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) \n")
print(len(ground_truth_tuple_counts.keys()), "\n")

trivial_subs = []
def checkTrivialSubs(sub_tuple):
    ingr_1, ingr_2 = sub_tuple
    if ingr_1 in ingr_2 or ingr_2 in ingr_1:
        trivial_subs.append(sub_tuple)
        return True
    else:
        return False

unimportant_subs = []
def checkUnimportantSource(sub_tuple, very_most_common_ingrs):
    ingr_1, ingr_2 = sub_tuple
    for very_common_ingr in very_most_common_ingrs:
        if very_common_ingr in ingr_1:
            unimportant_subs.append(sub_tuple)
            return True
        else:
            return False

esoteric_subs = []
def check_esotericSource(sub_tuple, filtered_out_uncommon_ingredients):
    ingr_1, ingr_2 = sub_tuple
    for esoteric_ingr in filtered_out_uncommon_ingredients:
        if esoteric_ingr in ingr_1:
            esoteric_subs.append(sub_tuple)
            return True
        else:
            return False

### we could also remove subst that partially overlap, but that potentially removes many interesting substitutions (like "beef roast - venison roast" maybe) but we could try

ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkTrivialSubs(gt_tuple)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkUnimportantSource(gt_tuple, very_most_common_ingrs)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not check_esotericSource(gt_tuple, filtered_out_uncommon_ingredients)}

print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in pasta_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in flour_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in rice_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in bread_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in potato_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in oil_matched_ingredients}
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in spices_matched_ingredients}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in spices_matched_ingredients} # throw away spicy tail
print(len(ground_truth_tuple_counts), "\n")
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in misc_matched_ingredients}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in misc_matched_ingredients} # throw away miscelanious tail
print(len(ground_truth_tuple_counts), "\n")

total_nr_sub_tuples = len(ground_truth_tuple_counts.keys())
print(f"Total number of remainingsubstitution tuples: {total_nr_sub_tuples}")
# print(f"nr of trivial substitutions {len(trivial_subs)}")
# print(f"nr of unimportant substitutions {len(unimportant_subs)}")

most_common_sub_tuples = list(ground_truth_tuple_counts.keys())[:total_nr_sub_tuples // 4]
least_common_sub_tuples = list(ground_truth_tuple_counts.keys())[total_nr_sub_tuples // 4 * 3:]
# print(len(most_common_sub_tuples))
# print(len(least_common_sub_tuples))

Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) 

21853 

21853 

21853 

21853 

21853 

21853 

21853 

21853 

21853 

21853 

Total number of remainingsubstitution tuples: 21853


In [None]:


most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[:total_ingredient_count_filtered // 4]
least_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[total_ingredient_count_filtered // 4 * 3:]

# make upper ban list until index 16
most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[16:total_ingredient_count_filtered // 4]
# limit occurence of lower edge ingredients to being required to occur in 10(?) recipes?
print(f"The most common remaining ingredients: {most_common_ingredients}")
print("\n")
print(f"The least common remaining ingredients{least_common_ingredients}")

The most common remaining ingredients: ['carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley', 'paprika', 'cornstarch', 'ground_beef', 'chili_powder', 'worcestershire_sauce', 'unsalted_butter', 'fresh_ground_black_pepper', 'walnut', 'parsley', 'ground_cinnamon', 'nutmeg', 'cayenne_pepper', 'lemon', 'margarine', 'bacon', 'extra_virgin_olive_oil', 'green_pepper', 'red_onion', 'ground_cumin', 'cheddar_cheese', 'granulated_sugar', 'bay_leaf', 'ground_black_pepper', 'raisin', 'pecan', 'boneless_skinless_chicken_breast', 'tomato_sauce', 'heavy_cream', 'banana', 'red_bell_pepper', 'zucchini', 'dried_oregano', 'cumin', 'mozzarella_cheese', 'canola_oil', 'dijon_mustard', 'kosher_salt', 'mushroom', 'tomato_paste', 'diced_tomato', 'chicken_stock', 'dry_par

In [None]:
# print(len(most_common_sub_tuples))
top_tuples_top_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in most_common_ingredients]
top_tuples_bot_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in least_common_ingredients]

bot_tuples_top_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in most_common_ingredients]
bot_tuples_bot_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in least_common_ingredients]

print(f"top tuples - top sources {len(top_tuples_top_sources)}")
print(f"top tuples - bot sources {len(top_tuples_bot_sources)}")
print(f"bot tuples - tot sources {len(bot_tuples_top_sources)}")
print(f"bot tuples - bot sources {len(bot_tuples_bot_sources)}")

top tuples - top sources 4320
top tuples - bot sources 54
bot tuples - tot sources 3351
bot tuples - bot sources 219


In [None]:
bot_tuples_bot_sources

[('margarita_mix', 'water'),
 ('bacon_piece', 'bacon_bit'),
 ('passata', 'diced_tomato'),
 ('veg_all', 'carrot'),
 ('hot_pork_sausage', 'turkey_sausage'),
 ('sweet_baking_chocolate', 'canola_oil'),
 ('hing', 'onion_powder'),
 ('fresh_savory', 'dried_oregano'),
 ('roasted_sweet_red_pepper', 'cherry_tomato'),
 ('dried_mint_flake', 'parsley_flake'),
 ('sponge_cake', 'chocolate_chip'),
 ('pretzel_twist', 'peanut'),
 ('dried_lavender_flower', 'tea'),
 ('chicken_legs_with_thigh', 'chicken_breast'),
 ('raw_peanut', 'pine_nut'),
 ('ghirardelli_semi_sweet_chocolate_chip', 'dried_cranberry'),
 ('ghirardelli_semi_sweet_chocolate_chip', 'raisin'),
 ('green_tea_bag', 'honey'),
 ('liquid_fruit_pectin', 'pineapple_juice'),
 ('peanut_sauce', 'hoisin_sauce'),
 ('clementine', 'stock'),
 ('sausage_meat', 'turkey_meatball'),
 ('baby_rocket', 'lemon_pepper'),
 ('chicken_quarter', 'whole_chicken'),
 ('gumdrop', 'roasted_peanut'),
 ('nutmeats', 'peach'),
 ('sweet_apple', 'onion'),
 ('miniature_pretzel', 'pec

In [None]:
bot_tuples_top_sources

[('crabmeat', 'baked_tofu'),
 ('dry_sherry', 'cooking_wine'),
 ('cucumber', 'brine'),
 ('raisin', 'coconut_flake'),
 ('semi_sweet_chocolate_chip', 'pumpkin_pie_spice'),
 ('orange_marmalade', 'peach_preserves'),
 ('great_northern_bean', 'dark_red_kidney_bean'),
 ('iceberg_lettuce', 'avocado'),
 ('grape_tomato', 'mushroom_cap'),
 ('fresh_ginger', 'ginger_garlic_paste'),
 ('carrot', 'frozen_vegetable'),
 ('greek_yogurt', 'maple_syrup'),
 ('red_pepper', 'monterey_jack_cheese'),
 ('italian_seasoning', 'bay_leaf'),
 ('ground_pork', 'crab'),
 ('white_bean', 'black_bean'),
 ('taco_seasoning_mix', 'ranch_salad_dressing'),
 ('liquid_smoke', 'chili_powder'),
 ('cooked_turkey', 'ground_turkey'),
 ('chicken_broth', 'beef_bone'),
 ('zucchini', 'kale'),
 ('hot_sauce', 'chili_powder'),
 ('frozen_pea', 'vegetable'),
 ('basil_leaf', 'oregano_leaf'),
 ('yellow_cake_mix', 'flour'),
 ('fresh_jalapeno_pepper', 'poblano_pepper'),
 ('plum_tomato', 'chili'),
 ('peanut', 'shredded_coconut'),
 ('evaporated_milk'

In [None]:
top_tuples_bot_sources

[('plum_jam', 'apricot_jam'),
 ('vegemite', 'marmite'),
 ('pepsi', 'coke'),
 ('lean_ground_chicken', 'ground_turkey'),
 ('yellow_hominy', 'corn'),
 ('vegetable_soup_mix', 'leek_soup_mix'),
 ('instant_coconut_cream_pudding_mix', 'vanilla'),
 ('unsweetened_raspberry', 'strawberry'),
 ('mild_taco_sauce', 'salsa'),
 ('venison_roast', 'beef_roast'),
 ('ground_chicken_breast', 'ground_turkey_breast'),
 ('unsweetened_sliced_peach', 'pear'),
 ('dairy_free_margarine', 'coconut_oil'),
 ('frozen_cheese_ravioli', 'tortellini'),
 ('deli_corned_beef', 'pastrami'),
 ('digestive_biscuit', 'graham_cracker'),
 ('veal_stock', 'beef_stock'),
 ('fiesta_nacho_cheese_soup', 'cheddar_cheese_soup'),
 ('ground_hazelnut', 'ground_almond'),
 ('english_walnut', 'pecan'),
 ('hing', 'garlic_powder'),
 ('non_fat_vanilla_frozen_yogurt', 'vanilla_ice_cream'),
 ('turkey_tenderloin', 'chicken_tenderloin'),
 ('orange_cake_mix', 'lemon'),
 ('turkey_ham', 'bacon'),
 ('dried_mint_flake', 'oregano_flake'),
 ('chocolate_cookie

In [13]:
recipe_ingredient_pair = ('hot_pork_sausage', 'turkey_sausage')
recipe_ids = recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')]
print(recipe_ids)

[]


In [22]:
print(recipes_per_ingredient_pairs[('1%_fat_buttermilk', '1%_fat_cottage_cheese')])
print(recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')])
print(recipes_per_ingredient_pairs[('cooked_ham', 'pork')])
print(recipes_per_ingredient_pairs[('pork', 'cooked_ham')])
# I should check that... if, for a recipe, there are gt subst for which not both ingredients are contained in the recipe ingredient set, then this must be filtered (and can also help maybe understanding some of the performance of the gismo model)

['d7c2a81e73', '8886e63259']
[]
[]


KeyError: ('pork', 'cooked_ham')