In [1]:
import os
import pickle
import pandas as pd
import json
import pprint
import random

from calc_recipe_ingredient_info_distances import collectSomeRecipeRecommendations, get_all_comments, get_all_mutual_info, evalRecommendations, getRecommendationsBasedOnMutualInformationRole, get_graph_nodes, get_all_gt_recipes, get_recipes_per_ingredient, get_recipes_per_ingredient_pairs, get_all_frequencies,getNaiveBayesRecommendations

from generic_preprocessing import (make_recipes_extended_dict)

from recipe_clustering_helpers import (
    create_one_hot_ingredients_per_df, getRecipeIdsForSubTuples, get_hamming_distances
)

In [2]:
ORDERED_RECIPE_IDS_PATH = os.path.abspath("./outputs/sorted_recipe_ids_list.pkl")
TRAIN_COMMENTS_PATH = os.path.abspath("./inputs/train_comments_subs.pkl") # train recipes with substitutions
TEST_COMMENTS_PATH = os.path.abspath("./inputs/test_comments_subs.pkl") # test recipes with substitutions
VAL_COMMENTS_PATH = os.path.abspath("./inputs/val_comments_subs.pkl") # validation recipes with substitutions
GRAPH_NODES_PATH = os.path.abspath("./inputs/graph/nodes_191120.csv")

MUTUAL_INFO_DICT_PATH = os.path.abspath("./outputs/mutual_info_dict_with_self_info.pkl")
RECIPES_PER_INGREDIENT_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_small.pkl"
)
RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_pairs_small.pkl"
)
PROCESSED_RECIPES_PATH = os.path.abspath("./outputs/processed_recipes.pkl")
PATH_ONE_HOT_RECIPE_INGREDIENTS = os.path.abspath("./outputs/one_hot_recipe_ingredients.pkl")

EXTENDED_RECIPES_PATH_OLD = os.path.abspath("./inputs/extended_recipes_with_instructions_and_titles.json")

EXTENDED_RECIPES_PATH = os.path.abspath("./inputs/extended_recipes_with_quantities.json")


SURVEY_COMPLETE_SUB_TUPLE_AND_RECIPE_SET_PATH = os.path.abspath("./outputs/suvey_all_subs_and_recipes.json")

SURVEY_QUESTION_SET_A_PATH = os.path.abspath("./outputs/suvey_question_set_a.json")
SURVEY_QUESTION_SET_B_PATH = os.path.abspath("./outputs/suvey_question_set_b.json")
SURVEY_QUESTION_500_SET_PATH = os.path.abspath("./outputs/suvey_question_set_500.json")


In [3]:

if os.path.isfile(ORDERED_RECIPE_IDS_PATH):
    with open(ORDERED_RECIPE_IDS_PATH, "rb") as file:
        ordered_recipe_ids = pickle.load(file)
recipe_ids_with_ranks = ordered_recipe_ids
ordered_recipe_ids = [recipe[1] for recipe in recipe_ids_with_ranks]

# recipes with comments, lists, names, and gt_truths
with open(EXTENDED_RECIPES_PATH, 'r') as recipe_extended_with_original_info:
    extended_recipes = json.load(recipe_extended_with_original_info)

recipes_extended_dict = make_recipes_extended_dict(extended_recipes)



# recipe ingredient df one hot
# INGREDIENT_RECIPE_MATRIX_PATH = os.path.abspath("./outputs/ingredient_recipe_matrix.pkl")
# recipe_ingredient_df = get_recipe_ingredient_df( ingredients, recipes, recipes_per_ingredient, INGREDIENT_RECIPE_MATRIX_PATH)
# recipe_ingredient_df_bool = recipe_ingredient_df.astype(bool)

with open(MUTUAL_INFO_DICT_PATH, "rb") as mutual_info_dict_file:
            mutual_info_dict = pickle.load(mutual_info_dict_file)

ingredients = get_graph_nodes(GRAPH_NODES_PATH)

recipes = get_all_gt_recipes(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH,
                                VAL_COMMENTS_PATH, PROCESSED_RECIPES_PATH)
recipes_per_ingredient = get_recipes_per_ingredient(
    ingredients, recipes, RECIPES_PER_INGREDIENT_SMALL_PATH)

recipes_per_ingredient_pairs = get_recipes_per_ingredient_pairs(
    recipes_per_ingredient, RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH)
recipe_ingredient_counts, recipe_ingredient_pair_counts = get_all_frequencies(recipes_per_ingredient,
                                        recipes_per_ingredient_pairs)


In [4]:
print(len(extended_recipes))
print(len(recipes_extended_dict))

70520
44429


# Checking user comments GT substitution pairs

## Which GT tuple ingredients are not represented as nodes?

In [5]:
all_gt_substitution_tuples = [recipe['subs'] for recipe in extended_recipes]
all_gt_substitution_tuples = sorted(all_gt_substitution_tuples, key=lambda x: x[0][0])
all_gt_substitution_tuples
print(f"number of GT substitution tuples from user comments: {len(all_gt_substitution_tuples)}")
ingredients_that_are_not_nodes = []
for sub_tuple in all_gt_substitution_tuples:
    if sub_tuple[0] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[0])
    if sub_tuple[1] not in ingredients:
        ingredients_that_are_not_nodes.append(sub_tuple[1])
print(f"number of ingredients that are not nodes: {len(ingredients_that_are_not_nodes)}")
print(f"list of ingredients that are not nodes: {list(set(ingredients_that_are_not_nodes))}")

number of GT substitution tuples from user comments: 70520
number of ingredients that are not nodes: 2583
list of ingredients that are not nodes: ['ground_cilantro', 'mixed_glace_fruit', 'frozen_concentrated_lemonade', 'frozen_salad_shrimp', 'frozen_broccoli_florets', 'frozen_margarita_mix', 'salt_free_garlic_powder', 'pollock_fillet', 'coarse_ground_mustard', 'fresh_basil_leaves', 'breaded_chicken_breast', 'cream_soda', 'whole_grain_rice', 'frozen_peas_and_corn', 'peanut_butter_cookie', 'tuna_salad', 'vanilla_wafer_cookie', 'berry_yogurt', 'cabbage_leaves', 'cherries_in_juice', 'cranberry_vodka', 'red_jalapeno_jelly', 'saltines', 'invert_sugar', 'butterscotch_candy', 'coarse_black_pepper', 'low_fat_creme_fraiche', 'fat_free_sour_cream', 'non_fat_cooking_spray', 'lite_firm_tofu', 'kamut', 'lemon_infused_olive_oil', 'light_sour_cream', 'roasted_sunflower_seeds', 'onion_dip_mix', 'eggs', 'pork_rib_roast', 'butter_flavored_cracker_crumbs', 'raspberry_preserves', 'frozen_broccoli_and_cauli

## How often do ground truth substitution tuples appear in recipes?

In [6]:
ground_truth_tuple_recipes = getRecipeIdsForSubTuples(extended_recipes)
ground_truth_tuple_counts = {gt_sub: len(recipe_list) for gt_sub, recipe_list in ground_truth_tuple_recipes.items()}
ground_truth_tuple_counts = dict(sorted(ground_truth_tuple_counts.items(), key=lambda item: item[1], reverse=True))


In [7]:
print(list(ground_truth_tuple_counts.items())[:15])
ground_truth_tuple_counts_sorted = sorted(ground_truth_tuple_counts.items(), key=lambda x:x[1], reverse=True)
print(ground_truth_tuple_counts_sorted[:15])
# print(list(ground_truth_tuple_counts.items())[-5:])

#('ground_beef', 'ground_turkey'): 253,
#('vegetable_oil', 'applesauce'): 240,
#('heavy_cream', 'milk'): 219,
#('plain_yogurt', 'sour_cream'): 168,
#('butter', 'applesauce'): 133, # could be interesting because it will not always work
#('parsley', 'cilantro'): 132, # this too
#('salt', 'garlic_salt'): 115,

#('beef', 'boneless_skinless_chicken_breast'): 8,
#('beer', 'beef_broth'): 8,
#('frozen_spinach', 'broccoli'): 8,
#('red_wine', 'red_wine_vinegar'): 8,
#('lean_ground_beef', 'veggie_crumbles'): 8,
#('chicken', 'ground_turkey'): 2, # rare but should often fit, except the preparation could make it different?
#('carrot', 'green_bell_pepper'): 8,
#('mushroom', 'red_bell_pepper'): 7,


[(('butter', 'olive_oil'), 547), (('water', 'chicken_broth'), 412), (('walnut', 'pecan'), 392), (('sugar', 'honey'), 373), (('margarine', 'butter'), 349), (('butter', 'margarine'), 320), (('pecan', 'walnut'), 272), (('vegetable_oil', 'applesauce'), 239), (('ground_beef', 'ground_turkey'), 230), (('egg', 'egg_white'), 222), (('onion', 'onion_powder'), 213), (('sour_cream', 'yogurt'), 212), (('water', 'beef_broth'), 200), (('water', 'milk'), 199), (('water', 'chicken_stock'), 194)]
[(('butter', 'olive_oil'), 547), (('water', 'chicken_broth'), 412), (('walnut', 'pecan'), 392), (('sugar', 'honey'), 373), (('margarine', 'butter'), 349), (('butter', 'margarine'), 320), (('pecan', 'walnut'), 272), (('vegetable_oil', 'applesauce'), 239), (('ground_beef', 'ground_turkey'), 230), (('egg', 'egg_white'), 222), (('onion', 'onion_powder'), 213), (('sour_cream', 'yogurt'), 212), (('water', 'beef_broth'), 200), (('water', 'milk'), 199), (('water', 'chicken_stock'), 194)]


In [8]:
print(ground_truth_tuple_recipes[('egg_noodle', 'macaroni_noodle')])
print(ground_truth_tuple_counts[('egg_noodle', 'macaroni_noodle')])

['32ac082c31', 'dd43625438', '0fe9a1105b', 'beb22afa59', '99ae52fa77', '5c7aeac637', 'f472881b2e', '4f73ae2f24']
8


In [9]:
one_hot_recipe_ingredients, failed_recipes = create_one_hot_ingredients_per_df(extended_recipes, ingredients, PATH_ONE_HOT_RECIPE_INGREDIENTS)
failed_recipes

[]

In [10]:
# recipe_id = "ffb1c2b5ad"
# distances_hamming = get_hamming_distances(recipe_id, one_hot_recipe_ingredients)


In [11]:
# distances_hamming = list(set(distances_hamming))
# distances_hamming = sorted(distances_hamming, key=lambda x: x[1])
# distances_hamming

In [10]:
def getRecipeFromComments(recipeId, allComments):
    recipe = None
    for commentRecipe in allComments:
        gt_ingredient_found = False
        comment_recipe_id = commentRecipe['id']
        if recipeId == comment_recipe_id:
            if recipe is None:
                recipe = {}
                recipe["id"] = comment_recipe_id
                recipe["subs"] = []
                recipe["ingredients"] = commentRecipe["ingredients"]

            if recipe is not None:
                recipe["subs"].append(commentRecipe["subs"])

    return recipe

# def getRecipeIdsForGTSub(gt_sub, extended_recipes):
#     recipe_ids = []
#     for recipe in extended_recipes:
#         recipe_id = recipe["id"]
#         subs = recipe["subs"]
#         if isinstance(subs, list) and gt_sub in subs:
#             recipe_ids.append(recipe_id)
#         elif gt_sub == subs:
#             recipe_ids.append(recipe_id)
#     return recipe_ids

In [13]:
# sample_gt_sub = ('chicken', 'ground_turkey')
# sample_recipe_ids = getRecipeIdsForGTSub(sample_gt_sub, extended_recipes)
# sample_recipe_ids

In [11]:
# r1 = getRecipeFromComments("ffb1c2b5ad", extended_recipes)
# r2 = getRecipeFromComments("999d8c4b0d", extended_recipes)
# r3 = getRecipeFromComments("4b6a3ba513", extended_recipes)

print(json.dumps(recipes_extended_dict["ffb1c2b5ad"], indent=4))

{
    "id": "ffb1c2b5ad",
    "ingredients": [
        [
            "flour"
        ],
        [
            "puff_pastry_sheet",
            "puff_pastry_sheets"
        ],
        [
            "tuna"
        ],
        [
            "onion",
            "onions",
            "round_onion"
        ],
        [
            "celery"
        ],
        [
            "mayonnaise",
            "lemon_mayonnaise",
            "wasabi_mayonnaise",
            "canola_oil_mayonnaise"
        ],
        [
            "salt_and_pepper"
        ],
        [
            "american_cheese"
        ],
        [
            "egg_wash"
        ]
    ],
    "subs": [
        "onion",
        "scallion"
    ],
    "instructions": [
        "Preheat the oven to 375F",
        "On a floured surface, roll out puff pastry sheets into 12- by 18-inch rectangles.",
        "Slide them onto a baking pan and place in the refrigerator.",
        "Let them settle about one hour, until firm.",
        "While the 

## Check for some Common side ingredients

### set up filter phrase categories

In [12]:
PASTA = ["pasta", "noodle"]
FLOUR = ["flour", "wheat", "starch"]
RICE = ["rice"]
BREAD = ["bread"]
POTATO = ["potato"]
OIL = ["oil"]
OIL = ["oil", "butter", "cream", "margarine"]
SPICES = ["salt", "sugar", "celery"] #maybe pepper? maybe mustard? but they could miss many interesting recipes but we can remove pepper for pepper for sure
MISC = ["water", "egg", "baking_powder", "baking_soda", "juice", "extract", "seed", "mix", "syrup", "milk", "shallot", "onion"]

BLACKLIST_PHRASES = [*PASTA, *FLOUR, *RICE, *BREAD, *POTATO, *OIL, *SPICES, *MISC]
print(f"blacklist phrases: {BLACKLIST_PHRASES}")

def getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, common_side_ingredients, exclude_keywords):
    matched_ingredients = []
    for ingredient in ingredients:
        for side_ingredient in common_side_ingredients:
            if len(exclude_keywords) < 1:
                if side_ingredient in ingredient and ingredient not in matched_ingredients:
                    matched_ingredients.append(ingredient)
            else:
                for exclude_keyword in exclude_keywords:
                    if side_ingredient in ingredient and ingredient not in matched_ingredients and exclude_keyword not in ingredient:
                        matched_ingredients.append(ingredient)
    return matched_ingredients

def checkIngredientContainsBlacklistPhrase(ingredient, blacklist_phrases, whitelist_phrases = []):
    for blacklist_phrase in blacklist_phrases:
        if blacklist_phrase in ingredient:
            for whitelist_phrase in whitelist_phrases:
                if whitelist_phrase in ingredient:
                    return False
            return True
    return False

checked_for_sugga = checkIngredientContainsBlacklistPhrase("sugar, granulated", BLACKLIST_PHRASES)
print(f"Did granulated sugar get blacklisted? {checked_for_sugga}")
checked_for_steve = checkIngredientContainsBlacklistPhrase("jobs, steve", BLACKLIST_PHRASES)
print(f"Did steve jobs get blacklisted? {checked_for_steve}")

pasta_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, PASTA, ["sauce"])
flour_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, FLOUR, [])
rice_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, RICE, [])
bread_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, BREAD, [])
potato_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, POTATO, [])
oil_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, OIL, [])
spices_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, SPICES, [])
misc_matched_ingredients = getAllSpecificIngredientNamesForCommonSideIngredient(ingredients, MISC, [])


print(len(pasta_matched_ingredients))
print(len(flour_matched_ingredients))
print(len(rice_matched_ingredients))
print(len(bread_matched_ingredients))
print(len(potato_matched_ingredients))
print(len(oil_matched_ingredients))
print(len(spices_matched_ingredients))
print(len(misc_matched_ingredients))

blacklist phrases: ['pasta', 'noodle', 'flour', 'wheat', 'starch', 'rice', 'bread', 'potato', 'oil', 'butter', 'cream', 'margarine', 'salt', 'sugar', 'celery', 'water', 'egg', 'baking_powder', 'baking_soda', 'juice', 'extract', 'seed', 'mix', 'syrup', 'milk', 'shallot', 'onion']
Did granulated sugar get blacklisted? True
Did steve jobs get blacklisted? False
106
155
106
128
64
366
198
770


# Generating Recipe Set
1. get extended recipes
1. Filter GT samples according to frequency of pairs, ingredients, baseline model prediction quality
1. Fitler recipes based on if the gt source is in the recipe (? This is more relevant for the extension of the recipe)
1. Filter based on trivial substitutions (but how?)
1. Filter based on assumed importance of the ingredient to the recipe (but how?)
1. Filter according to additional features, like substituting one food class with another or with the same food class


In [13]:
# from the list of the most common sub tuples, remove the ones with the most common ingredients and most common combined ingredient counts (for the least common tuples vice versa)

ingredient_frequency_theshold = 3

recipes_per_ingredient_filtered = {ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)>3}
filtered_out_uncommon_ingredients = list({ingredient_name: recipes for ingredient_name, recipes in recipes_per_ingredient.items() if len(recipes)<=3}.keys())
recipe_ingredient_counts_filtered = {recipe_id: len(recipes) for recipe_id, recipes in recipes_per_ingredient_filtered.items()}
recipe_ingredient_counts_filtered = dict(sorted(recipe_ingredient_counts_filtered.items(), key=lambda item: item[1], reverse=True))
total_ingredient_count_filtered = len(recipe_ingredient_counts_filtered)
print(total_ingredient_count_filtered)

very_most_common_ingrs = list(recipe_ingredient_counts_filtered.keys())[:38]
print(f"Top most common ingredients: {very_most_common_ingrs}")
print(f"Filtered out uncommon ingredients: {filtered_out_uncommon_ingredients}")

3842
Top most common ingredients: ['salt', 'butter', 'onion', 'egg', 'sugar', 'garlic_clove', 'water', 'olive_oil', 'flour', 'milk', 'pepper', 'baking_powder', 'brown_sugar', 'baking_soda', 'all_purpose_flour', 'vegetable_oil', 'carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley']
Filtered out uncommon ingredients: ['100%_bran', '12_inch_pizza_crust', '2%_buttermilk', '2%_evaporated_milk', '2%_mozzarella_cheese', '6_inch_corn_tortilla', '85%_lean_ground_beef', '8_inch_flour_tortilla', '9"_pastry_pie_shell', '9"_unbaked_pie_shell', '96%_lean_ground_beef', '9_inch_baked_pie_crust', '9_inch_graham_cracker_crust', 'a.1._original_sauce', 'abalone', 'absinthe', 'absolut_citron_vodka', 'achiote', 'achiote_oil', 'achiote_paste', 'achiote_powder', 'acor

In [14]:
print("Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) \n")
print(len(ground_truth_tuple_counts.keys()), "\n")

trivial_subs = []
def checkTrivialSubs(sub_tuple):
    ingr_1, ingr_2 = sub_tuple
    if ingr_1 in ingr_2 or ingr_2 in ingr_1:
        trivial_subs.append(sub_tuple)
        return True
    else:
        return False

unimportant_subs = []
def checkUnimportantSource(sub_tuple, very_most_common_ingrs):
    ingr_1, ingr_2 = sub_tuple
    for very_common_ingr in very_most_common_ingrs:
        if very_common_ingr in ingr_1:
            unimportant_subs.append(sub_tuple)
            return True
        else:
            return False

esoteric_subs = []
def check_esotericSource(sub_tuple, filtered_out_uncommon_ingredients):
    ingr_1, ingr_2 = sub_tuple
    for esoteric_ingr in filtered_out_uncommon_ingredients:
        if esoteric_ingr in ingr_1:
            esoteric_subs.append(sub_tuple)
            return True
        else:
            return False

### we could also remove subst that partially overlap, but that potentially removes many interesting substitutions (like "beef roast - venison roast" maybe) but we could try

ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkTrivialSubs(gt_tuple)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not checkUnimportantSource(gt_tuple, very_most_common_ingrs)}
ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not check_esotericSource(gt_tuple, filtered_out_uncommon_ingredients)}


print(len(ground_truth_tuple_counts), "\n")
INCLUDE_FILTER_INGREDIENTS = True
if not INCLUDE_FILTER_INGREDIENTS:
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in pasta_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in flour_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in rice_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in bread_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in potato_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in oil_matched_ingredients}
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in spices_matched_ingredients}
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in spices_matched_ingredients} # throw away spicy tail
    print(len(ground_truth_tuple_counts), "\n")
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[0] in misc_matched_ingredients}
    ground_truth_tuple_counts = {gt_tuple: recipe_count for gt_tuple, recipe_count in ground_truth_tuple_counts.items() if not gt_tuple[1] in misc_matched_ingredients} # throw away miscelanious tail
print(len(ground_truth_tuple_counts), "\n")

total_nr_sub_tuples = len(ground_truth_tuple_counts.keys())
print(f"Total number of remainingsubstitution tuples: {total_nr_sub_tuples}")
# print(f"nr of trivial substitutions {len(trivial_subs)}")
# print(f"nr of unimportant substitutions {len(unimportant_subs)}")

most_common_sub_tuples = list(ground_truth_tuple_counts.keys())[:total_nr_sub_tuples // 4]
least_common_sub_tuples = list(ground_truth_tuple_counts.keys())[total_nr_sub_tuples // 4 * 3:]
# print(len(most_common_sub_tuples))
# print(len(least_common_sub_tuples))

Number of distinct ground truth substitution tuples (prints after applying another filter (inline)) 

30832 

29410 

29410 

Total number of remainingsubstitution tuples: 29410


## get most and least common ingredients

In [15]:


most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[:total_ingredient_count_filtered // 4]
least_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[total_ingredient_count_filtered // 4 * 3:]

# make upper ban list until index 16
most_common_ingredients = list(recipe_ingredient_counts_filtered.keys())[16:total_ingredient_count_filtered // 4]
# limit occurence of lower edge ingredients to being required to occur in 10(?) recipes?
print(f"The most common remaining ingredients: {most_common_ingredients}")
print("\n")
print(f"The least common remaining ingredients{least_common_ingredients}")

The most common remaining ingredients: ['carrot', 'parmesan_cheese', 'salt_and_pepper', 'vanilla', 'cinnamon', 'sour_cream', 'black_pepper', 'green_onion', 'lemon_juice', 'celery', 'garlic_powder', 'tomato', 'oil', 'vanilla_extract', 'garlic', 'chicken_broth', 'soy_sauce', 'honey', 'potato', 'cream_cheese', 'mayonnaise', 'fresh_parsley', 'paprika', 'cornstarch', 'ground_beef', 'chili_powder', 'worcestershire_sauce', 'unsalted_butter', 'fresh_ground_black_pepper', 'walnut', 'parsley', 'ground_cinnamon', 'nutmeg', 'cayenne_pepper', 'lemon', 'margarine', 'bacon', 'extra_virgin_olive_oil', 'green_pepper', 'red_onion', 'ground_cumin', 'cheddar_cheese', 'granulated_sugar', 'bay_leaf', 'ground_black_pepper', 'raisin', 'pecan', 'boneless_skinless_chicken_breast', 'tomato_sauce', 'heavy_cream', 'banana', 'red_bell_pepper', 'zucchini', 'dried_oregano', 'cumin', 'mozzarella_cheese', 'canola_oil', 'dijon_mustard', 'kosher_salt', 'mushroom', 'tomato_paste', 'diced_tomato', 'chicken_stock', 'dry_par

In [16]:
# print(len(most_common_sub_tuples))
top_tuples_top_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in most_common_ingredients]
top_tuples_bot_sources = [common_tuple for common_tuple in most_common_sub_tuples if common_tuple[0] in least_common_ingredients]

bot_tuples_top_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in most_common_ingredients]
bot_tuples_bot_sources = [common_tuple for common_tuple in least_common_sub_tuples if common_tuple[0] in least_common_ingredients]

print(f"top tuples - top sources {len(top_tuples_top_sources)}")
print(f"top tuples - bot sources {len(top_tuples_bot_sources)}")
print(f"bot tuples - tot sources {len(bot_tuples_top_sources)}")
print(f"bot tuples - bot sources {len(bot_tuples_bot_sources)}")

top tuples - top sources 5431
top tuples - bot sources 61
bot tuples - tot sources 4403
bot tuples - bot sources 290


In [17]:
bot_tuples_bot_sources[:5]

[('chicken_leg_quarter', 'smoked_salmon'),
 ('beef_top_round_steak', 'seasoning'),
 ('cream_soup', 'cream_of_mushroom_soup'),
 ('frozen_sweetened_strawberry', 'orange_slice'),
 ('fresh_udon_noodle', 'spaghetti_noodle')]

In [18]:
bot_tuples_top_sources[:5]

[('lean_ground_beef', 'rice'),
 ('cream_of_mushroom_soup', 'cream_of_shrimp_soup'),
 ('whipping_cream', 'cream_cheese'),
 ('celery', 'grape'),
 ('red_lentil', 'red_bean')]

In [19]:
top_tuples_bot_sources[:5]

[('plum_jam', 'apricot_jam'),
 ('vegemite', 'marmite'),
 ('pepsi', 'coke'),
 ('lean_ground_chicken', 'ground_turkey'),
 ('yellow_hominy', 'corn')]

In [20]:
recipe_ingredient_pair = ('hot_pork_sausage', 'turkey_sausage')
recipe_ids = recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')]
print(recipe_ids)

[]


In [21]:
print(recipes_per_ingredient_pairs[('1%_fat_buttermilk', '1%_fat_cottage_cheese')])
print(recipes_per_ingredient_pairs[('hot_pork_sausage', 'turkey_sausage')])
print(recipes_per_ingredient_pairs[('cooked_ham', 'pork')])
# print(recipes_per_ingredient_pairs[('pork', 'cooked_ham')])
# I should check that... if, for a recipe, there are gt subst for which not both ingredients are contained in the recipe ingredient set, then this must be filtered (and can also help maybe understanding some of the performance of the gismo model)

['d7c2a81e73', '8886e63259']
[]
[]


## Get recipes where the sub source is one of the ingredients with the highest weight 

In [22]:
def getAbsoluteRankOfIngredient(ingredient, recipe):
    ingredient_quantities = recipe["ingredient_quantities"]
    # sort the quantities dict
    ingredient_quantities = dict(sorted(ingredient_quantities.items(), key=lambda item: item[1], reverse=True))

    # get the 1m name from recipe
    for _ingr_gismo, _ingr_1m in zip(recipe["ingredients"], recipe["original_ingredients"]):
        is_found = False
        if isinstance(_ingr_gismo, list):
            for _ingr_var_gismo in _ingr_gismo:
                if _ingr_var_gismo == ingredient:
                    ingredient = _ingr_1m
                    is_found = True
                    break
        else:
            if _ingr_gismo == ingredient:
                ingredient = _ingr_1m
                break
        if is_found:
            break


    rank = len(ingredient_quantities)
    # get placement of ingredient
    for i, ingr_with_quantity  in enumerate(list(ingredient_quantities.keys())):
        if ingr_with_quantity == ingredient:
            rank = i + 1

    return rank

def getRelativeRankOfIngredient(ingredient, recipe):
    absolute_rank = getAbsoluteRankOfIngredient(ingredient, recipe)
    relative_rank = absolute_rank / len(recipe["ingredients"])
    return relative_rank

# if relative rank <= 0.25:
#  add the gismo recipe and the gt sample to the list of candidate recipe substitutions

rec = recipes_extended_dict["ffb1c2b5ad"]
sub = rec["subs_collection"][0]
source = sub[0]
source_rank = getAbsoluteRankOfIngredient(source, rec)
print(sub)
print(rec["ingredient_quantities"])
print(source_rank)

['onion', 'scallion']
{'2 puff pastry sheets': 2.0, '2 (6 ounce) cans tuna (I used salmon and crab before)': 56.699, '2 tablespoons chopped onions': 29.5736, '2 tablespoons chopped celery': 29.5736, '12 cup mayonnaise': 118.294, '4 slices American cheese, cut in half': 4.0}
3


In [23]:
def getSubRecipeCouplingsWhereSourceIsNthMostHeavyIngr(n, recipes_extended_dict, relative = False):
    """Creates a dict with substitution touple keys to a list of extended_recipes as values.

    Args:
        n (_type_): nth weight rank which should be achieved by the substitution source.
        recipes_extended_dict (_type_): _description_
    """
    recipes_to_substitution_tuple_candidates = {}
    for __build_class__recipe_id, recipe in list(recipes_extended_dict.items()):
        subs = recipe["subs_collection"]
        for sub in subs:
            if not isinstance(sub, list):
                assert(False)
            source = sub[0]
            target = sub[1]
            sub_pair = (source, target)
            if relative == False:
                source_rank = getAbsoluteRankOfIngredient(source, recipe)
            else: source_rank = getRelativeRankOfIngredient(source, recipe)
            if source_rank <= n:
                if sub_pair not in list(recipes_to_substitution_tuple_candidates.keys()):
                    recipes_to_substitution_tuple_candidates[sub_pair] = []
                recipes_to_substitution_tuple_candidates[sub_pair].append(recipe)
    return recipes_to_substitution_tuple_candidates

In [24]:
n = 3

recipes_to_substitution_tuple_candidates = getSubRecipeCouplingsWhereSourceIsNthMostHeavyIngr(n, recipes_extended_dict)

pass

### filter the sub-pair - recipes-list candidates

In [25]:
# do some post filtering on the candidtate tuples

# filter the remaining recipes with nutri info in which the gt source plays a major role
recipes_to_substitution_tuple_candidates = {sub_tuple: recipe_list for sub_tuple, recipe_list in list(recipes_to_substitution_tuple_candidates.items()) if not checkIngredientContainsBlacklistPhrase(sub_tuple[0], BLACKLIST_PHRASES)}


# and filter out the very most common ingredients from the gismo dataset (should filter target as well?)
recipes_to_substitution_tuple_candidates = {sub_tuple: recipe_list for sub_tuple, recipe_list in list(recipes_to_substitution_tuple_candidates.items()) if not sub_tuple[0] in very_most_common_ingrs}

# filter the very least common ingredients (very few ingredients may have a hard time in inference, do for source and target)
recipes_to_substitution_tuple_candidates = {sub_tuple: recipe_list for sub_tuple, recipe_list in list(recipes_to_substitution_tuple_candidates.items()) if not sub_tuple[0] in least_common_ingredients}
recipes_to_substitution_tuple_candidates = {sub_tuple: recipe_list for sub_tuple, recipe_list in list(recipes_to_substitution_tuple_candidates.items()) if not sub_tuple[1] in least_common_ingredients}

# filter out very short recipes
for sub_pair, recipe_list in list(recipes_to_substitution_tuple_candidates.items()):
    recipes_to_substitution_tuple_candidates[sub_pair] = [recipe for recipe in recipe_list if len(recipe["ingredients"])>=3]

# filter tuples such that we have at least 3 recipes to select from
recipes_to_substitution_tuple_candidates = {sub_tuple: recipe_list for sub_tuple, recipe_list in list(recipes_to_substitution_tuple_candidates.items()) if len(recipe_list)>=3}



In [29]:
print(f"number of distinct gt substitution tuples {len(recipes_to_substitution_tuple_candidates)}")

nr_recipe_pair_combinations = 0
for sub_pair, recipe_list in list(recipes_to_substitution_tuple_candidates.items()):
    nr_recipe_pair_combinations += len(recipe_list)
print(f"total number of recipe-tuple pairs {nr_recipe_pair_combinations}")

number of distinct gt substitution tuples 768


total number of recipe-tuple pairs 5302


In [31]:
with open(os.path.abspath("./outputs/recipes_to_substitution_tuple_candidate_common_filtered.pkl"), "wb") as tmp_res_file:
    pickle.dump(recipes_to_substitution_tuple_candidates, tmp_res_file)

In [23]:
def printPairsForIngredientToken(ingredient_token, pairs):
    for sub_pair in pairs:
        if ingredient_token in sub_pair[0]:
            print(sub_pair)

query_ingredient = "ice"
printPairsForIngredientToken(query_ingredient, list(recipes_to_substitution_tuple_candidates.keys()))

('sliced_almond', 'walnut')
('diced_tomato', 'tomato_sauce')
('sliced_almond', 'pecan')
('sliced_mushroom', 'broccoli')
('vermicelli', 'spaghetti')
('diced_tomato', 'crushed_tomato')
('diced_tomato', 'rotel_tomato')
('ice_cube', 'banana')
('sliced_mushroom', 'onion')
('ice', 'banana')
('ice', 'water')


In [24]:
print(list(recipes_to_substitution_tuple_candidates.keys()))

[('lean_ground_beef', 'chicken_breast'), ('goat_cheese', 'feta_cheese'), ('lemon', 'orange'), ('bay_scallop', 'shrimp'), ('chicken', 'turkey'), ('blueberry', 'strawberry'), ('apple', 'applesauce'), ('leek', 'scallion'), ('ground_beef', 'ground_turkey'), ('ground_beef', 'ground_sausage'), ('ground_beef', 'turkey_sausage'), ('ground_beef', 'turkey'), ('shortening', 'butter'), ('ground_turkey', 'ground_beef'), ('raisin', 'dried_apricot'), ('corn_flake', 'french_fried_onion'), ('green_pepper', 'red_pepper'), ('ground_turkey', 'beef'), ('ground_beef', 'ground_venison'), ('pistachio', 'almond'), ('catsup', 'tomato_sauce'), ('parsley', 'cilantro'), ('tomato_sauce', 'diced_tomato'), ('white_wine', 'chicken_broth'), ('yellow_squash', 'zucchini'), ('half_and_half', 'heavy_cream'), ('salsa', 'rotel_tomato'), ('salsa', 'rotel'), ('salsa', 'diced_tomato'), ('walnut', 'pecan'), ('semi_sweet_chocolate_chip', 'white_chocolate_chip'), ('italian_sausage', 'ground_beef'), ('banana', 'applesauce'), ('peca

In [29]:
short_recipe_count = 0

for sub_pair, recipe_list in list(recipes_to_substitution_tuple_candidates.items()):
    recipes_to_substitution_tuple_candidates[sub_pair] = [recipe for recipe in recipe_list if len(recipe["ingredients"])>=3]


for sub_pair, recipe_list in list(recipes_to_substitution_tuple_candidates.items()):
    for recipe in recipe_list:
        if len(recipe["ingredients"]) < 4:
            short_recipe_count += 1
        # if len(recipe["ingredients"])<4:
        #     print(recipe["original_ingredients"])
    if len(recipe_list) < 3:
        print(f"recipes for recipe pair {sub_pair}: {len(recipe_list)}")
    # print(recipe_ingredient_counts[sub_pair[0]])

print(short_recipe_count)

63


## Make final preparations
- sample form the candidate list, with 2-3 recipes per sub pair (maybe apply some of additional criteria layed out in the draft)
- maybe balance that it is not too much meat or nuts
- fill up with 2-3 recipes where the ingredient was not recommended
- send it

### sample from source ingredients with a range of how common they are in the recipe dataset.

In [30]:
# print(recipe_ingredient_counts)
_samples_ordered_by_source_ingredient_counts = [(sample, recipe_ingredient_counts[sample[0][0]]) for sample in list(recipes_to_substitution_tuple_candidates.items()) if sample[0][0] in recipe_ingredient_counts]
_samples_ordered_by_source_ingredient_counts = sorted(_samples_ordered_by_source_ingredient_counts, key=lambda x: x[1], reverse=True)

samples_ordered_by_source_ingredient_counts = {sample[0]: sample[1] for sample, _freq in _samples_ordered_by_source_ingredient_counts}

print(list(samples_ordered_by_source_ingredient_counts.items())[0])



(('paprika', 'chili_powder'), [{'id': '1314caaf2a', 'ingredients': [['black_pepper', 'coarse_black_pepper', 'mccormick_black_pepper'], ['garlic_powder', 'salt_free_garlic_powder', 'garlic_powder_with_parsley'], ['paprika'], ['pork_chop', 'pork_chops', 'pork_neck'], ['vegetable_oil', 'vegetable_oil_cooking_spray', 'castor_oil'], ['bacon_grease'], ['flour']], 'subs': ['paprika', 'chili_powder'], 'instructions': ['Heat oven to 250F (low).', 'Mix spices together and rub generously onto both sides of chops.', 'Set aside.', 'Heat oil and bacon grease in a large skillet.', 'Dredge chops in flour (using a plastic grocery bag makes it super easy) and sear chops (in batches) over medium high heat just to brown the outside.', 'Add extra oil if necessary.', 'Place the chops into a 9x13 pan directly from the pan so the excess grease also goes into the baking dish.', 'When all the chops are seared and in the baking dish, cover very tightly with foil.', 'Bake for about 2 hours for perfect tenderness.

In [36]:
CANDIDATE_SAMPLES_SET_PATH = os.path.abspath("./outputs/survey_candidate_samples.pkl")

desired_nr_sample_keys = 190
sample_stride = len(list(samples_ordered_by_source_ingredient_counts.items())) // desired_nr_sample_keys
print(f"sample stride: {sample_stride}")
substitution_candidates_with_gt_recipes = {sub_pair: recipe_list for sub_pair, recipe_list in list(samples_ordered_by_source_ingredient_counts.items())[::sample_stride]}

print(f"number of sampled substitution pairs wiht gt reicpe lists appended {len(substitution_candidates_with_gt_recipes)}")

# print_substitution_candidates_with_gt_recipes = [[list(sub_pair), recipe] for sub_pair, recipe in substitution_candidates_with_gt_recipes]
with open(CANDIDATE_SAMPLES_SET_PATH, "wb") as candidate_samples_file:
    pickle.dump(substitution_candidates_with_gt_recipes, candidate_samples_file, protocol=pickle.HIGHEST_PROTOCOL)


sample stride: 3
number of sampled substitution pairs wiht gt reicpe lists appended 252


### Build actual samples with 3 recipes from the gt set where the gt sub is recommended

In [178]:
nr_of_recipes_to_include = 3

final_samples = []

for substitution_tuple, recipe_list in list(substitution_candidates_with_gt_recipes.items()):
    sample_recipes_for_sub_pair = random.sample(recipe_list, nr_of_recipes_to_include)
    for recipe in sample_recipes_for_sub_pair:
        final_samples.append((substitution_tuple, recipe))

print(len(final_samples))
for final_samples_item in final_samples[:4]:
    print(final_samples_item)
    print("\n")


756
(('paprika', 'chili_powder'), {'id': '1314caaf2a', 'ingredients': [['black_pepper', 'coarse_black_pepper', 'mccormick_black_pepper'], ['garlic_powder', 'salt_free_garlic_powder', 'garlic_powder_with_parsley'], ['paprika'], ['pork_chop', 'pork_chops', 'pork_neck'], ['vegetable_oil', 'vegetable_oil_cooking_spray', 'castor_oil'], ['bacon_grease'], ['flour']], 'subs': ['paprika', 'chili_powder'], 'instructions': ['Heat oven to 250F (low).', 'Mix spices together and rub generously onto both sides of chops.', 'Set aside.', 'Heat oil and bacon grease in a large skillet.', 'Dredge chops in flour (using a plastic grocery bag makes it super easy) and sear chops (in batches) over medium high heat just to brown the outside.', 'Add extra oil if necessary.', 'Place the chops into a 9x13 pan directly from the pan so the excess grease also goes into the baking dish.', 'When all the chops are seared and in the baking dish, cover very tightly with foil.', 'Bake for about 2 hours for perfect tenderne

### Add to actual samples with 2 recipes from the recipe set where the gt sub is not explicitly recommended

In [179]:
# for all considered gt_sub_pairs, conisder all recipes that contain the source ingredient but that don't have the sub pair as gt_sub_pair

def getRecipesSubsAreIngredientsButNotGT(gt_source, gt_target, recipes_extended_dict):
    matched_recipes = []
    for recipe_id, recipe in list(recipes_extended_dict.items()):
        ingredients = recipe["ingredients"]

        # TODO exclude recipes where (gt_source, gt_target) is already in the sub list

        has_gt_source_as_sub_source = False
        subs_collection = recipe["subs_collection"]
        for sub in subs_collection:
            if isinstance(sub[0], list):
                sub_source = sub[0][0]
            else:
                sub_source = sub[0]
            if gt_source == sub_source:
                has_gt_source_as_sub_source = True
                break
        if has_gt_source_as_sub_source:
            continue

        for ingredient in ingredients:
            for ingredient_variant in ingredient:
                if ingredient_variant == gt_source:
                    matched_recipes.append(recipe)
                else:
                    continue
    return matched_recipes


alternative_potential_recipes = {}
for sub_pair, _recipe_list in list(substitution_candidates_with_gt_recipes.items()):
    source = sub_pair[0]
    target = sub_pair[1]
    alternative_recipes = getRecipesSubsAreIngredientsButNotGT(source, target, recipes_extended_dict)
    if len(alternative_recipes) > 3:
        alternative_potential_recipes[sub_pair] = alternative_recipes

for sub_pair, _recipe_list in list(substitution_candidates_with_gt_recipes.items()):
    if sub_pair not in list(alternative_potential_recipes.keys()):
        print(sub_pair)



In [180]:
print(list(alternative_potential_recipes.keys())[80])
print(list(alternative_potential_recipes.values())[80][5])

('salsa', 'tomato')
{'id': '0487b7c56a', 'ingredients': [['olive_oil', 'lemon_infused_olive_oil'], ['scallion', 'scallions'], ['garlic_clove', 'garlic_cloves'], ['green_chilies', 'green_chili', 'green_chilies'], ['black_bean', 'black_beans', 'black_bean_salsa'], ['water', 'hot_water'], ['salsa'], ['salt_and_pepper']], 'subs': ['water', 'chicken_broth'], 'instructions': ['In a medium saucepan, heat olive oil over medium heat.', 'Add scallions and garlic cloves and saute until fragrant, about 1 minute.', 'Add black beans and water.', 'Increase heat and bring to a boil.', 'Reduce heat and simmer for about 30 minutes, stirring occasionally.', 'Add salsa and stir until heated through.', 'Add salt and pepper to taste.'], 'title': 'Soupy Black Beans', 'original_ingredients': ['1 teaspoon olive oil', '2 scallions, thinly sliced', '2 garlic cloves, minced', '2 tablespoons green chilies, chopped', '2 (14 1/2 ounce) cans black beans, rinsed and drained', '4 cups water', '14 cup salsa', 'salt and 

In [181]:
# sample the alternative recipes
nr_of_alt_recipes_to_include = 2

for substitution_tuple, recipe_list in list(alternative_potential_recipes.items()):
    sample_recipes_for_sub_pair = random.sample(recipe_list, nr_of_alt_recipes_to_include)
    for recipe in sample_recipes_for_sub_pair:
        final_samples.append((substitution_tuple, recipe))

print(len(final_samples))
for final_samples_item in final_samples[-4:]:
    print(final_samples_item)
    print("\n")

1260
(('fat_free_ricotta_cheese', 'cottage_cheese'), {'id': 'a3223573df', 'ingredients': [['garbanzo_bean', 'garbanzo_beans'], ['lemon_juice'], ['tahini'], ['sesame_oil', 'wok_oil'], ['fat_free_ricotta_cheese'], ['garlic_clove', 'garlic_cloves'], ['green_onion', 'green_onions'], ['green_pepper', 'green_peppers'], ['carrot', 'carrots'], ['cucumber', 'cucumbers', 'lemon_cucumber'], ['ground_cumin'], ['paprika'], ['chili_powder'], ['fresh_parsley'], ['black_pepper', 'coarse_black_pepper', 'mccormick_black_pepper'], ['red_pepper_flake', 'red_pepper_flakes']], 'subs': ['tahini', 'peanut_butter'], 'instructions': ['In a medium bowl, mash the garbanzo beans to your desired texture with a potato masher.', 'A fork would also work.', 'Add lemon juice, tahini, sesame oil, and ricotta.', 'Mix well.', 'Combine garlic, spices, and vegetables with hummus mixture (you may want a bit more or less of each of these depending on your tastes).', 'Add black and red pepper to taste.', 'Serve with pita and fr

### finally save the dataset

In [184]:
with open(SURVEY_COMPLETE_SUB_TUPLE_AND_RECIPE_SET_PATH, "w") as json_file:
    json.dump(final_samples, json_file, indent=2)

# Distribute the suvey data into 2 question sets, add questions IDs

In [83]:
with open(SURVEY_COMPLETE_SUB_TUPLE_AND_RECIPE_SET_PATH, "r") as survey_data_file:
    survey_data = json.load(survey_data_file)

print(len(survey_data))

1260


In [84]:
subs = [datum[0] for datum in survey_data]

already_printed = []
print(len(subs))
additional_filters = [
    ['paprika', 'chili_powder'],
    ['walnut', 'macadamia_nut'],
    ['lemon', 'lime'],
    ['green_pepper', 'red_bell_pepper'],
    ['pecan', 'slivered_almond'],
    ['tomato_sauce', 'pasta_sauce'],
    ['tomato_sauce', 'water'],
    ['tomato_paste', 'ketchup'],
    ['chicken_stock', 'broth'],
    ['chicken_stock', 'turkey_stock'],
    ['chicken_stock', 'chicken_broth'],
    ['green_bell_pepper', 'red_pepper'],
    ['ketchup', 'tomato_sauce'],
    ['scallion', 'chive'],
    ['dry_white_wine', 'chicken_stock'],
    ['vinegar', 'apple_cider_vinegar'],
    ['balsamic_vinegar', 'balsamic_glaze'],
    ['salsa', 'taco_sauce'],
    ['cooked_chicken', 'meat'],
    ['blueberry', 'blackberry'],
    ['blueberry', 'saskatoon_berry'],
    ['white_vinegar', 'white_wine_vinegar'],
    ['leek', 'scallion'],
    ['elbow_macaroni', 'penne_pasta'],
    ['cherry_tomato', 'plum_tomato'],
    ['fresh_chive', 'scallion'],
    ['spaghetti', 'fettuccine'],
    ['cool_whip', 'whipping_cream'],
    ['ice', 'banana'],
    ['tomato_puree', 'diced_tomato'],
    ['italian_sausage', 'meat'],
    ['catsup', 'tomato_sauce'],
    ['miracle_whip', 'mayonnaise'],
    ['lard', 'butter'],
    ['whole_almond', 'pecan'],
    ['walnut_half', 'almond_half'],
    ['macaroni', 'egg_noodle']
]

survey_data = [data for data in survey_data if data[0] not in additional_filters]

# for sub in subs:
#     if sub not in already_printed:
#         already_printed.append(sub)
#         print(sub)

print(len(survey_data))

1260
1075


In [18]:
survey_question_set_A = []
survey_question_set_B = []
for i, survey_datum in enumerate(survey_data):
    survey_datum.append(i)
    if i % 2 == 0:
        survey_question_set_A.append(survey_datum)
    else:
        survey_question_set_B.append(survey_datum)

random.shuffle(survey_question_set_A)
random.shuffle(survey_question_set_B)

with open(SURVEY_QUESTION_SET_A_PATH, "w") as question_set_a_file:
    json.dump(survey_question_set_A, question_set_a_file, indent=2)

with open(SURVEY_QUESTION_SET_B_PATH, "w") as question_set_b_file:
    json.dump(survey_question_set_B, question_set_b_file, indent=2)


print(len(survey_question_set_A))
print(len(survey_question_set_B))

458
457


In [85]:
# print("\n".join(set(["->".join(p[0]) for p in survey_data])))

even_more_filters = [
    ['green_grape', 'red_grape'],
    ['apple', 'applesauce'],
    ['macaroni', 'egg_noodle'],
    ['cherry_tomato', 'plum_tomato'],
    ['green_pepper', 'red_bell_pepper'],
    ['cider_vinegar', 'white_wine_vinegar'],
    ['elbow_macaroni', 'penne_pasta'],
    ['graham_cracker_crumb', 'wafer'],
    ['chicken_piece', 'chicken_breast'],
    ['fat_free_ricotta_cheese', 'cottage_cheese'],
    ['fresh_chive', 'scallion'],
    ['half_and_half', 'fat'],
    ['paprika', 'chili_powder'],
    ['ice', 'banana'],
    ['feta_cheese', 'goat_cheese'],
    ['ketchup', 'tomato_sauce'],
    ['blueberry', 'blackberry'],
    ['cheddar_cheese', 'fat'],
    ['marsala_wine', 'white_wine'],
    ['italian_sausage', 'meat'],
    ['ground_beef', 'sausage'],
    ['green_olive', 'black_olive'],
    ['cool_whip', 'whipping_cream'],
    ['low_sodium_chicken_broth', 'beef_broth'],
    ['cheese_tortellini', 'cheese_ravioli'],
    ['diced_tomato', 'crushed_tomato'],
    ['chicken_stock', 'chicken_broth'],
    ['chicken_stock', 'turkey_stock'],
    ['low_sodium_chicken_broth', 'vegetable_broth'],
    ['ham', 'sausage'],
    ['walnut', 'macadamia_nut'],
    ['fresh_tomato', 'diced_tomato'],
    ['cocoa_powder', 'hot_cocoa_mix'],
    ['catsup', 'tomato_sauce'],
    ['beef_stock', 'water'],
    ['cooked_chicken', 'meat'],
    ['american_cheese', 'cheddar_cheese'],
    ['green_bell_pepper', 'red_pepper'],
    ['spaghetti', 'fettuccine'],
    ['tomato_sauce', 'pasta_sauce'],
    ['pecan_half', 'mixed_nut'],
    ['miracle_whip', 'mayonnaise'],
    ['oat_bran', 'wheat_bran'],
    ['whole_almond', 'pecan'],
    ['chicken_stock', 'broth'],
    ['pecan', 'slivered_almond'],
    ['balsamic_vinegar', 'balsamic_glaze'],
    ['salsa', 'rotel'],
    ['ground_beef', 'hamburger'],
    ['tomato_puree', 'diced_tomato'],
    ['almond', 'pecan'],
    ['deli_ham', 'bacon'],
    ['pumpkin', 'butternut_squash'],
    ['hamburger', 'sausage'],
    ['white_vinegar', 'white_wine_vinegar'],
    ['beef_broth', 'chicken_broth'],
    ['dry_white_wine', 'water'],
    ['vinegar', 'apple_cider_vinegar'],
    ['ground_beef', 'meat']
]

survey_data = [data for data in survey_data if data[0] not in even_more_filters]

print(len(survey_data))

# print([data[0] for data in survey_data])
print(len(set(["->".join(p[0]) for p in survey_data])))
print("\n".join(set(["->".join(p[0]) for p in survey_data])))

# print([p[0] for p in survey_data])

915
183
ground_lamb->ground_beef
monterey_jack_cheese->colby_cheese
tomato_sauce->salsa
ground_beef->turkey_sausage
sliced_almond->pecan
pork_sausage->bacon
walnut->pecan
pork_tenderloin->shrimp
turkey->chicken
frozen_chopped_spinach->broccoli
pistachio->pecan
boneless_skinless_chicken_breast->shrimp
chicken_wing->chicken_leg
ketchup->chili_sauce
white_wine->lemon_juice
semi_sweet_chocolate_chip->walnut
dry_red_wine->beef_broth
walnut->slivered_almond
unsweetened_cocoa_powder->chocolate
feta_cheese->gorgonzola
half_and_half->whipping_cream
raisin->pecan
slivered_almond->walnut
walnut->chocolate_chip
tahini->sesame_oil
italian_sausage->ground_beef
broccoli_floret->green_bean
triple_sec->orange_juice
shortening->lard
extra_large_shrimp->chicken
canned_pumpkin->squash
chocolate_chip->peanut_butter_chip
zucchini->yellow_squash
black_bean->kidney_bean
white_vinegar->apple_cider_vinegar
pecan->mixed_nut
ground_beef->lean_ground_turkey
mushroom->red_bell_pepper
refried_bean->black_bean
bourbo

In [86]:

more_and_more_filters = [
    ['pistachio', 'pecan'],
    ['walnut', 'cashew'],
    ['crabmeat', 'shrimp'],
    ['vegetable_stock', 'beef_broth'],
    ['pecan', 'cashew'],
    ['pork_tenderloin', 'pork_loin_roast'],
    ['fresh_pumpkin', 'squash'],
    ['dry_white_wine', 'chicken_broth'],
    ['oleo', 'butter'],
    ['bacon_grease', 'butter'],
    ['semi_sweet_chocolate_chip', 'peanut_butter_chip'],
    ['nut', 'raisin'],
    ['half_and_half', 'evaporated_milk'],
    ['lean_ground_beef', 'ground_sausage'],
    ['unsweetened_applesauce', 'banana'],
    ['vegetable_broth', 'water'],
    ['currant', 'raisin'],
    ['dried_currant', 'dried_cranberry'],
    ['italian_sausage', 'meatball'],
    ['vegetable_broth', 'water'],
    ['monterey_jack_cheese', 'colby_cheese'],
    ['sole_fillet', 'cod'],
    ['frozen_hash_brown', 'tater_tot'],
    ['cooked_chicken', 'turkey'],
    ['nut', 'raisin'],
    ['blueberry', 'dried_cranberry'],
    ['ketchup', 'barbecue_sauce'],
    ['raisin', 'blueberry'],
    ['lean_ground_beef', 'ground_pork'],
    ['shortening', 'unsalted_butter'],
    ['bourbon', 'whiskey'],
    ['salsa', 'tomato'],
    ['corn_flake', 'ritz_cracker'],
    ['red_wine', 'water'],
    ['beer', 'beef_broth'],
    ['dried_cranberry', 'dried_cherry'],
    ['beef_stew_meat', 'roast'],
    ['cracker', 'breadcrumb'],
    ['vegetable_shortening', 'butter'],
    ['sliced_almond', 'pecan'],
    ['half_and_half', 'whipping_cream'],
    ['tomato_sauce', 'salsa'],
    ['slivered_almond', 'walnut'],
    ['bacon', 'breakfast_sausage'],
    ['walnut_piece', 'almond'],
    ['canned_pumpkin', 'butternut_squash'],
    ['nectarine', 'peach'],
    ['cauliflower_floret', 'broccoli_floret'],
    ['beef_broth', 'red_wine'],
    ['chicken', 'turkey_breast'],
    ['shortening', 'vegetable_oil'],
    ['ground_beef', 'roast_beef'],
    ['cranberry', 'dried_cherry'],
    ['currant', 'date'],
    ['extra_lean_ground_beef', 'ground_turkey'],
    ['bisquick', 'flour'],
    ['chili_sauce', 'ketchup'],
    ['ground_chicken', 'turkey'],
    ['vermicelli', 'spaghetti'],
    ['frozen_pea', 'green_bean'],
    ['yellow_cornmeal', 'polenta'],
    ['blueberry', 'raspberry'],
    ['chicken_wing', 'chicken_leg'],
    ['shortening', 'lard'],
    ['semi_sweet_chocolate_chip', 'walnut'],
    ['triple_sec', 'orange_juice'],
    ['red_wine', 'white_wine'],
    ['golden_raisin', 'dried_cranberry'],
    ['spaghetti_sauce', 'marinara_sauce'],
    ['ketchup', 'chili_sauce'],
    ['bacon', 'turkey_bacon'],
    ['raisin', 'pecan'],
    ['walnut', 'slivered_almond'],
    ['ground_beef', 'bulk_sausage'],
    ['flank_steak', 'sirloin_steak'],
    ['italian_sausage', 'turkey_sausage'],
    ['cooked_ham', 'turkey_bacon'],
    ['tequila', 'vodka'],
    ['ground_turkey', 'lean_ground_beef'],
    ['white_wine', 'lemon_juice'],
    ['lean_ground_turkey', 'beef'],
    ['lean_ground_beef', 'turkey_meat'],
    ['ground_beef', 'turkey_sausage'],
    ['ground_beef', 'breakfast_sausage'],
    ['pecan', 'mixed_nut'],
]

survey_data = [data for data in survey_data if data[0] not in more_and_more_filters]

print(len(survey_data))

500


In [87]:
for i, survey_datum in enumerate(survey_data):
    survey_datum.append(i)

with open(SURVEY_QUESTION_500_SET_PATH, "w") as question_set_a_file:
    json.dump(survey_data, question_set_a_file, indent=2)


In [88]:
#remained substitution tuples
print(len(set(["->".join(p[0]) for p in survey_data])))
print("\n".join(set(["->".join(p[0]) for p in survey_data])))

100
ground_lamb->ground_beef
pork_sausage->bacon
walnut->pecan
pork_tenderloin->shrimp
turkey->chicken
frozen_chopped_spinach->broccoli
boneless_skinless_chicken_breast->shrimp
dry_red_wine->beef_broth
unsweetened_cocoa_powder->chocolate
feta_cheese->gorgonzola
walnut->chocolate_chip
tahini->sesame_oil
italian_sausage->ground_beef
broccoli_floret->green_bean
extra_large_shrimp->chicken
canned_pumpkin->squash
chocolate_chip->peanut_butter_chip
zucchini->yellow_squash
black_bean->kidney_bean
refried_bean->black_bean
mushroom->red_bell_pepper
ground_beef->lean_ground_turkey
white_vinegar->apple_cider_vinegar
chocolate_chip->blueberry
parsley->basil
chicken_breast->sausage
pork_roast->pork_chop
white_wine->white_wine_vinegar
plain_fat_free_yogurt->fat_free_sour_cream
cottage_cheese->ricotta_cheese
gorgonzola->goat_cheese
lean_ground_beef->chicken
ground_pork->ground_chicken
green_bean->snow_pea
kidney_bean->black_bean
fresh_raspberry->blueberry
ground_beef->pepperoni
vegetable_stock->chick

# Get additional Recipes with nutritional information

In [32]:
with open("C:/UM/Master/FoodRecommendations/datasources/Recipe1M/recipes_with_nutritional_info.json", 'r') as recipes_original_with_nutr_info_path:
    recipes_original_with_nutritional_info = json.load(recipes_original_with_nutr_info_path)

recipes_orig_w_nutr_info_dict = {recipe["id"]: recipe for recipe in recipes_original_with_nutritional_info}

for recipe_id, recipe in recipes_orig_w_nutr_info_dict.items():
    recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"] = [ingr["text"] for ingr in recipe["ingredients"]]


def add_most_frequent_ingredient_info_to_orig_w_nutr_info(orig_recipes_w_nutr):
    for recipe_id, recipe in orig_recipes_w_nutr.items():
        ingredients_n_weight = {ingredient_name: ingredient_weight for ingredient_name, ingredient_weight in list(zip(recipe["ingredients"], recipe["weight_per_ingr"]))}
        ingredients_n_weight = dict(sorted(ingredients_n_weight.items(), key=lambda item: item[1], reverse=True))
        orig_recipes_w_nutr[recipe_id]["weight_per_ingr"] = ingredients_n_weight
        # then sort is
    return orig_recipes_w_nutr

recipes_orig_w_nutr_info_dict = add_most_frequent_ingredient_info_to_orig_w_nutr_info(recipes_orig_w_nutr_info_dict)

for i in range(5):
    print(list(recipes_orig_w_nutr_info_dict.items())[i])

('000095fc1d', {'fsa_lights_per100g': {'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}, 'id': '000095fc1d', 'ingredients': ['yogurt, greek, plain, nonfat', 'strawberries, raw', 'cereals ready-to-eat, granola, homemade'], 'instructions': [{'text': 'Layer all ingredients in a serving dish.'}], 'nutr_per_ingredient': [{'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}, {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}, {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}], 'nutr_values_per100g': {'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}, 'partition': 'train', 'quantity': [{'text': '8'}, {'text': '1'}, {'text': '1/4'}], 'title': 'Yogurt Parfaits', 'unit': [{'text': 'ounce'}, {'text': 'cup'},

In [33]:

matching_recipe_ids_from_gismo_and_nutriinfo = list(set(recipes_extended_dict.keys()) & set(recipes_orig_w_nutr_info_dict.keys()))

In [34]:
print(len(matching_recipe_ids_from_gismo_and_nutriinfo))

2498


## Verify that matching recipes are indeed the same

In [35]:
recipe_id = matching_recipe_ids_from_gismo_and_nutriinfo[273]
print(f"{recipes_orig_w_nutr_info_dict[recipe_id]['ingredients']}\n")
print(f"{recipes_extended_dict[recipe_id]}")

['cheese, parmesan, hard', 'sauce, worcestershire', 'spices, garlic powder', 'catsup', 'oil, olive, salad or cooking']

{'id': '42270b17e8', 'ingredients': [['cream_cheese'], ['worcestershire_sauce'], ['garlic_salt'], ['ketchup'], ['vegetable_oil', 'vegetable_oil_cooking_spray', 'castor_oil']], 'subs': ['garlic_salt', 'garlic_powder'], 'instructions': ['Mix all ingredients until creamy and smooth.', 'Chill.', 'Serve with potato chips, corn chips or pretzels.'], 'title': 'Schneider Family Cheese Dip', 'original_ingredients': ['8 ounces cream cheese', '12 tablespoon Worcestershire sauce', '1 teaspoon garlic salt', '13 cup ketchup', '14 cup vegetable oil'], 'ingredient_quantities': {'8 ounces cream cheese': 226.796, '12 tablespoon Worcestershire sauce': 7.3934, '1 teaspoon garlic salt': 4.92892, '13 cup ketchup': 78.86266666666666, '14 cup vegetable oil': 59.147}, 'subs_collection': [['garlic_salt', 'garlic_powder']]}


## Build map between gismo/flavorgraph ingredients and recipe1m with nutri info ingredients

Actually the recipes lists seem to be in the same order in both sources, but I wanted to try how good the matching via these other techniques is

In [36]:
def get_gizmo_name_from_reipce1mnutri_ingredient(ingredient_name, recipe_gismo, recipe_1mnutri):
    for gismo_ingr_name, recipe1mnutri_ingr_name in zip(recipe_gismo["ingredients"], recipe_1mnutri["ingredients"]):
        if recipe1mnutri_ingr_name == ingredient_name:
            return gismo_ingr_name
    raise Exception("ingredient could not be matched for {ingredient_name} in recipe {recipe['id']}")



## Get relative ranks for recipes with nutritional info

In [37]:

def get_relative_weight_rank_of_gt_ingredient_from_1mnutri(gt_ingredient_gismo, recipe_gismo, recipe_1mnutri):
    ingredient_nutri = None
    for _ingr_gismo, _ingr_1mnutri in zip(recipe_gismo["ingredients"], recipe_1mnutri["ingredients"]):
        if isinstance(_ingr_gismo, list):
            for _ingr_var_gismo in _ingr_gismo:
                if _ingr_var_gismo == gt_ingredient_gismo:
                    ingredient_nutri = _ingr_1mnutri
        else:
            if _ingr_gismo == gt_ingredient_gismo:
                ingredient_nutri = _ingr_1mnutri
    if ingredient_nutri is None:
        return -1

    for rank, _ingr_name in enumerate(list(recipe_1mnutri["weight_per_ingr"].keys())):
        if ingredient_nutri == _ingr_name:
            return rank / len(recipe_1mnutri["weight_per_ingr"].keys())

In [38]:
for recipe_id, recipe in list(recipes_extended_dict.items()):
    if len(recipe["subs"])>2:
        print(recipe["subs"])
        break

In [39]:
candidate_substitution_recipes_from_1mnutri = {} #key = substitution pair, value = recipe

in_bounds_counter = 0
out_of_bounds_counter = 0
for recipe_id in matching_recipe_ids_from_gismo_and_nutriinfo:
    recipe_gismo = recipes_extended_dict[recipe_id]
    recipe_1mnutri = recipes_orig_w_nutr_info_dict[recipe_id]
    gt_subs = recipe_gismo["subs"]

    if isinstance(gt_subs[0], list):
        for gt_sub in gt_subs:
            source_ingr = gt_sub[0]
            source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient_from_1mnutri(source_ingr, recipe_gismo, recipe_1mnutri)
            if source_relative_weight_rank <= 0.1:
                if not gt_sub in list(candidate_substitution_recipes_from_1mnutri.keys()):
                    candidate_substitution_recipes_from_1mnutri[gt_sub] = []
                candidate_substitution_recipes_from_1mnutri[gt_sub].append(recipe_gismo)
                in_bounds_counter += 1
            else:
                out_of_bounds_counter += 1
    else:
        gt_sub = gt_subs
        source_ingr = gt_sub[0]
        target_ingr = gt_sub[1]
        sub_tuple = (source_ingr, target_ingr)
        source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient_from_1mnutri(source_ingr, recipe_gismo, recipe_1mnutri)
        if source_relative_weight_rank <= 0.1:
            if not sub_tuple in list(candidate_substitution_recipes_from_1mnutri.keys()):
                candidate_substitution_recipes_from_1mnutri[sub_tuple] = []
            candidate_substitution_recipes_from_1mnutri[sub_tuple].append(recipe_gismo)
            in_bounds_counter += 1
        else:
            out_of_bounds_counter += 1

print(in_bounds_counter)
print(out_of_bounds_counter)

    # for gt_sub in gt_subs:
    #     source_ingr = gt_sub[0]
    #     source_relative_weight_rank = get_relative_weight_rank_of_gt_ingredient_from_1mnutri(source_ingr, recipe_gismo, recipe_1mnutri)
    #     if source_relative_weight_rank <= 0.25:
    #         if not gt_sub in list(candidate_substitution_recipes_from_1mnutri.keys()):
    #             candidate_substitution_recipes_from_1mnutri[gt_sub] = []
    #         candidate_substitution_recipes_from_1mnutri[gt_sub].append(recipe_gismo))

597
1901


In [40]:

## filter the remaining recipes with nutri info in which the gt source plays a majo role
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not checkIngredientContainsBlacklistPhrase(pair[0], BLACKLIST_PHRASES)}

## and filter out the very most common ingredients from the gismo dataset
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not pair[0] in very_most_common_ingrs}

print(f"number of substitution pairs for recipes in which the source is a major component: {len(candidate_substitution_recipes_from_1mnutri)}")
print(f"substitution pairs {list(candidate_substitution_recipes_from_1mnutri.keys())}")

number of substitution pairs for recipes in which the source is a major component: 185
substitution pairs [('ketchup', 'tomato_sauce'), ('red_wine_vinegar', 'white_wine_vinegar'), ('apricot_preserve', 'mango_chutney'), ('cider_vinegar', 'white_wine_vinegar'), ('boneless_skinless_chicken_breast', 'ground_turkey_breast'), ('crushed_pineapple', 'fruit'), ('agave_nectar', 'maple_syrup'), ('white_wine_vinegar', 'balsamic_vinegar'), ('turkey_sausage', 'turkey_pepperoni'), ('pecan', 'almond'), ('rolled_oat', 'oat_bran'), ('non_dairy_chocolate_chip', 'cocoa_powder'), ('balsamic_vinaigrette', 'white_balsamic_vinegar'), ('chicken', 'rib'), ('white_chocolate_chip', 'sweet_chocolate'), ('savoy_cabbage', 'kale'), ('lean_ground_beef', 'turkey'), ('salmon_fillet', 'dried_dill'), ('chocolate_candy_melts', 'crunchy_peanut_butter'), ('lettuce', 'spinach'), ('plain_fat_free_yogurt', 'fat_free_sour_cream'), ('tart_apple', 'frozen_peach'), ('barley', 'barley_flour'), ('unsweetened_applesauce', 'pumpkin'), 

In [41]:
# try filtering out rare ingredients
candidate_substitution_recipes_from_1mnutri = {pair: recipes for pair, recipes in list(candidate_substitution_recipes_from_1mnutri.items()) if not pair[0] in least_common_ingredients}

print(f"number of substitution pairs for recipes in which the source is a major component: {len(candidate_substitution_recipes_from_1mnutri)}")
print(f"substitution pairs {list(candidate_substitution_recipes_from_1mnutri.keys())}")

number of substitution pairs for recipes in which the source is a major component: 180
substitution pairs [('ketchup', 'tomato_sauce'), ('red_wine_vinegar', 'white_wine_vinegar'), ('apricot_preserve', 'mango_chutney'), ('cider_vinegar', 'white_wine_vinegar'), ('boneless_skinless_chicken_breast', 'ground_turkey_breast'), ('crushed_pineapple', 'fruit'), ('agave_nectar', 'maple_syrup'), ('white_wine_vinegar', 'balsamic_vinegar'), ('turkey_sausage', 'turkey_pepperoni'), ('pecan', 'almond'), ('rolled_oat', 'oat_bran'), ('non_dairy_chocolate_chip', 'cocoa_powder'), ('balsamic_vinaigrette', 'white_balsamic_vinegar'), ('chicken', 'rib'), ('white_chocolate_chip', 'sweet_chocolate'), ('savoy_cabbage', 'kale'), ('lean_ground_beef', 'turkey'), ('salmon_fillet', 'dried_dill'), ('chocolate_candy_melts', 'crunchy_peanut_butter'), ('lettuce', 'spinach'), ('plain_fat_free_yogurt', 'fat_free_sour_cream'), ('tart_apple', 'frozen_peach'), ('barley', 'barley_flour'), ('unsweetened_applesauce', 'pumpkin'), 

## Inspecting some of these recipes:

In [42]:
# good are [0, 122, 183]
# bad is [12, 144]
test_idx = 144
test_sub_pair = list(candidate_substitution_recipes_from_1mnutri.keys())[test_idx]
print(f"test sub pair: {test_sub_pair}")
test_recipe = candidate_substitution_recipes_from_1mnutri[test_sub_pair][0]
print(f"test recipe id {test_recipe}")

test sub pair: ('dried_sweetened_cranberry', 'golden_raisin')
test recipe id {'id': 'c6b23133db', 'ingredients': [['frozen_peach', 'frozen_peaches'], ['dried_sweetened_cranberry', 'dried_sweetened_cranberries'], ['peach_nectar'], ['brown_sugar'], ['cinnamon']], 'subs': ['dried_sweetened_cranberry', 'golden_raisin'], 'instructions': ['Mix all the ingredients in a medium sized pan, stirring well.', 'Put the top on the pan and turn the heat to medium.', 'Let it cook for 10 minutes or until the peaches are tender.', '1/4 cup of the mixture is 1 WW point.'], 'title': 'Cinnamon Spiced Peaches (1 Pt)', 'original_ingredients': ['2 cups frozen peaches, thawed', '14 cup dried sweetened cranberries', '14 cup peach nectar', '3 tablespoons brown sugar', '12 teaspoon cinnamon'], 'ingredient_quantities': {'2 cups frozen peaches, thawed': 473.176, '14 cup dried sweetened cranberries': 59.147, '14 cup peach nectar': 59.147, '3 tablespoons brown sugar': 44.3604, '12 teaspoon cinnamon': 2.46446}, 'subs_c

# Prepare the rest of the GISMO recipes to get weight, then calculate the most fitting recipes accordingly

# Some more ingredient matching functions (Should not be needed due to each ingredient lists ordering)

In [43]:
import difflib

gismo_to_recipe1mnutri = {}
recipe1mnutri_to_gismo = {}

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ing = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}
for item1 in ingredients1:
    best_match = difflib.get_close_matches(item1, ingredients2, n=1, cutoff=0.1)
    if best_match:
        matching_items[item1] = best_match[0]

    # for item2 in recipes_extended_dict[recipe_id]["ingredients"]:
    #     if isinstance(item2, list):
    #         item2 = item2[0]
    #     item2 = item2.replace('_', ' ')
    #     best_match = difflib.get_close_matches(item1, item2, n=1, cutoff=0.1)
    #     if best_match:
    #         matching_items[item1] = best_match[0]

for item, match in matching_items.items():
    print(f"{item} matches {match}")

beef, grass-fed, ground, raw matches beef_bouillon_powder
spices, pepper, black matches black_pepper
onions, raw matches onion_salt
spices, onion powder matches onion_powder
spices, parsley, dried matches dry_parsley_flake
salt, table matches onion_salt


In [44]:
import spacy
import datetime

# timestamp1 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp1}")
nlp = spacy.load("en_core_web_sm")

# timestamp2 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp2}")


ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ing = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}

for item1 in ingredients1:

    item1_tokens = set(token.text for token in nlp(item1))

    # timestamp3 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # print(f"timestamp1 {timestamp3}")

    for item2 in ingredients2:
        item2_tokens = set(token for token in item2)
        common_tokens = item1_tokens.intersection(item2_tokens)
        if common_tokens:
            matching_items[item1] = item2
            break

# timestamp4 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print(f"timestamp1 {timestamp4}")

for item, match in matching_items.items():
    print(f"{item} matches {match}")

In [45]:
from fuzzywuzzy import fuzz, process

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ingr = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items_list1 = {}
matching_items_list2 = {}

# Calculate all similarity scores and store the best match for each item in both lists
for item1 in ingredients1:
    best_match, best_score = None, 0

    for item2 in ingredients2:
        score = fuzz.token_sort_ratio(item1, item2)

        if score > best_score:
            best_match = item2
            best_score = score

    if best_score >= 10:  # You can adjust the threshold as needed
        matching_items_list1[item1] = best_match
        matching_items_list2[best_match] = item1

# Print the matched items for list1
print("Matches for list1:")
for item1, item2 in matching_items_list1.items():
    print(f"{item1} matches {item2}")

# Print the matched items for list2
print("\nMatches for list2:")
for item2, item1 in matching_items_list2.items():
    print(f"{item2} matches {item1}")

Matches for list1:
beef, grass-fed, ground, raw matches beef bouillon powder
spices, pepper, black matches black pepper
onions, raw matches onion salt
spices, onion powder matches onion powder
spices, parsley, dried matches dry parsley flake
salt, table matches onion salt

Matches for list2:
beef bouillon powder matches beef, grass-fed, ground, raw
black pepper matches spices, pepper, black
onion salt matches salt, table
onion powder matches spices, onion powder
dry parsley flake matches spices, parsley, dried


In [46]:
# try matching based only over word overlap

ingredients1 = recipes_orig_w_nutr_info_dict[recipe_id]["ingredients"]
ingredients2tmp = recipes_extended_dict[recipe_id]["ingredients"]
ingredients2 = []
for ingr in ingredients2tmp:
    if isinstance(ingr, list):
        ingr = ingr[0]
    ingr = ingr.replace('_', ' ')
    ingredients2.append(ingr)

matching_items = {}

# Calculate all similarity scores and store the best match for each item in both lists
for item1 in ingredients1:
    best_match, best_score = None, 0

    for item2 in ingredients2:
        words = item2.split(" ")
        for word in words:
            if word in item1:
                pass
                #increase counter
        # add the match which the highest counter


SyntaxError: incomplete input (3765032402.py, line 23)