- calculate the other metric (use the util file)
- maybe calculate the third metric as well, but need to be able to map the ingredients to categories
---
- distance metrics:
  - cosine
  - euclidean
  - Hemming (?? because they suggested that in their paper but how does that make sense?)
- normalization
  - entire matrix
  - each ingredient
  - all subst for an ingredient
  - try to balance the overrepresented ingredients such as salt, butter, meal:
    - log before normalization
    - gumbel softmax activation
- maybe filter out over- and underrepresented ingredients
- What do we measure?
  - GT rank
  - select some recipe and see where we get better substitutions (like qualitatively?)

In [2]:
import os
import pickle
import pandas as pd

from calc_recipe_ingredient_info_distances import collectSomeRecipeRecommendations, get_all_comments, get_all_mutual_info, evalRecommendations, getRecommendationsBasedOnMutualInformationRole, get_graph_nodes, get_all_gt_recipes, get_recipes_per_ingredient, get_recipes_per_ingredient_pairs, get_all_frequencies,getNaiveBayesRecommendations

In [3]:
ORDERED_RECIPE_IDS_PATH = os.path.abspath("./outputs/sorted_recipe_ids_list.pkl")
TRAIN_COMMENTS_PATH = os.path.abspath("./inputs/train_comments_subs.pkl") # train recipes with substitutions
TEST_COMMENTS_PATH = os.path.abspath("./inputs/test_comments_subs.pkl") # test recipes with substitutions
VAL_COMMENTS_PATH = os.path.abspath("./inputs/val_comments_subs.pkl") # validation recipes with substitutions
GRAPH_NODES_PATH = os.path.abspath("./inputs/graph/nodes_191120.csv")

MUTUAL_INFO_DICT_PATH = os.path.abspath("./outputs/mutual_info_dict_with_self_info.pkl")
RECIPES_PER_INGREDIENT_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_small.pkl"
)
RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH = os.path.abspath(
    "./outputs/recipes_per_ingredient_pairs_small.pkl"
)
PROCESSED_RECIPES_PATH = os.path.abspath("./outputs/processed_recipes.pkl")

# load all the data

In [4]:

if os.path.isfile(ORDERED_RECIPE_IDS_PATH):
    with open(ORDERED_RECIPE_IDS_PATH, "rb") as file:
        ordered_recipe_ids = pickle.load(file)
recipe_ids_with_ranks = ordered_recipe_ids
ordered_recipe_ids = [recipe[1] for recipe in recipe_ids_with_ranks]

# recipes with comments, lists, names, and gt_truths
extended_recipes = get_all_comments(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH, VAL_COMMENTS_PATH)
recipes_extended_dict = {recipe["id"]:recipe for recipe in extended_recipes}


# recipe ingredient df one hot
# INGREDIENT_RECIPE_MATRIX_PATH = os.path.abspath("./outputs/ingredient_recipe_matrix.pkl")
# recipe_ingredient_df = get_recipe_ingredient_df( ingredients, recipes, recipes_per_ingredient, INGREDIENT_RECIPE_MATRIX_PATH)
# recipe_ingredient_df_bool = recipe_ingredient_df.astype(bool)

with open(MUTUAL_INFO_DICT_PATH, "rb") as mutual_info_dict_file:
            mutual_info_dict = pickle.load(mutual_info_dict_file)

ingredients = get_graph_nodes(GRAPH_NODES_PATH)

recipes = get_all_gt_recipes(TRAIN_COMMENTS_PATH, TEST_COMMENTS_PATH,
                                VAL_COMMENTS_PATH, PROCESSED_RECIPES_PATH)
recipes_per_ingredient = get_recipes_per_ingredient(
    ingredients, recipes, RECIPES_PER_INGREDIENT_SMALL_PATH)

recipes_per_ingredient_pairs = get_recipes_per_ingredient_pairs(
    recipes_per_ingredient, RECIPES_PER_INGREDIENT_PAIRS_SMALL_PATH)
recipe_ingredient_counts, recipe_ingredient_pair_counts = get_all_frequencies(recipes_per_ingredient,
                                        recipes_per_ingredient_pairs)


## Set some additional helper functions

In [87]:
def getRecipeFromComments(recipeId, allComments):
    recipe = None
    for commentRecipe in allComments:
        gt_ingredient_found = False
        comment_recipe_id = commentRecipe['id']
        if recipeId == comment_recipe_id:
            if recipe is None:
                recipe = {}
                recipe["id"] = comment_recipe_id
                recipe["subs"] = []
                recipe["ingredients"] = commentRecipe["ingredients"]

            if recipe is not None:
                recipe["subs"].append(commentRecipe["subs"])

    return recipe

In [98]:
def getRanksFromDirectRecommendations(source_ingredients, recommendations, extended_recipes, offset = -1):
    gt_ranks = []
    for recipe_id, recipe_recommendations in recommendations.items():
        recipe = getRecipeFromComments(recipe_id, extended_recipes)
        gt_subs = [sub[1] for sub in  recipe["subs"] if sub[0] == source_ingredients[recipe_id]] # get all valid substitutions for the source ingredient in the recipe
        recipe_recommendations = list(recipe_recommendations.keys())
        for i, ingredient in enumerate(recipe_recommendations):
            if ingredient in gt_subs:
                gt_ranks.append(i + offset) # to set off the top ranking self-recommendation
                gt_ingredient_found = True
                break
        if not gt_ingredient_found:
            gt_ranks.append(i)
    return gt_ranks

def getRanksFromRecommendationTuples(source_ingredients, recommendations_tuples, extended_recipes, offset = -1):
    # gt_ranks = []
    # for recipe_id, recommendations_tuples in recommendations_tuples.items():
    #     recipe = getRecipeFromComments(recipe_id, extended_recipes)
    #     gt_subs = [sub[1] for sub in  recipe["subs"]]
    #     recommendations = [tuple[0] for tuple in recommendations_tuples]
    #     for i, ingredient in enumerate(recommendations):
    #         if ingredient in gt_subs:
    #             gt_ranks.append(i-1) # to set off the top ranking self-recommendation
    #             continue
    gt_ranks = []
    for recipe_id, recipe_recommendations_tuples in recommendations_tuples.items():
        gt_ingredient_found = False
        recipe = getRecipeFromComments(recipe_id, extended_recipes)
        gt_subs = [sub[1] for sub in  recipe["subs"] if sub[0] == source_ingredients[recipe_id]] # get all valid substitutions for the source ingredient in the recipe
        recommendations = [tuple[0] for tuple in recipe_recommendations_tuples]
        for i, ingredient in enumerate(recommendations):
            if ingredient in gt_subs:
                gt_ranks.append(i + offset) # to set off the top ranking self-recommendation
                gt_ingredient_found = True
                break
        if not gt_ingredient_found:
            gt_ranks.append(i)
            
    return gt_ranks

def compare_recommendation_ranks(recommendation_ranks_1, recommendation_ranks_2, tolerance=10):
    rec_1_better = []
    rec_2_better = []
    both_equal = []
    
    for i in range(len(recommendation_ranks_1)):
        if recommendation_ranks_1[i] < recommendation_ranks_2[i] - tolerance:
            rec_1_better.append(i)
        elif recommendation_ranks_2[i] < recommendation_ranks_1[i] - tolerance:
            rec_2_better.append(i)
        else:
            both_equal.append(i)
    return rec_1_better, rec_2_better, both_equal

def get_gt_hits_at_k(k, recommendation_ranks):
    hits_at_k = []
    for i, rank in enumerate(recommendation_ranks):
        if rank <= k:
            hits_at_k.append(i)
    return hits_at_k


## Get Measure for examplary recipe

In [99]:
recipe_ids = ["a21302e305"]
recipe_id = recipe_ids[0]

recip = getRecipeFromComments(recipe_id, extended_recipes)

bayes_naive_recommendations = getNaiveBayesRecommendations(recipe_ids, extended_recipes, recipe_ingredient_counts, recipe_ingredient_pair_counts)

direct_source_mutual_info_recommendations, cosine_similarity_role_recommendations, euclidean_similarity_role_recommendations, manhatten_similarity_role_recommendations = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict)

_, cosine_similarity_role_recommendations_sq, euclidean_similarity_role_recommendations_sq, manhatten_similarity_role_recommendations_sq = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="dampen_square")

_, cosine_similarity_role_recommendations_minimax, euclidean_similarity_role_recommendations_minimax, manhatten_similarity_role_recommendations_minimax = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="minmax")

_, _2, euclidean_similarity_role_recommendations_ind_squared, manhatten_similarity_role_recommendations_ind_squared = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization=None, limiter_indiv_ingr_distance="squared")

print(f"gt_sub = ({recip['subs']})  direct_mi = {list(direct_source_mutual_info_recommendations[recipe_ids[0]].items())[:15]}")

print(cosine_similarity_role_recommendations[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations[recipe_ids[0]][:15])
print("===")
print(cosine_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print("===")
print(cosine_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print("===")
print(euclidean_similarity_role_recommendations_ind_squared[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_ind_squared[recipe_ids[0]][:15])

print("test done")

gt_sub = ([('all_purpose_flour', 'ground_almond'), ('slivered_almond', 'vanilla_extract')])  direct_mi = [('sesame_seed', 0.0005614077755832065), ('almond_extract', 0.0005543576357978351), ('dried_cranberry', 0.0005173229393879747), ('sunflower_seed', 0.0004646666895138482), ('green_onion', 0.000387135342628955), ('golden_raisin', 0.00036371327039129734), ('mayonnaise', 0.0003435368202518623), ('raisin', 0.00033235923284172415), ('mandarin_orange', 0.0003275299028412627), ('honey', 0.00028803808116346485), ('potato', 0.0002851576348480364), ('wheat_germ', 0.0002800274747300018), ('chicken_flavored_ramen_noodle', 0.0002798483709266614), ('couscous', 0.00027772200881617153), ('coleslaw', 0.00026976525704042274)]
[('slivered_almond', 0.9999999999999999), ('fresh_cherry', 0.9878396139150007), ('cookie_cutter', 0.9819123568814901), ('peach_pie_filling', 0.9802743916725406), ('cinnamon_roll', 0.9768896656617706), ('almond', 0.9727307419359852), ('non_dairy_milk_substitute', 0.970764482303189

In [100]:
source_ingredients = {recipe_id: recommendations[0][0] for recipe_id, recommendations in cosine_similarity_role_recommendations.items()}

gt_ranks_mi_direct = getRanksFromDirectRecommendations(source_ingredients, direct_source_mutual_info_recommendations, extended_recipes, 0)
gt_ranks_mi_cosine_role = getRanksFromRecommendationTuples(source_ingredients, cosine_similarity_role_recommendations, extended_recipes, -1)
gt_ranks_mi_euclid_role = getRanksFromRecommendationTuples(source_ingredients, euclidean_similarity_role_recommendations, extended_recipes, -1)
gt_ranks_mi_manhat_role = getRanksFromRecommendationTuples(source_ingredients, manhatten_similarity_role_recommendations, extended_recipes, -1)

direct_better_than_cosine, cosine_better_than_direct, cosine_equal_direct = compare_recommendation_ranks(gt_ranks_mi_direct, gt_ranks_mi_cosine_role, tolerance=15)
direct_better_than_euclid, euclid_better_than_direct, euclid_equal_direct = compare_recommendation_ranks(gt_ranks_mi_direct, gt_ranks_mi_euclid_role, tolerance=15)
direct_better_than_manhat, manhat_better_than_direct, manhat_equal_direct = compare_recommendation_ranks(gt_ranks_mi_direct, gt_ranks_mi_manhat_role, tolerance=15)

hits_at_15_direct = get_gt_hits_at_k(15, gt_ranks_mi_direct)
hits_at_15_cosine = get_gt_hits_at_k(15, gt_ranks_mi_cosine_role)
hits_at_15_euclid = get_gt_hits_at_k(15, gt_ranks_mi_euclid_role)
hits_at_15_manhat = get_gt_hits_at_k(15, gt_ranks_mi_manhat_role)


## Compare GT Ranks for several recipes

In [101]:
ingredients = get_graph_nodes(GRAPH_NODES_PATH)
recipe_ids = []
for i, recipe_id in enumerate(list(recipes_extended_dict.keys())):
    if i % 500 == 0:
        recipe = getRecipeFromComments(recipe_id, extended_recipes)
        recipe_ids.append(recipe_id)

In [102]:

direct_source_mutual_info_recommendations, cosine_similarity_role_recommendations, euclidean_similarity_role_recommendations, manhatten_similarity_role_recommendations = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict)

source_ingredients = {recipe_id: recommendations[0][0] for recipe_id, recommendations in cosine_similarity_role_recommendations.items()}

gt_ranks_direct_mi = []
gt_ranks_mi_cosine_role = []
gt_ranks_mi_euclid_role = []
gt_ranks_mi_manhat_role = []

# for recipe_id, recommendations in direct_source_mutual_info_recommendations.items():
#     recipe = getRecipeFromComments(recipe_id, extended_recipes)
#     gt_subs = [sub[1] for sub in  recipe["subs"]]
#     for i, ingredient in enumerate(list(recommendations.keys())):
#         if ingredient in gt_subs:
#             gt_ranks_direct_mi.append(i)
#             continue

gt_ranks_direct_mi = getRanksFromDirectRecommendations(source_ingredients, direct_source_mutual_info_recommendations, extended_recipes, 0)
gt_ranks_mi_cosine_role = getRanksFromRecommendationTuples(source_ingredients, cosine_similarity_role_recommendations, extended_recipes)
gt_ranks_mi_euclid_role = getRanksFromRecommendationTuples(source_ingredients, euclidean_similarity_role_recommendations, extended_recipes)
gt_ranks_mi_manhat_role = getRanksFromRecommendationTuples(source_ingredients, manhatten_similarity_role_recommendations, extended_recipes)

print(gt_ranks_direct_mi)
print(gt_ranks_mi_cosine_role)
print(gt_ranks_mi_euclid_role)
print(gt_ranks_mi_manhat_role)



[143, 8, 4279, 183, 1911, 57, 10, 1849, 12, 2293, 1081, 206, 142, 869, 17, 425, 3546, 3686, 267, 50, 1206, 1581, 14, 1376, 2, 66, 1030, 872, 736, 73, 186, 167, 1582, 41, 3248, 449, 425, 142, 1353, 101, 405, 7, 6, 563, 3129, 4015, 239, 1140, 5, 84, 39, 787, 1581, 1944, 358, 2309, 412, 239, 174, 37, 996, 661, 813, 109, 5523, 65, 5197, 2150, 186, 174, 65, 1656, 1042, 22, 6005, 1408, 998, 295, 475, 317, 2129, 186, 27, 1384, 169]
[32, 2583, 25, 4980, 75, 2730, 4637, 4301, 160, 11, 514, 43, 788, 1182, 4993, 2675, 1499, 5429, 4263, 416, 688, 1711, 5271, 787, 3144, 1377, 1146, 6652, 164, 5802, 53, 337, 139, 4732, 6652, 5194, 4952, 3286, 371, 606, 950, 2134, 4233, 2874, 3757, 2382, 4036, 158, 5533, 730, 65, 5807, 172, 5731, 85, 5448, 4518, 39, 5221, 3340, 5384, 3005, 161, 5585, 1805, 3897, 2852, 2288, 4490, 4971, 229, 6652, 6, 6652, 1728, 5230, 1740, 2262, 6005, 3258, 3205, 121, 1131, 4819, 17, 121, 3421, 317, 3457]
[261, 6444, 54, 6381, 2236, 33, 6494, 1220, 12, 166, 26, 373, 5, 84, 6620, 130,

In [103]:
_, cosine_similarity_role_recommendations_sq_damp, euclidean_similarity_role_recommendations_sq_damp, manhatten_similarity_role_recommendations_sq_damp = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="dampen_square")

_, cosine_similarity_role_recommendations_minimax, euclidean_similarity_role_recommendations_minimax, manhatten_similarity_role_recommendations_minimax = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="minmax")

gt_ranks_mi_cosine_role_sq_damp = getRanksFromRecommendationTuples(source_ingredients, cosine_similarity_role_recommendations_sq_damp, extended_recipes)
gt_ranks_mi_euclid_role_sq_damp = getRanksFromRecommendationTuples(source_ingredients, euclidean_similarity_role_recommendations_sq_damp, extended_recipes)
gt_ranks_mi_manhat_role_sq_damp = getRanksFromRecommendationTuples(source_ingredients, manhatten_similarity_role_recommendations_sq_damp, extended_recipes)
gt_ranks_mi_cosine_role_minimax = getRanksFromRecommendationTuples(source_ingredients, cosine_similarity_role_recommendations_minimax, extended_recipes)
gt_ranks_mi_euclid_role_minimax = getRanksFromRecommendationTuples(source_ingredients, euclidean_similarity_role_recommendations_minimax, extended_recipes)
gt_ranks_mi_manhat_role_minimax = getRanksFromRecommendationTuples(source_ingredients, manhatten_similarity_role_recommendations_minimax, extended_recipes)


print(gt_ranks_mi_cosine_role_sq_damp)
print(gt_ranks_mi_euclid_role_sq_damp)
print(gt_ranks_mi_manhat_role_sq_damp)
print("===")
print(gt_ranks_mi_cosine_role_minimax)
print(gt_ranks_mi_euclid_role_minimax)
print(gt_ranks_mi_manhat_role_minimax)


[56, 2588, 663, 4808, 611, 3348, 4283, 4817, 61, 60, 527, 203, 1562, 1832, 1142, 3369, 2505, 5479, 4268, 160, 765, 1167, 5317, 888, 3185, 1325, 1252, 6652, 206, 5803, 52, 1352, 131, 5203, 6652, 5114, 4989, 3531, 369, 2120, 966, 1317, 4837, 2749, 1806, 1612, 4121, 157, 5440, 1161, 65, 5812, 45, 5756, 74, 5528, 4295, 28, 5260, 3287, 5529, 2766, 132, 5727, 1895, 3830, 2626, 1960, 3176, 4985, 275, 6652, 19, 6652, 2174, 5234, 2889, 2207, 6005, 1007, 3005, 87, 1068, 4591, 72, 519, 3460, 112, 3348]
[255, 6413, 128, 6338, 2332, 48, 6456, 1544, 4, 202, 31, 394, 4, 99, 6613, 181, 4613, 3419, 363, 6615, 6635, 37, 6244, 80, 6628, 6589, 6543, 6652, 6629, 6584, 26, 6623, 6647, 5126, 6652, 6490, 5609, 509, 23, 6, 664, 6637, 6609, 21, 6443, 2231, 3294, 3307, 6458, 301, 4, 6631, 6477, 6436, 0, 2871, 6360, 56, 6289, 6381, 264, 6637, 29, 6566, 6608, 6581, 3001, 491, 3967, 6223, 13, 6652, 6262, 6652, 6368, 2597, 50, 6615, 1981, 2324, 2819, 6477, 6479, 6380, 19, 5, 6635, 2386, 6549]
[664, 6445, 65, 6361, 1

In [104]:
print(gt_ranks_direct_mi)
print("---")
print(gt_ranks_mi_cosine_role)
print(gt_ranks_mi_cosine_role_sq_damp)
print(gt_ranks_mi_cosine_role_minimax)
print("---")
print(gt_ranks_mi_euclid_role)
print(gt_ranks_mi_euclid_role_sq_damp)
print(gt_ranks_mi_euclid_role_minimax)
print("---")
print(gt_ranks_mi_manhat_role)
print(gt_ranks_mi_manhat_role_sq_damp)
print(gt_ranks_mi_manhat_role_minimax)

[143, 8, 4279, 183, 1911, 57, 10, 1849, 12, 2293, 1081, 206, 142, 869, 17, 425, 3546, 3686, 267, 50, 1206, 1581, 14, 1376, 2, 66, 1030, 872, 736, 73, 186, 167, 1582, 41, 3248, 449, 425, 142, 1353, 101, 405, 7, 6, 563, 3129, 4015, 239, 1140, 5, 84, 39, 787, 1581, 1944, 358, 2309, 412, 239, 174, 37, 996, 661, 813, 109, 5523, 65, 5197, 2150, 186, 174, 65, 1656, 1042, 22, 6005, 1408, 998, 295, 475, 317, 2129, 186, 27, 1384, 169]
---
[32, 2583, 25, 4980, 75, 2730, 4637, 4301, 160, 11, 514, 43, 788, 1182, 4993, 2675, 1499, 5429, 4263, 416, 688, 1711, 5271, 787, 3144, 1377, 1146, 6652, 164, 5802, 53, 337, 139, 4732, 6652, 5194, 4952, 3286, 371, 606, 950, 2134, 4233, 2874, 3757, 2382, 4036, 158, 5533, 730, 65, 5807, 172, 5731, 85, 5448, 4518, 39, 5221, 3340, 5384, 3005, 161, 5585, 1805, 3897, 2852, 2288, 4490, 4971, 229, 6652, 6, 6652, 1728, 5230, 1740, 2262, 6005, 3258, 3205, 121, 1131, 4819, 17, 121, 3421, 317, 3457]
[56, 2588, 663, 4808, 611, 3348, 4283, 4817, 61, 60, 527, 203, 1562, 1832, 

### Compare ranks

- tolerance = 15 as we also use hits@15
- seems like for all metrics, the direct mutual information puts the GT substitutions at a higher rank
- otherwise euclidean distance between the recipe roles seems to do best, but has some really bad gt placements

In [105]:
direct_better_than_cosine, cosine_better_than_direct, same_direct_cosine = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_cosine_role, tolerance=15)
direct_better_than_euclid, euclid_better_than_direct, same_direct_euclid = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_euclid_role, tolerance=15)
direct_better_than_manhat, manhat_better_than_direct, same_direct_manhat = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_manhat_role, tolerance=15)

## square dampened
direct_better_than_cosine_sq_damp, cosine_sq_damp_better_than_direct, same_direct_cosine_sq_damp = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_cosine_role_sq_damp, tolerance=15)
direct_better_than_euclid_sq_damp, euclid_sq_damp_better_than_direct, same_direct_euclid_sq_damp = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_euclid_role_sq_damp, tolerance=15)
direct_better_than_manhat_sq_damp, manhat_sq_damp_better_than_direct, same_direct_manhat_sq_damp = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_manhat_role_sq_damp, tolerance=15)

# minimax normalized
direct_better_than_cosine_minimax, cosine_minimax_better_than_direct, same_direct_cosine_minimax = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_cosine_role_minimax, tolerance=15)
direct_better_than_euclid_minimax, euclid_minimax_better_than_direct, same_direct_euclid_minimax = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_euclid_role_minimax, tolerance=15)
direct_better_than_manhat_minimax, manhat_minimax_better_than_direct, same_direct_manhat_minimax = compare_recommendation_ranks(gt_ranks_direct_mi, gt_ranks_mi_manhat_role_minimax, tolerance=15)

print("method to compare to dircet ranks || nr times direct better || nr times other method better || nr times roughly equal")
print(f"cosine || {len(direct_better_than_cosine)} || {len(cosine_better_than_direct)} || {len(same_direct_cosine)}")
print(f"cosine_sq_damp || {len(direct_better_than_cosine_sq_damp)} || {len(cosine_sq_damp_better_than_direct)} || {len(same_direct_cosine_sq_damp)}")
print(f"cosine_minimax || {len(direct_better_than_cosine_minimax)} || {len(cosine_minimax_better_than_direct)} || {len(same_direct_cosine_minimax)}")
print(f"euclid || {len(direct_better_than_euclid)} || {len(euclid_better_than_direct)} || {len(same_direct_euclid)}")
print(f"euclid_sq_damp || {len(direct_better_than_euclid_sq_damp)} || {len(euclid_sq_damp_better_than_direct)} || {len(same_direct_euclid_sq_damp)}")
print(f"euclid_minimax || {len(direct_better_than_euclid_minimax)} || {len(euclid_minimax_better_than_direct)} || {len(same_direct_euclid_minimax)}")
print(f"manhatten || {len(direct_better_than_manhat)} || {len(manhat_better_than_direct)} || {len(same_direct_manhat)}")
print(f"manhatten_sq_damp || {len(direct_better_than_manhat_sq_damp)} || {len(manhat_sq_damp_better_than_direct)} || {len(same_direct_manhat_sq_damp)}")
print(f"manhatten_minimax || {len(direct_better_than_manhat_minimax)} || {len(manhat_minimax_better_than_direct)} || {len(same_direct_manhat_minimax)}")

method to compare to dircet ranks || nr times direct better || nr times other method better || nr times roughly equal
cosine || 60 || 25 || 0
cosine_sq_damp || 57 || 27 || 1
cosine_minimax || 60 || 25 || 0
euclid || 55 || 29 || 1
euclid_sq_damp || 58 || 25 || 2
euclid_minimax || 55 || 29 || 1
manhatten || 57 || 26 || 2
manhatten_sq_damp || 61 || 23 || 1
manhatten_minimax || 57 || 26 || 2


In [106]:
rank_differences_direct_cosine = []
for direct_rank, role_rank in zip(gt_ranks_direct_mi, gt_ranks_mi_cosine_role):
    rank_differences_direct_cosine.append(direct_rank - role_rank)
print(rank_differences_direct_cosine)
print(sum(rank_differences_direct_cosine) / len(rank_differences_direct_cosine))

rank_differences_direct_euclid_minimax = []
for direct_rank, role_rank in zip(gt_ranks_direct_mi, gt_ranks_mi_euclid_role_minimax):
    rank_differences_direct_euclid_minimax.append(direct_rank - role_rank)
print(rank_differences_direct_euclid_minimax)
print(sum(rank_differences_direct_euclid_minimax) / len(rank_differences_direct_euclid_minimax))

rank_differences_direct_manhat_minimax = []
for direct_rank, role_rank in zip(gt_ranks_direct_mi, gt_ranks_mi_manhat_role_minimax):
    rank_differences_direct_manhat_minimax.append(direct_rank - role_rank)
print(rank_differences_direct_manhat_minimax)
print(sum(rank_differences_direct_manhat_minimax) / len(rank_differences_direct_manhat_minimax))

# average rank is worse for euclid than for cosine though

[111, -2575, 4254, -4797, 1836, -2673, -4627, -2452, -148, 2282, 567, 163, -646, -313, -4976, -2250, 2047, -1743, -3996, -366, 518, -130, -5257, 589, -3142, -1311, -116, -5780, 572, -5729, 133, -170, 1443, -4691, -3404, -4745, -4527, -3144, 982, -505, -545, -2127, -4227, -2311, -628, 1633, -3797, 982, -5528, -646, -26, -5020, 1409, -3787, 273, -3139, -4106, 200, -5047, -3303, -4388, -2344, 652, -5476, 3718, -3832, 2345, -138, -4304, -4797, -164, -4996, 1036, -6630, 4277, -3822, -742, -1967, -5530, -2941, -1076, 65, -1104, -3435, 152]
-1692.9058823529413
[-118, -6436, 4225, -6198, -325, 24, -6484, 629, 0, 2127, 1055, -167, 137, 785, -6603, 295, -1444, 883, -116, -6573, -5430, 1496, -6239, 1287, -6633, -6525, -5502, -5780, -5891, -6525, 154, 137, -5064, -2138, -3404, -6042, -5094, -316, 1342, 94, -293, -6630, -5769, 530, -3316, 1885, -3143, -1928, -6437, -112, 36, -5845, 384, -4489, 358, -270, -2500, 187, -2446, -6324, 830, -5978, 804, -6468, -1083, -6517, 1591, 1742, -4049, -6056, 57, -

### get hits@15

In [107]:
hits_at_15_direct = get_gt_hits_at_k(15, gt_ranks_direct_mi)
hits_at_15_cosine = get_gt_hits_at_k(15, gt_ranks_mi_cosine_role)
hits_at_15_euclid = get_gt_hits_at_k(15, gt_ranks_mi_euclid_role)
hits_at_15_manhat = get_gt_hits_at_k(15, gt_ranks_mi_manhat_role)
hits_at_15_cos_sq_damp = get_gt_hits_at_k(15, gt_ranks_mi_cosine_role_sq_damp)
hits_at_15_cos_minimax = get_gt_hits_at_k(15, gt_ranks_mi_cosine_role_minimax)
hits_at_15_euclid_sq_damp = get_gt_hits_at_k(15, gt_ranks_mi_euclid_role_sq_damp)
hits_at_15_euclid_minimax = get_gt_hits_at_k(15, gt_ranks_mi_euclid_role_minimax)
hits_at_15_manhat_sq_damp = get_gt_hits_at_k(15, gt_ranks_mi_manhat_role_sq_damp)
hits_at_15_manhat_minimax = get_gt_hits_at_k(15, gt_ranks_mi_manhat_role_minimax)

print(hits_at_15_direct)
print("===")
print(hits_at_15_cosine)
print(hits_at_15_cos_sq_damp)
print(hits_at_15_cos_minimax)
print("===")
print(hits_at_15_euclid)
print(hits_at_15_euclid_sq_damp)
print(hits_at_15_euclid_minimax)
print("===")
print(hits_at_15_manhat)
print(hits_at_15_manhat_sq_damp)
print(hits_at_15_manhat_minimax)


[1, 6, 8, 22, 24, 41, 42, 48]
===
[9, 72]
[]
[9, 72]
===
[8, 12, 38, 39, 50, 54, 62, 70, 84, 85]
[8, 12, 39, 50, 54, 70, 85]
[8, 12, 38, 39, 50, 54, 62, 70, 84, 85]
===
[12, 39, 54, 62, 70, 76, 84, 85]
[8, 12, 39, 54, 70, 76, 84, 85]
[12, 39, 54, 62, 70, 76, 84, 85]


In [None]:
# safe all the things
all_recommendations = {
    "direct_source_mutual_info_recommendations": direct_source_mutual_info_recommendations,
    "cosine_similarity_role_recommendations": cosine_similarity_role_recommendations,
    "euclidean_similarity_role_recommendations": euclidean_similarity_role_recommendations,
    "manhatten_similarity_role_recommendations": manhatten_similarity_role_recommendations,
    "cosine_similarity_role_recommendations_sq_damp": cosine_similarity_role_recommendations_sq_damp,
    "euclidean_similarity_role_recommendations_sq_damp": euclidean_similarity_role_recommendations_sq_damp,
    "manhatten_similarity_role_recommendations_sq_damp": manhatten_similarity_role_recommendations_sq_damp,
    "cosine_similarity_role_recommendations_minimax": cosine_similarity_role_recommendations_minimax,
    "euclidean_similarity_role_recommendations_minimax": euclidean_similarity_role_recommendations_minimax,
    "manhatten_similarity_role_recommendations_minimax": manhatten_similarity_role_recommendations_minimax
}
all_recommendations_path = os.path.abspath("./outputs/eval_all_alt_recommendations_every_1th.pkl")
with open(all_recommendations_path, "wb") as file:
    pickle.dump(all_recommendations, file)



In [None]:
# make some df to get a nice view over recommendations

short_col_names = {
    "direct_source_mutual_info_recommendations": "direct",
    "cosine_similarity_role_recommendations": "cosine",
    "euclidean_similarity_role_recommendations": "euclidean",
    "manhatten_similarity_role_recommendations": "manhatten",
    "cosine_similarity_role_recommendations_sq_damp": "cosine_sq_damp",
    "euclidean_similarity_role_recommendations_sq_damp": "euclidean_sq_damp",
    "manhatten_similarity_role_recommendations_sq_damp": "manhatten_sq_damp",
    "cosine_similarity_role_recommendations_minimax": "cosine_minimax",
    "euclidean_similarity_role_recommendations_minimax": "euclidean_minimax",
    "manhatten_similarity_role_recommendations_minimax": "manhatten_minimax"
}

recipe_id = recipe_ids[6]

transposed_data = []
column_names = []
for recommendation_type, recommendations in list(all_recommendations.items()):
    # if recommendation_type == "direct_source_mutual_info_recommendations":
    #     continue
    if recommendation_type == "direct_source_mutual_info_recommendations":
        data = list(recommendations[recipe_id].keys())
    else:
        data = [rec[0] for rec in recommendations[recipe_id]]
        
    transposed_data.append(data)
    column_names.append(short_col_names[recommendation_type])
    
all_recipe_recs_df = pd.DataFrame(data=transposed_data).T
all_recipe_recs_df.columns = column_names

recipe = getRecipeFromComments(recipe_id, extended_recipes)
print(f"recipe ingredients {recipe['ingredients']}")
print(f"substitutions: {recipe['subs']}")
print(all_recipe_recs_df.head(15))

recipe ingredients [['brown_sugar'], ['powdered_sugar'], ['vanilla_extract', 'butterscotch_extract'], ['low_fat_cream_cheese', 'reduced_fat_cream_cheese'], ['toffee_piece', 'toffee_pieces'], ['pineapple_juice'], ['red_delicious_apple', 'red_delicious_apples'], ['granny_smith_apple', 'granny_smith_apples']]
substitutions: [('low_fat_cream_cheese', 'neufchatel_cheese'), ('pineapple_juice', 'orange_juice')]
                            direct                                 cosine  \
0                  pineapple_chunk                        pineapple_juice   
1                              ice                                 raisin   
2                        soy_sauce                                 peanut   
3                        grenadine                             applesauce   
4                      coconut_rum                                    oat   
5                        pineapple                             rolled_oat   
6                  pineapple_slice  ghirardelli_semi_

In [13]:
# num_cosine_better = 0
# num_euclid_better = 0
# num_manhat_better = 0
# for i in range(len(gt_ranks_direct_mi)):
#     if (gt_ranks_direct_mi[i] > gt_ranks_mi_cosine_role[i]):
#         num_cosine_better += 1
#     if (gt_ranks_direct_mi[i] > gt_ranks_mi_euclid_role[i]):
#         num_euclid_better += 1
#     if (gt_ranks_direct_mi[i] > gt_ranks_mi_manhat_role[i]):
#         num_manhat_better += 1
        
# print(f"total predictions: {len(gt_ranks_direct_mi)}\nnum cosine better: {num_cosine_better}\nnum euclid better: {num_euclid_better}\nnum manhat better: {num_manhat_better}")

total predictions: 10
num cosine better: 6
num euclid better: 7
num manhat better: 4


### Check how frequent the gt sources of the recipes and the other recipe ingredients appear in the dataset

In [36]:
source_ingredients = source_ingredients # all source ingredients that were used to find recommendations
used_recipes = []

for recipe_id in recipe_ids:
    used_recipes.append(getRecipeFromComments(recipe_id, extended_recipes))

SyntaxError: incomplete input (4138481159.py, line 4)

In [128]:
indices_for_rank_worse_than_4k = [i for i, value in enumerate(gt_ranks_mi_euclid_role) if value > 4000]
indices_for_rank_worse_than_4k

recipe_ids_for_rank_worse_than_4k = [recipe_ids[i] for i in indices_for_rank_worse_than_4k]
recipe_ids_for_rank_worse_than_4k

for recipe_id in recipe_ids_for_rank_worse_than_4k:
    print(f"recipe id: {recipe_id} source ingredient frequency: {source_ingredients[recipe_id]}: {len(recipes_per_ingredient[source_ingredients[recipe_id]])}")
    recipe_ingredients = getRecipeFromComments(recipe_id, extended_recipes)["ingredients"]
    ingredient_frequencies = {}
    for recipe_ingredient in recipe_ingredients:
        for ingredient_variant in recipe_ingredient:
            if ingredient_variant in list(recipes_per_ingredient.keys()):
                ingredient_frequencies[ingredient_variant] = len(recipes_per_ingredient[ingredient_variant])
                break
    print(f"Recipe ingredient frequencies: {ingredient_frequencies}")
    
    print("---")

recipe id: 0609ee46a6 source ingredient frequency: cold_water: 476
Recipe ingredient frequencies: {'bisquick_baking_mix': 54, 'cold_water': 476, 'sharp_cheddar_cheese': 512, 'butter': 11274, 'dry_parsley_flake': 964, 'garlic_powder': 2392, 'italian_seasoning': 458}
---
recipe id: 11911fb3d0 source ingredient frequency: maraschino_cherry: 132
Recipe ingredient frequencies: {'all_purpose_flour': 4061, 'baking_soda': 4119, 'butter': 11274, 'white_sugar': 806, 'brown_sugar': 4141, 'instant_pistachio_pudding_mix': 15, 'egg': 11024, 'almond_extract': 425, 'green_food_coloring': 48, 'semisweet_chocolate_chip': 1, 'maraschino_cherry': 132, 'pistachio': 57}
---
recipe id: 225750a2c3 source ingredient frequency: pineapple_juice: 209
Recipe ingredient frequencies: {'brown_sugar': 4141, 'powdered_sugar': 930, 'vanilla_extract': 2135, 'low_fat_cream_cheese': 66, 'toffee_piece': 27, 'pineapple_juice': 209, 'red_delicious_apple': 25, 'granny_smith_apple': 198}
---
recipe id: 5209b68235 source ingredi

In [129]:
indices_for_rank_better_than_250 = [i for i, value in enumerate(gt_ranks_mi_euclid_role) if value < 250]
indices_for_rank_better_than_250

recipe_ids_for_rank_better_than_250 = [recipe_ids[i] for i in indices_for_rank_better_than_250]
recipe_ids_for_rank_better_than_250

for recipe_id in recipe_ids_for_rank_better_than_250:
    print(f"recipe id: {recipe_id} source ingredient frequency: {source_ingredients[recipe_id]}: {len(recipes_per_ingredient[source_ingredients[recipe_id]])}")
    recipe_ingredients = getRecipeFromComments(recipe_id, extended_recipes)["ingredients"]
    ingredient_frequencies = {}
    for recipe_ingredient in recipe_ingredients:
        for ingredient_variant in recipe_ingredient:
            if ingredient_variant in list(recipes_per_ingredient.keys()):
                ingredient_frequencies[ingredient_variant] = len(recipes_per_ingredient[ingredient_variant])
                break
    print(f"Recipe ingredient frequencies: {ingredient_frequencies}")
    
    print("---")

recipe id: 0bdfe50be7 source ingredient frequency: feta_cheese: 492
Recipe ingredient frequencies: {'vegetable_oil': 3413, 'warm_water': 356, 'salt': 18957, 'flour': 6114, 'egg_yolk': 559, 'parmesan_cheese': 3244, 'eggplant': 252, 'feta_cheese': 492, 'salt_and_pepper': 3234}
---
recipe id: 1cbef47545 source ingredient frequency: sugar: 9198
Recipe ingredient frequencies: {'low_fat_graham_cracker': 2, 'smart_balance_butter_spread': 10, 'water': 7849, 'cold_water': 476, 'unflavored_gelatin': 50, 'sugar': 9198, 'salt': 18957, 'all_purpose_flour': 4061, 'egg_yolk': 559, 'nonfat_milk': 192, 'vanilla_extract': 2135, 'banana_liqueur': 9, 'dark_rum': 107, 'banana': 1120}
---
recipe id: 2e4b665146 source ingredient frequency: cherry_tomato: 279
Recipe ingredient frequencies: {'green_bean': 406, 'feta_cheese': 492, 'olive_oil': 6240, 'garlic_clove': 8942, 'kalamata_olive': 191, 'cherry_tomato': 279, 'fresh_lemon_juice': 953, 'fresh_oregano': 153, 'salt_and_pepper': 3234}
---
recipe id: 3448ed4a5

## Some more dataset verifications

In [None]:

# ingredients = get_graph_nodes(GRAPH_NODES_PATH)
ingredients = list(mutual_info_dict.keys())

forgotten_ingredients = []
not_forgotten_ingredients = []
for recipe in extended_recipes:
    for ingredient_list in recipe["ingredients"]:
        ingredient = ingredient_list[0]
        # if ingredient in ingredients:
        #     print("y")
        # else:
        #     print("n")
        if (ingredient not in ingredients) and (ingredient not in forgotten_ingredients):
            forgotten_ingredients.append(ingredient)
        if (ingredient in ingredients) and (ingredient not in not_forgotten_ingredients):
            not_forgotten_ingredients.append(ingredient)
            
print(f"forgotten ingredients: {forgotten_ingredients}")


forgotten ingredients: []


In [33]:
forgotten_inner_ingredients = []
all_ingredients = get_graph_nodes(GRAPH_NODES_PATH)
for ingredient in all_ingredients:
    if ingredient not in list(list(mutual_info_dict.items())[0][1].keys()):
        forgotten_inner_ingredients.append(ingredient)

# a = "dried_parsley" in list(list(mutual_info_dict.items())[0][1].keys())
# # print(list(list(mutual_info_dict.items())[0][1].keys()))
# print(a)
print(forgotten_inner_ingredients)
print("dried_parsley" in all_ingredients)

[]
False


### some of the source ingredients are not found in the nodes list ?

In [38]:
all_gt_sources = [recipe["subs"][0] for recipe in extended_recipes]
# print(all_gt_sources)
# print("dried_parsley" in all_gt_sources)

gt_sources_which_are_not_nodes = []
for gt_source in all_gt_sources:
    if gt_source not in all_ingredients:
        gt_sources_which_are_not_nodes.append(gt_source)
        
print(gt_sources_which_are_not_nodes)
print(len(gt_sources_which_are_not_nodes))
print(len(all_gt_sources))


['frozen_chopped_broccoli', 'chicken_broth_with_roasted_garlic', 'panini_bread', 'instant_chocolate_fudge_pudding', 'vegetable_salt', 'cream_filled_chocolate_sandwich_cookies', 'lemon_infused_olive_oil', 'dried_parsley', 'artificial_sweetener', 'kaiser_rolls', 'turkey_bacon', 'fat_free_sugar_free_instant_chocolate_pudding_mix', 'vanilla_butternut_flavoring', 'butterscotch_extract', 'dried_rosemary', 'dried_parsley', 'full_cream_milk', 'whiskey', 'sugar_free_fat_free_butterscotch_pudding', 'ground_red_pepper', 'frozen_chopped_broccoli', 'mrs._dash_tomato_basil_garlic_seasoning', 'bread_enhancer', 'sugar_free_lime_gelatin', 'low_fat_mayonnaise', 'dry_crushed_red_pepper', 'chocolate_peanut_butter', 'garlic_powder_with_parsley', 'dried_rosemary', 'dried_rosemary', 'salt_free_garlic_powder', 'chives', 'light_sour_cream', 'cheese_sauce_mix', 'butterscotch_extract', 'black_bean_salsa', 'ground_sirloin', 'butterscotch_extract', 'ground_red_pepper', 'white_flour', 'farfel', 'fresh_mushrooms', '

In [11]:
# get some meat
ingo_ids = []
for recipe_id, recipe in recipes_extended_dict.items():
    if recipe["subs"][0] == "ground_beef":
        ingo_ids.append(recipe_id)
    
print(ingo_ids)

['00b2b2edbd', '0177c1bb48', '0271aa0503', '036bdeca88', '04911b2bed', '07122400ca', '08773418c4', '08c66bdc32', '0a8fac8be4', '0c1ae64700', '0e03cf19a1', '0e2c46410e', '0f4dd0863e', '11098cc0d3', '1181fe4911', '124b93847e', '14532143af', '18928fe2cd', '18c95e9bfc', '191c232acf', '19222783ad', '1a7d728eeb', '1b4d3440a8', '1d1fadfcf2', '1fef38437a', '22d85701e0', '24ebf71910', '2606a45b70', '27e5c0162f', '27f3cd87b0', '28cf00b64a', '28dca771e9', '2b33877066', '2bef80c126', '2c3c533343', '2c65cf040b', '2da5b27ec7', '2ee15e022a', '3081b3b287', '30ee1343c5', '312be3dd62', '35cc01fce2', '3686367e00', '36a3fd1c9e', '373fdfbb8b', '37c196e828', '3a30436622', '3c663079fd', '3cdd09657a', '3d1ced3465', '3e686bbea6', '40e80d2208', '411cf7f936', '41622e9c80', '41c1dd6b32', '4286ba8bbb', '431c06de39', '4763278509', '496a95edd6', '4a6de23968', '4b2d936ea1', '4b656594ae', '4b9fbc8c46', '4e3efb8986', '4e4250b43e', '505d036de1', '55033cea2d', '55356264dc', '55378c4e31', '56425be165', '56be0aee75', '5814

## Some more qualitative recommendation evaluations

### Some meaty recipe

In [135]:
recipe_ids = ["00b2b2edbd"]

recip = getRecipeFromComments(recipe_ids[0], extended_recipes)

bayes_naive_recommendations = getNaiveBayesRecommendations(recipe_ids, extended_recipes, recipe_ingredient_counts, recipe_ingredient_pair_counts)

direct_source_mutual_info_recommendations, cosine_similarity_role_recommendations, euclidean_similarity_role_recommendations, manhatten_similarity_role_recommendations = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict)

_, cosine_similarity_role_recommendations_sq, euclidean_similarity_role_recommendations_sq, manhatten_similarity_role_recommendations_sq = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="dampen_square")

_, cosine_similarity_role_recommendations_minimax, euclidean_similarity_role_recommendations_minimax, manhatten_similarity_role_recommendations_minimax = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization="minmax")

_, _2, euclidean_similarity_role_recommendations_ind_squared, manhatten_similarity_role_recommendations_ind_squared = getRecommendationsBasedOnMutualInformationRole(recipe_ids, extended_recipes, mutual_info_dict, normalization=None, limiter_indiv_ingr_distance="factor")

print(f"gt_sub = ({recip['subs']})  ")
print(f"recipe_ingredients: {recip['ingredients']}")
print("===")
print(f"direct_mi = {list(direct_source_mutual_info_recommendations[recipe_ids[0]].items())[:15]}")
print("===")
print(cosine_similarity_role_recommendations[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations[recipe_ids[0]][:15])
print("===")
print(cosine_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_sq[recipe_ids[0]][:15])
print("===")
print(cosine_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print(euclidean_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_minimax[recipe_ids[0]][:15])
print("===")
print(euclidean_similarity_role_recommendations_ind_squared[recipe_ids[0]][:15])
print(manhatten_similarity_role_recommendations_ind_squared[recipe_ids[0]][:15])


gt_sub = ([('oregano', 'seasoning'), ('ground_beef', 'ground_turkey'), ('ground_beef', 'ground_sausage'), ('ground_beef', 'parmesan_cheese'), ('mild_cheddar_cheese', 'italian_seasoning'), ('ground_beef', 'turkey_sausage'), ('ground_beef', 'turkey')])  
recipe_ingredients: [['thin_spaghetti'], ['ground_beef'], ['garlic', 'garlic_sprouts'], ['spaghetti_sauce', 'garden_vegetable_spaghetti_sauce'], ['mild_cheddar_cheese', 'caerphilly_cheese'], ['oregano']]
===
direct_mi = [('onion', 0.014479357808449611), ('tomato_sauce', 0.008368351296412911), ('pepper', 0.004044852631508548), ('ketchup', 0.003928400517292704), ('green_pepper', 0.0037516395648175514), ('baking_powder', 0.0033971881356922454), ('taco_seasoning', 0.003339181988187582), ('chili_powder', 0.0033094928231676216), ('worcestershire_sauce', 0.003274712699829152), ('baking_soda', 0.0031863102241034783), ('mozzarella_cheese', 0.003107582660396964), ('shredded_cheddar_cheese', 0.0028600424422341383), ('vanilla', 0.0026285105281725132