In [2]:
import pandas as pd 
from itertools import combinations
from collections import Counter
import numpy as np
import model_functions as mf

In [3]:
df = pd.read_pickle("../data/clean_food.pkl")
df.head()



Unnamed: 0,id,ingredients
0,10259,"[romaine lettuce, black olives, grape, black p..."
1,25693,"[black pepper, black pepper, thyme, egg, yello..."
2,20130,"[egg, black pepper, mayonnaise, green chili pe..."
4,13162,"[black pepper, shallot, cayenne pepper, onion,..."
5,6602,"[egg, ginger root, cinnamon, milk, vanilla, gi..."


In [4]:
df['pairs'] = df['ingredients'].apply(lambda x: list(combinations(x,2)))
df.head()

Unnamed: 0,id,ingredients,pairs
0,10259,"[romaine lettuce, black olives, grape, black p...","[(romaine lettuce, black olives), (romaine let..."
1,25693,"[black pepper, black pepper, thyme, egg, yello...","[(black pepper, black pepper), (black pepper, ..."
2,20130,"[egg, black pepper, mayonnaise, green chili pe...","[(egg, black pepper), (egg, mayonnaise), (egg,..."
4,13162,"[black pepper, shallot, cayenne pepper, onion,...","[(black pepper, shallot), (black pepper, cayen..."
5,6602,"[egg, ginger root, cinnamon, milk, vanilla, gi...","[(egg, ginger root), (egg, cinnamon), (egg, mi..."


In [5]:
pair_counts = Counter()
for pairs_list in df['pairs']:
    pair_counts.update(pairs_list)
pair_counts

Counter({('onion', 'black pepper'): 286738,
         ('egg', 'vanilla'): 255651,
         ('olives', 'black pepper'): 203950,
         ('cloves', 'black pepper'): 189303,
         ('onion', 'cloves'): 160204,
         ('egg', 'milk'): 150548,
         ('olives', 'cloves'): 131420,
         ('egg', 'black pepper'): 108875,
         ('milk', 'egg'): 94542,
         ('olives', 'onion'): 91720,
         ('milk', 'vanilla'): 89761,
         ('egg', 'cinnamon'): 87940,
         ('cloves', 'olives'): 82094,
         ('onion', 'parsley'): 81385,
         ('black pepper', 'onion'): 76801,
         ('black pepper', 'olives'): 76590,
         ('beef', 'onion'): 75375,
         ('onion', 'egg'): 75118,
         ('parsley', 'black pepper'): 71837,
         ('chicken', 'black pepper'): 68921,
         ('onion', 'celery'): 66857,
         ('black pepper', 'parsley'): 63079,
         ('lemon juice', 'black pepper'): 63062,
         ('milk', 'black pepper'): 62637,
         ('celery', 'black pepper'): 

In [6]:
unique_ingredients = set()
for ingredient in df['ingredients']:
    unique_ingredients.update(ingredient)
unique_ingredients = sorted(list(unique_ingredients))

cooccurrence_matrix = np.zeros((len(unique_ingredients), len(unique_ingredients)))
ingredient_to_index = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}
for ingredients in df['ingredients']:
    for i, ingredient1 in enumerate(ingredients):
        for j, ingredient2 in enumerate(ingredients):
            if i != j:
                index1 = ingredient_to_index[ingredient1]
                index2 = ingredient_to_index[ingredient2]
                cooccurrence_matrix[index1, index2] += 1

cooccurrence_df = pd.DataFrame(cooccurrence_matrix, index=unique_ingredients, columns=unique_ingredients)
cooccurrence_df


Unnamed: 0,acorn squash,adobo,agave nectar,alfredo,allspice,allspice berry,almond,almond milk,amaretto,american cheese,...,yellow bell pepper,yellow corn,yellow mustard,yellow onion,yellow pepper,yellow squash,yogurt,yukon gold potato,zesty italian dressing,zucchini
acorn squash,8.0,0.0,0.0,0.0,94.0,0.0,37.0,1.0,2.0,4.0,...,9.0,0.0,0.0,75.0,1.0,8.0,20.0,10.0,1.0,51.0
adobo,0.0,84.0,5.0,2.0,44.0,1.0,10.0,1.0,0.0,14.0,...,26.0,5.0,12.0,121.0,5.0,3.0,46.0,8.0,4.0,55.0
agave nectar,0.0,5.0,286.0,0.0,31.0,1.0,413.0,64.0,4.0,0.0,...,6.0,1.0,5.0,14.0,6.0,1.0,35.0,3.0,0.0,33.0
alfredo,0.0,2.0,0.0,56.0,0.0,0.0,14.0,0.0,0.0,12.0,...,13.0,1.0,2.0,50.0,2.0,11.0,0.0,6.0,1.0,87.0
allspice,94.0,44.0,31.0,0.0,486.0,28.0,857.0,34.0,12.0,4.0,...,63.0,11.0,103.0,612.0,18.0,16.0,526.0,55.0,1.0,479.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow squash,8.0,3.0,1.0,11.0,16.0,0.0,47.0,0.0,0.0,43.0,...,140.0,13.0,3.0,213.0,53.0,4.0,44.0,11.0,11.0,2427.0
yogurt,20.0,46.0,35.0,0.0,526.0,11.0,1187.0,39.0,16.0,16.0,...,92.0,16.0,35.0,442.0,56.0,44.0,1266.0,84.0,2.0,691.0
yukon gold potato,10.0,8.0,3.0,6.0,55.0,11.0,25.0,10.0,0.0,8.0,...,39.0,6.0,40.0,492.0,10.0,11.0,84.0,102.0,9.0,108.0
zesty italian dressing,1.0,4.0,0.0,1.0,1.0,0.0,10.0,0.0,0.0,1.0,...,8.0,2.0,3.0,12.0,22.0,11.0,2.0,9.0,16.0,85.0


In [7]:
mf.recommend_ingredients_pairwise('agave nectar', pair_counts)

['vanilla', 'coconut', 'cinnamon', 'egg', 'almond']

In [8]:
sim_pd = mf.calculate_similarity_matrix(cooccurrence_df)
sim_pd

Unnamed: 0,acorn squash,adobo,agave nectar,alfredo,allspice,allspice berry,almond,almond milk,amaretto,american cheese,...,yellow bell pepper,yellow corn,yellow mustard,yellow onion,yellow pepper,yellow squash,yogurt,yukon gold potato,zesty italian dressing,zucchini
acorn squash,1.0,0.744101,0.540423,0.634515,0.768975,0.712328,0.49744,0.45616,0.250976,0.667966,...,0.721716,0.57441,0.710901,0.815047,0.726177,0.799815,0.79918,0.879097,0.552225,0.866471
adobo,0.744101,1.0,0.469013,0.588726,0.556397,0.601475,0.381818,0.275626,0.163328,0.64843,...,0.78525,0.592844,0.719943,0.812409,0.754503,0.747509,0.774795,0.791261,0.676617,0.78772
agave nectar,0.540423,0.469013,1.0,0.316569,0.664371,0.469053,0.781903,0.871989,0.637921,0.323886,...,0.443912,0.451999,0.374959,0.485389,0.431486,0.417005,0.704094,0.458688,0.321046,0.5869
alfredo,0.634515,0.588726,0.316569,1.0,0.407051,0.404983,0.365946,0.22086,0.187643,0.592794,...,0.635448,0.523184,0.53427,0.681295,0.649913,0.699138,0.584659,0.690069,0.70241,0.762888
allspice,0.768975,0.556397,0.664371,0.407051,1.0,0.802957,0.676425,0.632646,0.482493,0.487977,...,0.478713,0.574638,0.530459,0.599598,0.489783,0.531453,0.773142,0.589726,0.404079,0.708173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow squash,0.799815,0.747509,0.417005,0.699138,0.531453,0.521976,0.45497,0.286398,0.226648,0.774527,...,0.766925,0.667492,0.755774,0.748178,0.777757,1.0,0.723369,0.84754,0.654047,0.882071
yogurt,0.79918,0.774795,0.704094,0.584659,0.773142,0.67952,0.696351,0.574381,0.456517,0.646201,...,0.698512,0.683932,0.708711,0.777429,0.698115,0.723369,1.0,0.782738,0.614534,0.834402
yukon gold potato,0.879097,0.791261,0.458688,0.690069,0.589726,0.63605,0.455378,0.29861,0.228616,0.706812,...,0.784346,0.652292,0.748873,0.913536,0.777582,0.84754,0.782738,1.0,0.568084,0.878414
zesty italian dressing,0.552225,0.676617,0.321046,0.70241,0.404079,0.448589,0.336372,0.196823,0.148802,0.601652,...,0.609279,0.486371,0.553377,0.578688,0.739988,0.654047,0.614534,0.568084,1.0,0.68375


In [9]:
def front_end_recommender(item_name, sim_matrix, n_items):
    

IndentationError: expected an indented block (1636095339.py, line 2)

In [None]:
reccomendations = mf.recommend_items(sim_pd, "Squirrel",20)

Item 'Squirrel' not in our dataset
5
Selected Closest word in list: curry


In [None]:
list(reccomendations.index)

['chicken thigh',
 'basmati rice',
 'green peas',
 'chicken',
 'cashew nut',
 'long grain rice',
 'cayenne',
 'vegetables',
 'cayenne pepper',
 'cashew',
 'red lentil',
 'chicken leg',
 'coriander',
 'red chili',
 'turmeric',
 'rice',
 'chicken breast',
 'peas',
 'carrot',
 'paprika']

In [None]:
sim_pd.to_csv('../data/similarity_matrix.csv')

In [None]:
import re

In [None]:
def recommend_items_from_multiple(sim_matrix, item_names, top_n=5):
    """
    Recommends the top_n most similar items based on cosine similarity.

    Args:
        co_matrix_pd (pd.DataFrame): The co-occurrence matrix as a Pandas DataFrame.
        item_name (str): The name of the item for which to find recommendations.
        top_n (int, optional): The number of top recommendations to return. Defaults to 5.

    Returns:
        pd.Series: A Pandas Series containing the top_n most similar items and their similarity scores,
                  sorted in descending order of similarity.  Returns an empty series if the item_name is not found.
    """
    if isinstance(item_names, str): #check if we have one sole element coming in, if it is a string convert to a list
        item_names = [item_names]

    cleaned_items = []
    error_codes = []
    item_list = list(sim_matrix.columns) # create a list of all the items
    for item_name in item_names: # clean every item in the list
        item_name = item_name.lower() #remove capitilazation
        item_test = re.sub(r'[^a-zA-Z0-9]', '', item_name) #remove punctuation
        if item_test not in item_list: #check if our item is in the dataset, if not return the closest result
            # print(f"Item '{item_name}' not in our dataset")
            item_test = mf.find_closest_word(item_name, item_list)
            # print(f"Selected Closest word in list: {item_name}")
            error_codes.append(f"Word {item_name} not in list, returning closest '{item_test}'")
        cleaned_items.append(item_test)

    similarity_scores_list = [sim_matrix[item].drop(cleaned_items, errors='ignore')for item in cleaned_items]
    combined_similarity_scores = pd.concat(similarity_scores_list, axis=1).min(axis=1)
    combined_similarity_scores = combined_similarity_scores.sort_values(ascending=False)

    return combined_similarity_scores.head(top_n), error_codes
    

In [20]:
recommend_items_from_multiple(sim_matrix=sim_pd, item_names=["moose","squirrel", "custard"])

(lemon           0.638646
 anise seed      0.637586
 phyllo          0.637342
 honey           0.634443
 double cream      0.6219
 dtype: object,
 ["Word moose not in list, returning closest 'rose'",
  "Word squirrel not in list, returning closest 'curry'",
  "Word custard not in list, returning closest 'mustard'"])