In [1]:
import pandas as pd 
from itertools import combinations
from collections import Counter
import numpy as np
import utils.model_functions as mf

Load our dataset

In [2]:
df = pd.read_pickle("../../Data/clean_food.pkl")
df.head()



Unnamed: 0,id,ingredients
0,10259,"[romaine lettuce, black olives, grape, black p..."
1,25693,"[black pepper, black pepper, thyme, egg, yello..."
2,20130,"[egg, black pepper, mayonnaise, green chili pe..."
4,13162,"[black pepper, shallot, cayenne pepper, onion,..."
5,6602,"[egg, ginger root, cinnamon, milk, vanilla, gi..."


Create our pairs (will use these to build out our similarity matrix)

In [3]:
df['pairs'] = df['ingredients'].apply(lambda x: list(combinations(x,2)))
df.head()

Unnamed: 0,id,ingredients,pairs
0,10259,"[romaine lettuce, black olives, grape, black p...","[(romaine lettuce, black olives), (romaine let..."
1,25693,"[black pepper, black pepper, thyme, egg, yello...","[(black pepper, black pepper), (black pepper, ..."
2,20130,"[egg, black pepper, mayonnaise, green chili pe...","[(egg, black pepper), (egg, mayonnaise), (egg,..."
4,13162,"[black pepper, shallot, cayenne pepper, onion,...","[(black pepper, shallot), (black pepper, cayen..."
5,6602,"[egg, ginger root, cinnamon, milk, vanilla, gi...","[(egg, ginger root), (egg, cinnamon), (egg, mi..."


Build out our pair counts so we can get a matrix

In [4]:
pair_counts = Counter()
for pairs_list in df['pairs']:
    pair_counts.update(pairs_list)
pair_counts

Counter({('onion', 'black pepper'): 286750,
         ('egg', 'vanilla'): 255669,
         ('olives', 'black pepper'): 203958,
         ('cloves', 'black pepper'): 189311,
         ('egg', 'milk'): 165382,
         ('onion', 'cloves'): 160204,
         ('olives', 'cloves'): 131422,
         ('egg', 'black pepper'): 108883,
         ('milk', 'egg'): 105121,
         ('milk', 'vanilla'): 104064,
         ('olives', 'onion'): 91723,
         ('egg', 'cinnamon'): 87941,
         ('cloves', 'olives'): 82094,
         ('onion', 'parsley'): 81403,
         ('black pepper', 'onion'): 76802,
         ('black pepper', 'olives'): 76594,
         ('beef', 'onion'): 75377,
         ('onion', 'egg'): 75120,
         ('parsley', 'black pepper'): 71847,
         ('milk', 'black pepper'): 70270,
         ('chicken', 'black pepper'): 68959,
         ('onion', 'celery'): 66860,
         ('black pepper', 'parsley'): 63083,
         ('lemon juice', 'black pepper'): 63063,
         ('celery', 'black pepper')

Create our dataframe of how often ingredients are used together

In [5]:
unique_ingredients = set()
for ingredient in df['ingredients']:
    unique_ingredients.update(ingredient)
unique_ingredients = sorted(list(unique_ingredients))

cooccurrence_matrix = np.zeros((len(unique_ingredients), len(unique_ingredients)))
ingredient_to_index = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}
for ingredients in df['ingredients']:
    for i, ingredient1 in enumerate(ingredients):
        for j, ingredient2 in enumerate(ingredients):
            if i != j:
                index1 = ingredient_to_index[ingredient1]
                index2 = ingredient_to_index[ingredient2]
                cooccurrence_matrix[index1, index2] += 1

cooccurrence_df = pd.DataFrame(cooccurrence_matrix, index=unique_ingredients, columns=unique_ingredients)
cooccurrence_df


Unnamed: 0,acorn squash,adobo,agave nectar,alfredo,allspice,allspice berry,almond,almond milk,amaretto,american cheese,...,yellow bell pepper,yellow corn,yellow mustard,yellow onion,yellow pepper,yellow squash,yogurt,yukon gold potato,zesty italian dressing,zucchini
acorn squash,8.0,0.0,0.0,0.0,94.0,0.0,38.0,1.0,2.0,4.0,...,9.0,0.0,0.0,75.0,1.0,8.0,20.0,10.0,1.0,51.0
adobo,0.0,84.0,5.0,2.0,44.0,1.0,10.0,1.0,0.0,14.0,...,26.0,5.0,12.0,121.0,5.0,3.0,46.0,8.0,4.0,55.0
agave nectar,0.0,5.0,286.0,0.0,31.0,1.0,415.0,64.0,4.0,0.0,...,6.0,1.0,5.0,14.0,6.0,1.0,35.0,3.0,0.0,33.0
alfredo,0.0,2.0,0.0,56.0,0.0,0.0,14.0,0.0,0.0,12.0,...,13.0,1.0,2.0,50.0,2.0,11.0,0.0,6.0,1.0,87.0
allspice,94.0,44.0,31.0,0.0,486.0,28.0,861.0,34.0,12.0,4.0,...,63.0,11.0,103.0,612.0,18.0,16.0,526.0,55.0,1.0,479.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow squash,8.0,3.0,1.0,11.0,16.0,0.0,47.0,0.0,0.0,43.0,...,140.0,13.0,3.0,213.0,53.0,4.0,44.0,11.0,11.0,2427.0
yogurt,20.0,46.0,35.0,0.0,526.0,11.0,1195.0,39.0,16.0,16.0,...,92.0,16.0,35.0,442.0,56.0,44.0,1266.0,84.0,2.0,691.0
yukon gold potato,10.0,8.0,3.0,6.0,55.0,11.0,25.0,10.0,0.0,8.0,...,39.0,6.0,40.0,492.0,10.0,11.0,84.0,102.0,9.0,108.0
zesty italian dressing,1.0,4.0,0.0,1.0,1.0,0.0,10.0,0.0,0.0,1.0,...,8.0,2.0,3.0,12.0,22.0,11.0,2.0,9.0,16.0,85.0


run a quick estimate through the pairwise system to see how things look

In [6]:
mf.recommend_ingredients_pairwise('agave nectar', pair_counts)

['vanilla', 'coconut', 'cinnamon', 'egg', 'almond']

Create our cosine similarity matrix, we are using this so ingredients that are underrepresented in our database are still there going to be returned in our final product.

In [7]:
sim_pd = mf.calculate_similarity_matrix(cooccurrence_df)
sim_pd

Unnamed: 0,acorn squash,adobo,agave nectar,alfredo,allspice,allspice berry,almond,almond milk,amaretto,american cheese,...,yellow bell pepper,yellow corn,yellow mustard,yellow onion,yellow pepper,yellow squash,yogurt,yukon gold potato,zesty italian dressing,zucchini
acorn squash,1.0,0.744209,0.540016,0.635689,0.769578,0.712248,0.499702,0.455304,0.255527,0.663565,...,0.721735,0.575941,0.711208,0.815414,0.726399,0.800319,0.799187,0.878897,0.552551,0.866625
adobo,0.744209,1.0,0.468725,0.588941,0.556575,0.601306,0.381168,0.275082,0.163075,0.64103,...,0.785333,0.590537,0.72012,0.812409,0.754515,0.747791,0.773211,0.789514,0.676702,0.787814
agave nectar,0.540016,0.468725,1.0,0.317279,0.664001,0.468007,0.783807,0.871487,0.628965,0.324905,...,0.443225,0.453812,0.37488,0.484954,0.430772,0.417243,0.704245,0.458637,0.320832,0.586262
alfredo,0.635689,0.588941,0.317279,1.0,0.409239,0.404986,0.368298,0.22067,0.189879,0.591571,...,0.635611,0.524478,0.534964,0.681652,0.649729,0.700477,0.587151,0.690301,0.702206,0.763547
allspice,0.769578,0.556575,0.664001,0.409239,1.0,0.802473,0.677057,0.631316,0.478366,0.489801,...,0.478854,0.578114,0.531017,0.600413,0.489591,0.532894,0.774257,0.591575,0.404165,0.708632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow squash,0.800319,0.747791,0.417243,0.700477,0.532894,0.52188,0.456078,0.285971,0.227099,0.769053,...,0.766991,0.669115,0.756272,0.748811,0.777525,1.0,0.724123,0.847315,0.654127,0.882374
yogurt,0.799187,0.773211,0.704245,0.587151,0.774257,0.67759,0.699825,0.572797,0.457868,0.651178,...,0.696751,0.691386,0.708076,0.776999,0.69587,0.724123,1.0,0.784429,0.613194,0.833546
yukon gold potato,0.878897,0.789514,0.458637,0.690301,0.591575,0.634531,0.461528,0.297825,0.242969,0.705274,...,0.782612,0.653935,0.747422,0.912525,0.775725,0.847315,0.784429,1.0,0.566642,0.877198
zesty italian dressing,0.552551,0.676702,0.320832,0.702206,0.404165,0.448439,0.33479,0.196439,0.146174,0.594222,...,0.609401,0.484091,0.553607,0.578677,0.739956,0.654127,0.613194,0.566642,1.0,0.683783


Test out our recommendations

In [13]:
reccomendations, error_codes = mf.recommend_items(sim_matrix=sim_pd, near_items="Squirrel", far_items="", top_n=20)

In [16]:
list(reccomendations.index)

['chicken thigh',
 'basmati rice',
 'green peas',
 'cashew nut',
 'chicken',
 'long grain rice',
 'cayenne',
 'vegetables',
 'cayenne pepper',
 'cashew',
 'red lentil',
 'chicken leg',
 'coriander',
 'red chili',
 'turmeric',
 'rice',
 'chicken breast',
 'peas',
 'carrot',
 'paprika']

In [17]:
sim_pd.to_csv('../data/similarity_matrix.csv')


In [18]:
mf.recommend_items(sim_matrix=sim_pd, near_items="apple", far_items="blackberry")

(tart apples            0.30899
 allspice              0.306519
 apple cider           0.289216
 granny smith apple    0.258214
 pumpkin               0.256363
 dtype: object,
 [])