# 1. Import modules and define functions

In [3]:
import json
import pickle
import string
from collections import Counter
from gensim.models import word2vec

# This loads in named ingredients, which is used to parse
with open( 'data/named_ingredients.pkl', 'rb' ) as f:
    ingred_set = pickle.load( f )

def parse_ingredients( ingred_line: str ) -> str:
    ingred_line = (  ingred_line.lower()
                                .translate(
                                    str.maketrans( '', '', string.punctuation )
                                ) # Removes punctuation
                                .split()
                                
    )
    ingred_list = []
    for i in range(3,0,-1): # Make a list of all possible trigrams, bigrams, and unigrams
        ingred_list += [ ' '.join(ingred_line[ii:ii+i]) for ii in range(len(ingred_line)-(i-1)) ]
    for ingred_str in ingred_list:
        if ingred_str in ingred_set: # We take first named ingredient found while searching trigrams > bigrams > unigrams
            return ingred_str
    return ''

def recipe_iter( json_object: list ) -> list:
    counter = 0
    json_len = len(json_object)
    reduced = []
    for recipe in json_object:
        ingred = []
        for full_ingred in recipe['ingredients']:
            parsed = parse_ingredients( full_ingred['text'] )
            if parsed: # Ignore empty strings returned by parse_ingredients()
                ingred.append(parsed)
        if ingred:
            reduced.append( ingred )
        counter += 1
        if counter%10_000 == 0: # Every 10,000 recipes, save results and print an update
            with open( 'data/reduced_ingredients.pkl', 'wb' ) as f:
                pickle.dump( reduced, f )
            print( f'{counter} of {json_len} complete ({round(100*counter/json_len, 1)}%)', end='\r' )
    return reduced

# 2. Import data and clean up

In [4]:
with open( 'data/recipe1M_layer1.json' ) as f:
    json_object = json.load( f )
    ingredients = recipe_iter( json_object )

1020000 of 1029720 complete (99.1%)

# 3. Look at ingredients

In [5]:
Counter( [item for sublist in ingredients for item in sublist] ).most_common(20)

[('salt', 428774),
 ('sugar', 239667),
 ('butter', 220809),
 ('flour', 211277),
 ('olive oil', 174902),
 ('water', 161278),
 ('garlic', 153521),
 ('onion', 130858),
 ('milk', 121745),
 ('fresh', 95166),
 ('ground black pepper', 90000),
 ('egg', 89270),
 ('green', 81409),
 ('brown sugar', 75210),
 ('unsalted butter', 72394),
 ('pepper', 70327),
 ('vegetable oil', 70290),
 ('baking powder', 70098),
 ('lemon juice', 65018),
 ('vanilla extract', 61449)]

# 4. Setup word2vec model

In [6]:
num_features = 300                
min_word_count = 3                    
num_workers = -1
context = 10                                                                     
downsampling = 1e-3

# 4. Train the model

In [12]:
# Initialize and train the model 
model = (
    word2vec.Word2Vec(  ingredients,
                        vector_size=num_features,
                        min_count = min_word_count,
                        window = context,
                        sample = downsampling,
                        workers=num_workers)
)

# 5. Cache results
We'll store the 10 top most similiar ingredients for all named ingredients that exist in the corpus.

In [45]:
most_similar = {}
for ingred in ingred_set:
    if ingred in model.wv:
        similar = model.wv.most_similar( ingred )
        most_similar[ingred] = [ tup[0] for tup in similar ]
with open( 'ml_substitutions.pkl', 'wb' ) as f:
    pickle.dump( most_similar, f )

In [44]:
most_similar['egg']

['ranch dressing',
 'yolk',
 'crispy bacon',
 'light lager',
 'chinese rice vinegar',
 'turbot fillet',
 'croissant dough',
 'bicarbonate of soda',
 'chopped celery',
 'apricot jelly']

In [16]:
ingredients[0]

['penne',
 'cheese sauce',
 'cheddar',
 'gruyere cheese',
 'chili powder',
 'unsalted butter',
 'flour',
 'milk',
 'cheese',
 'cheese',
 'kosher salt',
 'chili powder',
 'garlic powder']

In [42]:
x = model.wv.most_similar('salt')
[ y[0] for y in x ]

['guacamole',
 'vanilla',
 'summer squash',
 'low sodium salt',
 'large egg',
 'cream sherry',
 'ciabatta',
 'char siu',
 'turkey broth',
 'gingerroot']

In [31]:
txt = json_object[2]['ingredients'][2]['text'].lower()
txt = txt.translate(
    str.maketrans( '', '', string.punctuation )
)
ingred_line = txt.split()
ingred_list = []
for i in range(3,0,-1):
    ingred_list += [ ' '.join(ingred_line[ii:ii+i]) for ii in range(len(ingred_line)-(i-1)) ]
for ingred_str in ingred_list:
    if ingred_str in ingred_set:
        print( ingred_str )

red
onion


In [18]:
ingred_list

['1 green bell',
 'green bell pepper,',
 'bell pepper, cut',
 'pepper, cut into',
 'cut into small',
 'into small dice',
 '1 green',
 'green bell',
 'bell pepper,',
 'pepper, cut',
 'cut into',
 'into small',
 'small dice',
 '1',
 'green',
 'bell',
 'pepper,',
 'cut',
 'into',
 'small',
 'dice']

In [None]:
ingred_list

In [11]:
ingredients[2]

['kosher salt', 'red', 'green', 'red', 'olive', 'fresh']

In [5]:
' '.join(['hello'])

'hello'

['1 c. elbow macaroni',
 '1 c. cubed American cheese (4 ounce.)',
 '1/2 c. sliced celery',
 '1/2 c. minced green pepper',
 '3 tbsp. minced pimento',
 '1/2 c. mayonnaise or possibly salad dressing',
 '1 tbsp. vinegar',
 '3/4 teaspoon salt',
 '1/2 teaspoon dry dill weed']