In [186]:
import re
import json
import time
import random

import spacy
import requests
from bs4 import BeautifulSoup

In [258]:
user_agent_list = [
   # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

In [259]:
def generate_header():
    headers = dict()
    headers['User-Agent'] = random.choice(user_agent_list)
    headers['Accept'] = 'application/json, text/javascript'
    return headers
generate_header()

{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
 'Accept': 'application/json, text/javascript'}

In [260]:
# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [261]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

In [262]:
def nouns_only(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if not (pair[1] == "NN" or pair[1] == "NNS"):
            return False
    return True

In [263]:
type_exceptions = ['can', 'tablespoon']

for exception in type_exceptions:
    print(tokenize(exception), spacy.explain(tokenize(exception)[0][1]))

[('can', 'MD')] verb, modal auxiliary
[('tablespoon', 'VB')] verb, base form


In [326]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length and 'abc' does not contain brackets
    pattern = re.compile(r'\([^\(\)]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        if line.split()[1] not in match[0]:
            return None
        return match
    
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r'\b, [^\(\)]*')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match
    
def extract_quantity_measurement_preparation(line):
    type_exceptions = ['can', 'tablespoon']
    quantity_split = []
    measurement = None
    
    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
#         print('preparation[0]:', preparation[0])
#         print(line)
        line = re.sub(r'{0}'.format(preparation[0]), '', line)
        # remove ', ' prefix
        preparation = preparation[0][2:]  
    
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    line_split = line.split()
    # extract quantity from the first word if the word contains a digit
    if any(char.isdigit() for char in line_split[0]):
        quantity_split.append(line_split[0])
    
        # extract quantity from the second word if the word contains a digit
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            # check measurement type
            # to avoid case like '1 large tomato, seeded and chopped'
            if nouns_only(line_split[2]) or line_split[2] in type_exceptions:
                measurement = line_split[2]
        else:
            # check line_split length for case like '1 egg' or '1/2 onion, chopped'
            if len(line_split) > 2 and (nouns_only(line_split[1]) or line_split[1] in type_exceptions):
                measurement = line_split[1]

    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)
    return ' '.join(quantity_split), measurement, preparation

In [327]:
# test on tricky cases
# extract_quantity_measurement_preparation('ground cayenne pepper to taste')
# extract_quantity_measurement_preparation('4 roma (plum) tomatoes, chopped')
extract_quantity_measurement_preparation('1 1/2 pounds colossal shrimp, EZ-peel type (deveined and shells split down the back)')
# extract_quantity_measurement_preparation('1 clove garlic, minced, or more to taste')
# extract_quantity_measurement_preparation('4 pounds Korean-style short ribs (beef chuck flanken, cut 1/3 to 1/2 inch thick across bones)')
# extract_quantity_measurement_preparation('1/2 (14 ounce) package vegetarian ground beef (e.g., Gimme Lean TM)')

('1 1/2', 'pounds', 'EZ-peel type ')

In [328]:
def extract_descriptor(ingredient_name):
    type_exceptions = ['parsley', 'garlic', 'chili']
    descriptor = []
    token_tag_pairs = []
    
    for element in ingredient_name.split():
        # treat compound word with hyphen as an adjective
        if '-' in element:
            token_tag_pairs.append((element, 'JJ'))
        else:
            token_tag_pairs.append([(token.text, token.tag_) for token in nlp(element)][0])
    
    for pair in token_tag_pairs:
        # if the word is an adjective, an adverb, or a past participle of a verb, or exception like 'ground'
        if pair[1] == "JJ" or pair[1] == "RB" or pair[1] == "VBN" or pair[0] == 'ground':
            if pair[0] not in type_exceptions:
                descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)

In [329]:
# test
# extract_descriptor('extra-firm tofu') 
# extract_descriptor('all-purpose flour') 
# extract_descriptor('freshly ground black pepper') 
extract_descriptor('1/2 cup chopped parsley')
extract_descriptor('1 1/2 pounds colossal shrimp, EZ-peel type (deveined and shells split down the back)')

'colossal EZ-peel split back'

In [330]:
def extract_ingredient_name(line):
    line = re.sub(r'[ ]?®', '', line)
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    if measurement is None:
        measurement = ''
    if preparation:
        return line[len(quantity + ' ' + measurement) : -(len(preparation) + 2)].strip()
    else:
        return line[len(quantity + ' ' + measurement):].strip()

In [331]:
# test
extract_ingredient_name('1 (10 ounce) can coconut milk')

'coconut milk'

In [332]:
def extract_ingredients_nouns(line):
    ingredients_nouns = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        if pair[1] == 'NN' or pair[1] == 'NNS':
            ingredients_nouns |= {pair[0]}
    return ingredients_nouns

In [333]:
def get_ingredient_list(url):
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # extract ingredients section from the webpage
    ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
    
    # remove unnecessary elements
    ingredients.remove('')
    ingredients.remove('Add all ingredients to list')
    return ingredients

In [334]:
def lemmatize(line):
    if len(line.split()) == 1:
        return [token.lemma_ for token in nlp(line)][0]
    return line

In [335]:
def get_ingredients_from_url(url):
    ingredients_url = set()
    ingredients_measurement_dict_url = {}
    ingredient_list = get_ingredient_list(url)
    
    for line in ingredient_list:
        quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
        # exceptions like "topping:"
        if ':' in line:
            print(line)
            print(url)
            continue
            
        ingredient_name = extract_ingredient_name(line)
        descriptor = extract_descriptor(ingredient_name)
        # remove descriptor if not None
        ingredient = ingredient_name.replace(descriptor, '').strip() if descriptor else ingredient_name
        # if ingredient is empty after removing descriptor
        if ingredient == '':
            ingredient = ingredient_name
        ingredient = ' '.join(ingredient.split())
        # lemmatize ingredient
#         ingredient = lemmatize(ingredient)
        
        if ingredient not in ingredients_measurement_dict_url:
            ingredients_measurement_dict_url[ingredient] = measurement
        ingredients_url |= {ingredient}
        
        # test
#         print(line)
#         print('quantity:', quantity)
#         print('measurement:', measurement)
#         print('ingredient name:', ingredient_name)
#         print('descriptor:', descriptor)
#         print('ingredient without descriptor:', ingredient)
#         print('preparation:', preparation)
#         print()
    
    return ingredients_url, ingredients_measurement_dict_url

In [336]:
def get_recipe_urls(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    recipe_cards = soup.find_all('div', class_='fixed-recipe-card__info')
    recipe_urls = []
    for recipe_card in recipe_cards:
        url = recipe_card.find('a')['href']
        recipe_urls.append(url)
    return recipe_urls

In [337]:
def get_all_ingredients_in_category(category_url):
    ingredients_freq_dict_all = dict()
    ingredients_measurement_dict_all = dict()
    recipe_urls = get_recipe_urls(category_url)
    for recipe_url in recipe_urls:
        recipe_ingredients, recipe_ingredients_measurement_dict = get_ingredients_from_url(recipe_url)
        for recipe_ingredient in recipe_ingredients:
            if recipe_ingredient in ingredients_freq_dict_all:
                ingredients_freq_dict_all[recipe_ingredient] += 1
            else:
                ingredients_freq_dict_all[recipe_ingredient] = 1
        ingredients_measurement_dict_all = {**ingredients_measurement_dict_all, **recipe_ingredients_measurement_dict}
        # scraping pause to avoid being banned by site
        time.sleep(5)
    return ingredients_freq_dict_all, ingredients_measurement_dict_all

In [294]:
# 3 pages of vegetarian protein recipes
# 1 pages of meat recipes
# 1 pages of seafood recipes

In [295]:
%%time
vegetarian_protein_ingredients_freq_dict = dict()
vegetarian_protein_ingredients_measurement_dict = dict()
for page in range(3):
    vegetarian_protein_url = 'https://www.allrecipes.com/recipes/16778/everyday-cooking/vegetarian/protein/?page=' + str(page+1)
    ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(vegetarian_protein_url)
    for i in ingredients_cat.keys():
        if i in vegetarian_protein_ingredients_freq_dict:
            vegetarian_protein_ingredients_freq_dict[i] += ingredients_cat[i]
        else:
            vegetarian_protein_ingredients_freq_dict[i] = ingredients_cat[i]
    vegetarian_protein_ingredients_measurement_dict = {**vegetarian_protein_ingredients_measurement_dict, **ingredients_measurement_dict_cat}

Bottom layer:
https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/
Mashed potato layer:
https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/
CPU times: user 1min 58s, sys: 15 s, total: 2min 13s
Wall time: 6min 52s


In [296]:
sorted(vegetarian_protein_ingredients_freq_dict.items(), key=lambda pair: pair[1], reverse=True)

[('tofu', 49),
 ('onion', 32),
 ('garlic', 27),
 ('soy sauce', 20),
 ('oil', 19),
 ('pepper', 19),
 ('salt', 17),
 ('bell pepper', 14),
 ('water', 14),
 ('mushrooms', 12),
 ('olive oil', 11),
 ('onions', 11),
 ('pepper to taste', 9),
 ('cilantro', 9),
 ('tomato sauce', 9),
 ('sugar', 9),
 ('tempeh', 8),
 ('cumin', 8),
 ('carrots', 8),
 ('alt to taste', 8),
 ('ginger root', 7),
 ('sesame oil', 6),
 ('garlic powder', 6),
 ('spinach', 6),
 ('broth', 6),
 ('basil', 6),
 ('mozzarella cheese', 6),
 ('curry powder', 6),
 ('coconut milk', 6),
 ('tomatoes', 6),
 ('butter', 6),
 ('celery', 5),
 ('ginger', 5),
 ('oregano', 5),
 ('cornstarch', 5),
 ('parsley', 5),
 ('cayenne pepper', 5),
 ('alt and pepper to taste', 5),
 ('zucchini', 4),
 ('soy milk', 4),
 ('potatoes', 4),
 ('peas', 4),
 ('clove garlic', 4),
 ('Cheddar cheese', 4),
 ('peppers', 4),
 ('lime juice', 4),
 ('Parmesan cheese', 4),
 ('vinegar', 4),
 ('peanut oil', 4),
 ('pepper flakes', 4),
 ('eggs', 4),
 ('chili powder', 4),
 ('rice', 

In [297]:
vegetarian_protein_ingredients_measurement_dict

{'bell pepper': 'cups',
 'oil': 'tablespoons',
 'onion': 'tablespoon',
 'barbecue sauce': 'cup',
 'rolls': 'kaiser',
 'tempeh': 'package',
 'pepper to taste': None,
 'sesame oil': 'tablespoons',
 'mushrooms': 'can',
 'pepper': 'jalapeno',
 'zucchini': None,
 'sriracha chili garlic sauce': 'tablespoons',
 'tofu': 'package',
 'soy sauce': 'tablespoons',
 'seitan': 'package',
 'coriander': 'teaspoons',
 'garlic powder': 'teaspoon',
 'tamari': 'tablespoon',
 'cumin': 'teaspoons',
 'yeast': 'tablespoons',
 'salt': 'pinch',
 'carrots': 'cups',
 'seasoning': 'teaspoon',
 'potatoes': 'cups',
 'vegetarian ground beef substitute': 'package',
 'vegan cream cheese (such as Tofutti)': 'tablespoons',
 'celery': 'cups',
 'clove garlic': None,
 'soy milk': 'cup',
 'peas': 'cup',
 'olive oil': 'tablespoons',
 'tomato': 'slices',
 'vegan mayonnaise': 'cup',
 'soy cheese': 'cup',
 'mirin (sweetened rice wine)': 'tablespoon',
 'ginger': 'teaspoon',
 'rice vinegar': 'tablespoons',
 'garlic': 'cloves',
 'ci

In [298]:
%%time
meat_ingredients_freq_dict = dict()
meat_ingredients_measurement_dict = dict()
meat_url = 'https://www.allrecipes.com/recipes/{}/meat-and-poultry/'.format(200)
ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(meat_url)
for i in ingredients_cat.keys():
    if i in meat_ingredients:
        meat_ingredients_freq_dict[i] += ingredients_cat[i]
    else:
        meat_ingredients_freq_dict[i] = ingredients_cat[i]
meat_ingredients_measurement_dict = {**meat_ingredients_measurement_dict, **ingredients_measurement_dict}
    


Brown Sugar Glaze:
https://www.allrecipes.com/recipe/232247/tennessee-meatloaf/
Meatloaf:
https://www.allrecipes.com/recipe/232247/tennessee-meatloaf/
CPU times: user 56.7 s, sys: 6.71 s, total: 1min 3s
Wall time: 2min 59s


In [346]:
sorted(meat_ingredients_freq_dict.items(), key=lambda pair: pair[1], reverse=True)

[('onion', 22),
 ('pepper', 18),
 ('beef', 17),
 ('salt', 16),
 ('garlic', 12),
 ('sugar', 10),
 ('water', 9),
 ('Worcestershire sauce', 8),
 ('oregano', 6),
 ('bell pepper', 6),
 ('parsley', 6),
 ('ketchup', 6),
 ('oil', 5),
 ('tomato sauce', 5),
 ('tomatoes', 5),
 ('garlic powder', 5),
 ('milk', 5),
 ('egg', 5),
 ('bread crumbs', 5),
 ('soy sauce', 4),
 ('beef broth', 4),
 ('cumin', 4),
 ('olive oil', 4),
 ('eggs', 4),
 ('flour', 4),
 ('Parmesan cheese', 4),
 ('cornstarch', 3),
 ('basil', 3),
 ('chili powder', 3),
 ('leaf', 3),
 ('alt and pepper to taste', 3),
 ('mustard', 3),
 ('potatoes', 3),
 ('cayenne pepper', 3),
 ('paprika', 3),
 ('tomato paste', 3),
 ('beef bouillon', 3),
 ('celery', 3),
 ('ginger root', 2),
 ('pepper sauce (such as Tabasco)', 2),
 ('rolls', 2),
 ('pepper flakes', 2),
 ('thyme', 2),
 ('dried rosemary', 2),
 ('olives', 2),
 ('onion powder', 2),
 ('mozzarella cheese', 2),
 ('seasoning', 2),
 ('onion soup mix', 2),
 ('Cheddar cheese', 2),
 ('carrots', 2),
 ('onio

In [299]:
meat_ingredients_measurement_dict

{'olive oil': 'tablespoon',
 'alt and pepper to taste': None,
 'flour': 'cup',
 'butter': 'cup',
 'tilapia': 'fillets',
 'salt': 'teaspoons',
 'lemon': None,
 'bread crumbs': 'cup',
 'cayenne pepper': 'teaspoon',
 'pepper': 'teaspoons',
 'egg': None,
 'salmon': 'fillets',
 'cracker': None,
 'caper': 'teaspoon',
 'large shrimp - peeled and deveined': 'pounds',
 'shallot': 'tablespoon',
 'garlic': 'cloves',
 'paprika': 'teaspoon',
 'chive': 'tablespoons',
 'clove garlic': None,
 'halibut': 'fillets',
 'parsley': 'tablespoons',
 'lemon juice': 'tablespoon',
 'basil': 'tablespoon',
 'thyme': 'teaspoon',
 'mussel': 'quarts',
 'leaf': 'bay',
 'onion': 'cups',
 'wine': 'cup',
 'ea salt to taste': None,
 'salmon fillets': 'pounds',
 'miso paste': 'tablespoon',
 'mayonnaise': 'cup',
 'Dijon mustard': 'teaspoon',
 'fontina cheese': 'ounces',
 'mozzarella cheese': 'ounces',
 'milk': 'cup',
 'beer': 'cup',
 'Monterey Jack cheese': 'ounces',
 'fresh': None,
 'spinach': 'package',
 'salt to taste': 

In [342]:
%%time
seafood_ingredients_freq_dict = dict()
seafood_ingredients_measurement_dict = dict()
for page in range(1):
    seafood_url = 'https://www.allrecipes.com/recipes/93/seafood/' + str(page+1)
    ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(seafood_url)
    for i in ingredients_cat.keys():
        if i in seafood_ingredients_freq_dict:
            seafood_ingredients_freq_dict[i] += ingredients_cat[i]
        else:
            seafood_ingredients_freq_dict[i] = ingredients_cat[i]
    seafood_ingredients_measurement_dict = {**seafood_ingredients_measurement_dict, **ingredients_measurement_dict}

CPU times: user 45.9 s, sys: 6.03 s, total: 51.9 s
Wall time: 2min 51s


In [349]:
sorted(seafood_ingredients_freq_dict.items(), key=lambda pair: pair[1], reverse=True)

[('butter', 12),
 ('pepper', 11),
 ('garlic', 11),
 ('olive oil', 10),
 ('parsley', 9),
 ('cayenne pepper', 7),
 ('salt', 7),
 ('lemon juice', 7),
 ('onion', 6),
 ('shrimp', 6),
 ('basil', 5),
 ('clove garlic', 5),
 ('soy sauce', 5),
 ('oil', 4),
 ('salmon', 4),
 ('alt and pepper to taste', 4),
 ('mayonnaise', 4),
 ('honey', 4),
 ('bread crumbs', 3),
 ('lemon', 3),
 ('Dijon mustard', 3),
 ('beer', 3),
 ('flour', 3),
 ('Parmesan cheese', 3),
 ('alt to taste', 3),
 ('tuna', 3),
 ('paprika', 2),
 ('Worcestershire sauce', 2),
 ('capers', 2),
 ('wine', 2),
 ('thyme', 2),
 ('salmon fillets', 2),
 ('oil for frying', 2),
 ('egg', 2),
 ('baking powder', 2),
 ('water', 2),
 ('wine vinegar', 2),
 ('cornstarch', 2),
 ('onions', 2),
 ('chicken broth', 2),
 ('oregano', 2),
 ('bell pepper', 2),
 ('celery', 2),
 ('dill weed', 2),
 ('Cheddar cheese', 2),
 ('fillets', 2),
 ('pepper to taste', 2),
 ('cream', 2),
 ('dashes sauce', 1),
 ('chicken stock', 1),
 ('emon', 1),
 ('colossal shrimp, EZ-peel type (

In [343]:
seafood_ingredients_measurement_dict

{'olive oil': 'tablespoon',
 'alt and pepper to taste': None,
 'flour': 'cup',
 'butter': 'cup',
 'tilapia': 'fillets',
 'salt': 'teaspoons',
 'lemon': None,
 'bread crumbs': 'cup',
 'cayenne pepper': 'teaspoon',
 'pepper': 'teaspoons',
 'egg': None,
 'salmon': 'fillets',
 'cracker': None,
 'caper': 'teaspoon',
 'large shrimp - peeled and deveined': 'pounds',
 'shallot': 'tablespoon',
 'garlic': 'cloves',
 'paprika': 'teaspoon',
 'chive': 'tablespoons',
 'clove garlic': None,
 'halibut': 'fillets',
 'parsley': 'tablespoons',
 'lemon juice': 'tablespoon',
 'basil': 'tablespoon',
 'thyme': 'teaspoon',
 'mussel': 'quarts',
 'leaf': 'bay',
 'onion': 'cups',
 'wine': 'cup',
 'ea salt to taste': None,
 'salmon fillets': 'pounds',
 'miso paste': 'tablespoon',
 'mayonnaise': 'cup',
 'Dijon mustard': 'teaspoon',
 'fontina cheese': 'ounces',
 'mozzarella cheese': 'ounces',
 'milk': 'cup',
 'beer': 'cup',
 'Monterey Jack cheese': 'ounces',
 'fresh': None,
 'spinach': 'package',
 'salt to taste': 

In [344]:
ingredients_categorized = {}
ingredients_categorized['vegetarian_protein'] = {}
ingredients_categorized['vegetarian_protein']['ingredients'] = vegetarian_protein_ingredients_freq_dict
ingredients_categorized['vegetarian_protein']['measurement'] = vegetarian_protein_ingredients_measurement_dict
ingredients_categorized['meat'] = {}
ingredients_categorized['meat']['ingredients'] = meat_ingredients_freq_dict
ingredients_categorized['meat']['measurement'] = meat_ingredients_measurement_dict
ingredients_categorized['seafood'] = {}
ingredients_categorized['seafood']['ingredients'] = seafood_ingredients_freq_dict
ingredients_categorized['seafood']['measurement'] = seafood_ingredients_measurement_dict

In [345]:
with open('ingredients_categorized.json', 'w') as file:
    json.dump(ingredients_categorized, file)
file.closed

True

In [357]:
# check
with open('ingredients_categorized.json', 'r') as file:
    ingredients_categorized = json.load(file)
file.closed

ingredients_categorized['meat']['ingredients'].keys()

dict_keys(['onion', 'cornstarch', 'pepper', 'oil', 'bonnet pepper', 'fava beans', 'sprig thyme', 'salt', 'soy sauce', 'ginger root', 'allspice berries', 'water', 'garlic', 'beef oxtail', 'tomato sauce', 'basil', 'tomatoes', 'garlic powder', 'oregano', 'chili powder', 'bell pepper', 'beef', 'pepper sauce (such as Tabasco)', 'Worcestershire sauce', 'beef broth', 'elbow macaroni', 'leaf', 'vinegar', 'parsley', 'chicken broth', 'alt and pepper to taste', 'chopped giardiniera (pickled Italian vegetables)', 'boneless beef chuck', 'rolls', 'pepper flakes', 'thyme', 'dried rosemary', 'plum wine', 'Korean-style short ribs (beef chuck flanken,', '(toasted) sesame oil', 'honey', 'sugar', 'cumin', 'olive oil', 'cooking wine', 'olives', 'sazon seasoning (such as Badia )', 'sauce (such as Louisiana)', 'onion powder', 'small butternut squash', 'bay leaf', 'eggs', 'ketchup', 'cider vinegar', 'ooking spray', 'cooking oats', 'milk', 'ground veal', 'pork', 'mustard', 'garam masala', 'coriander', 'water (