In [1]:
import re
import json
import time

import spacy
import requests
from bs4 import BeautifulSoup

In [2]:
# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [3]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

In [4]:
def nouns_only(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if not (pair[1] == "NN" or pair[1] == "NNS"):
            return False
    return True

In [5]:
type_exceptions = ['can', 'tablespoon']

for exception in type_exceptions:
    print(tokenize(exception), spacy.explain(tokenize(exception)[0][1]))

[('can', 'MD')] verb, modal auxiliary
[('tablespoon', 'VB')] verb, base form


In [150]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length and 'abc' does not contain brackets
    pattern = re.compile(r'\([^\(\)]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        if line.split()[1] not in match[0]:
            return None
        return match
    
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r'\b, [\s\S]*')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match
    
def extract_quantity_measurement_preparation(line):
    type_exceptions = ['can', 'tablespoon']
    quantity_split = []
    measurement = None
    
    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
        line = re.sub(r'{0}'.format(preparation[0]), '', line)
        # remove ', ' prefix
        preparation = preparation[0][2:]  
    
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    line_split = line.split()
    # extract quantity from the first word if the word contains a digit
    if any(char.isdigit() for char in line_split[0]):
        quantity_split.append(line_split[0])
    
        # extract quantity from the second word if the word contains a digit
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            # check measurement type
            # to avoid case like '1 large tomato, seeded and chopped'
            if nouns_only(line_split[2]) or line_split[2] in type_exceptions:
                measurement = line_split[2]
        else:
            # check line_split length for case like '1 egg' or '1/2 onion, chopped'
            if len(line_split) > 2 and (nouns_only(line_split[1]) or line_split[1] in type_exceptions):
                measurement = line_split[1]

    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)
    return ' '.join(quantity_split), measurement, preparation

In [144]:
# test on tricky cases
# extract_quantity_measurement_preparation('ground cayenne pepper to taste')
# extract_quantity_measurement_preparation('4 roma (plum) tomatoes, chopped')
# extract_quantity_measurement_preparation('3 tablespoons vegan cream cheese substitute (such as Tofutti ®)')
# extract_quantity_measurement_preparation('1 clove garlic, minced, or more to taste')
# extract_quantity_measurement_preparation('1 (8 ounce) container extra firm tofu, drained and sliced into large chunks')
extract_quantity_measurement_preparation('1/2 (14 ounce) package vegetarian ground beef (e.g., Gimme Lean TM)')

('1/2 (14 ounce)', 'package', None)

In [115]:
def extract_descriptor(ingredient_name):
    type_exceptions = ['parsley', 'garlic', 'chili']
    descriptor = []
    token_tag_pairs = []
    
    for element in ingredient_name.split():
        # treat compound word with hyphen as an adjective
        if '-' in element:
            token_tag_pairs.append((element, 'JJ'))
        else:
            token_tag_pairs.append([(token.text, token.tag_) for token in nlp(element)][0])
    
    for pair in token_tag_pairs:
        # if the word is an adjective, an adverb, or a past participle of a verb, or exception like 'ground'
        if pair[1] == "JJ" or pair[1] == "RB" or pair[1] == "VBN" or pair[0] == 'ground':
            if pair[0] not in type_exceptions:
                descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)

In [116]:
# test
# extract_descriptor('extra-firm tofu') 
# extract_descriptor('all-purpose flour') 
# extract_descriptor('freshly ground black pepper') 
extract_descriptor('1/2 cup chopped parsley')

'chopped'

In [117]:
def extract_ingredient_name(line):
    line = re.sub(r'[ ]?®', '', line)
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    if measurement is None:
        measurement = ''
    if preparation:
        return line[len(quantity + ' ' + measurement) : -(len(preparation) + 2)].strip()
    else:
        return line[len(quantity + ' ' + measurement):].strip()

In [118]:
# test
extract_ingredient_name('1 (10 ounce) can coconut milk')

'can coconut milk'

In [119]:
def extract_ingredients_nouns(line):
    ingredients_nouns = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        if pair[1] == 'NN' or pair[1] == 'NNS':
            ingredients_nouns |= {pair[0]}
    return ingredients_nouns

In [120]:
def get_ingredient_list(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # extract ingredients section from the webpage
    ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
    
    # remove unnecessary elements
    ingredients.remove('')
    ingredients.remove('Add all ingredients to list')
    return ingredients

In [121]:
def lemmatize(line):
    if len(line.split()) == 1:
        return [token.lemma_ for token in nlp(line)][0]
    return line

In [158]:
def get_ingredients_from_url(url):
    ingredients_url = set()
    ingredients_measurement_dict_url = {}
    ingredient_list = get_ingredient_list(url)
    
    for line in ingredient_list:
        quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
        # exceptions like "topping:"
        if ':' in line:
            print(line)
            print(url)
            continue
            
        ingredient_name = extract_ingredient_name(line)
        descriptor = extract_descriptor(ingredient_name)
        # remove descriptor if not None
        ingredient = ingredient_name.replace(descriptor, '').strip() if descriptor else ingredient_name
        # if ingredient is empty after removing descriptor
        if ingredient == '':
            ingredient = ingredient_name
        ingredient = ' '.join(ingredient.split())
        # lemmatize ingredient
#         ingredient = lemmatize(ingredient)
        
        if ingredient not in ingredients_measurement_dict_url:
            ingredients_measurement_dict_url[ingredient] = measurement
        ingredients_url |= {ingredient}
        
        # test
#         print(line)
#         print('quantity:', quantity)
#         print('measurement:', measurement)
#         print('ingredient name:', ingredient_name)
#         print('descriptor:', descriptor)
#         print('ingredient without descriptor:', ingredient)
#         print('preparation:', preparation)
#         print()
    
    return ingredients_url, ingredients_measurement_dict_url

In [159]:
def get_recipe_urls(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    recipe_cards = soup.find_all('div', class_='fixed-recipe-card__info')
    recipe_urls = []
    for recipe_card in recipe_cards:
        url = recipe_card.find('a')['href']
        recipe_urls.append(url)
    return recipe_urls

In [168]:
def get_all_ingredients_in_category(category_url):
    ingredients_freq_dict_all = dict()
    ingredients_measurement_dict_all = dict()
    recipe_urls = get_recipe_urls(category_url)
    for recipe_url in recipe_urls:
        recipe_ingredients, recipe_ingredients_measurement_dict = get_ingredients_from_url(recipe_url)
        for recipe_ingredient in recipe_ingredients:
            if recipe_ingredient in ingredients_freq_dict_all:
                ingredients_freq_dict_all[recipe_ingredient] += 1
            else:
                ingredients_freq_dict_all[recipe_ingredient] = 1
        ingredients_measurement_dict_all = {**ingredients_measurement_dict_all, **recipe_ingredients_measurement_dict}
        # scraping pause to avoid being banned by site
        time.sleep(3)
    return ingredients_freq_dict_all, ingredients_measurement_dict_all

In [161]:
# 3 pages of vegetarian protein recipes
# 1 pages of meat recipes
# 1 pages of seafood recipes

In [169]:
%%time
vegetarian_protein_ingredients_freq_dict = dict()
vegetarian_protein_ingredients_measurement_dict = dict()
for page in range(3):
    vegetarian_protein_url = 'https://www.allrecipes.com/recipes/16778/everyday-cooking/vegetarian/protein/?page=' + str(page+1)
    ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(vegetarian_protein_url)
    for i in ingredients_cat.keys():
        if i in vegetarian_protein_ingredients_freq_dict:
            vegetarian_protein_ingredients_freq_dict[i] += ingredients_cat[i]
        else:
            vegetarian_protein_ingredients_freq_dict[i] = ingredients_cat[i]
    vegetarian_protein_ingredients_measurement_dict = {**vegetarian_protein_ingredients_measurement_dict, **ingredients_measurement_dict_cat}
    
vegetarian_protein_ingredients_freq_dict

Bottom layer:
https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/
Mashed potato layer:
https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/
CPU times: user 2min, sys: 16.1 s, total: 2min 16s
Wall time: 4min 16s


In [170]:
vegetarian_protein_ingredients_measurement_dict

{'Worcestershire sauce': 'tablespoon',
 'chili powder': 'teaspoons',
 'salt': 'pinch',
 'mustard': 'tablespoons',
 'water': 'cups',
 'tomato sauce': 'can',
 'onion': 'tablespoon',
 'smoke flavoring': 'teaspoon',
 'pepper': 'jalapeno',
 'burger crumbles': 'package',
 'pepper flakes': 'teaspoon',
 'garlic': 'cloves',
 'bell pepper': 'cups',
 'pepper to taste': None,
 'sesame oil': 'tablespoons',
 'mushrooms': 'can',
 'zucchini': None,
 'sriracha chili garlic sauce': 'tablespoons',
 'tofu': 'package',
 'soy sauce': 'tablespoons',
 'seitan': 'package',
 'oil': 'tablespoons',
 'coriander': 'teaspoons',
 'garlic powder': 'teaspoon',
 'tamari': 'tablespoon',
 'cumin': 'teaspoons',
 'yeast': 'tablespoons',
 'carrots': 'cups',
 'seasoning': 'teaspoon',
 'potatoes': 'cups',
 'vegetarian ground beef substitute': 'package',
 'vegan cream cheese (such as Tofutti)': 'tablespoons',
 'celery': 'cups',
 'clove garlic': None,
 'soy milk': 'cup',
 'peas': 'cup',
 'olive oil': 'tablespoons',
 'tomato': 's

In [171]:
%%time
meat_ingredients_freq_dict = dict()
meat_ingredients_measurement_dict = dict()
for number in range(200):
    meat_url = 'https://www.allrecipes.com/recipes/{}/meat-and-poultry/'.format(number)
    ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(meat_url)
    for i in ingredients_cat.keys():
        if i in meat_ingredients:
            meat_ingredients_freq_dict[i] += ingredients_cat[i]
        else:
            meat_ingredients_freq_dict[i] = ingredients_cat[i]
    meat_ingredients_measurement_dict = {**meat_ingredients_measurement_dict, **ingredients_measurement_dict}
    
meat_ingredients_freq_dict

Frosting:
https://www.allrecipes.com/recipe/10549/best-brownies/


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [172]:
meat_ingredients_measurement_dict

{'olive oil': 'tablespoon',
 'alt and pepper to taste': None,
 'flour': 'cup',
 'butter': 'cup',
 'tilapia': 'fillets',
 'salt': 'teaspoons',
 'lemon': None,
 'bread crumbs': 'cup',
 'cayenne pepper': 'teaspoon',
 'pepper': 'teaspoons',
 'egg': None,
 'salmon': 'fillets',
 'cracker': None,
 'caper': 'teaspoon',
 'large shrimp - peeled and deveined': 'pounds',
 'shallot': 'tablespoon',
 'garlic': 'cloves',
 'paprika': 'teaspoon',
 'chive': 'tablespoons',
 'clove garlic': None,
 'halibut': 'fillets',
 'parsley': 'tablespoons',
 'lemon juice': 'tablespoon',
 'basil': 'tablespoon',
 'thyme': 'teaspoon',
 'mussel': 'quarts',
 'leaf': 'bay',
 'onion': 'cups',
 'wine': 'cup',
 'ea salt to taste': None,
 'salmon fillets': 'pounds',
 'miso paste': 'tablespoon',
 'mayonnaise': 'cup',
 'Dijon mustard': 'teaspoon',
 'fontina cheese': 'ounces',
 'mozzarella cheese': 'ounces',
 'milk': 'cup',
 'beer': 'cup',
 'Monterey Jack cheese': 'ounces',
 'fresh': None,
 'spinach': 'package',
 'salt to taste': 

In [173]:
%%time
seafood_ingredients_freq_dict = set()
seafood_ingredients_measurement_dict = dict()
for page in range(2):
    seafood_url = 'https://www.allrecipes.com/recipes/93/seafood/' + str(page+1)
    ingredients_cat, ingredients_measurement_dict_cat = get_all_ingredients_in_category(seafood_url)
    for i in ingredients_cat.keys():
        if i in seafood_ingredients_freq_dict:
            seafood_ingredients_freq_dict[i] += ingredients_cat[i]
        else:
            seafood_ingredients_freq_dict[i] = ingredients_cat[i]
    seafood_ingredients_measurement_dict = {**seafood_ingredients_measurement_dict, **ingredients_measurement_dict}
    
seafood_ingredients_freq_dict

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [174]:
seafood_ingredients_measurement_dict

{}

In [175]:
ingredients_categorized = {}
ingredients_categorized['vegetarian_protein'] = {}
ingredients_categorized['vegetarian_protein']['ingredients'] = vegetarian_protein_ingredients_freq_dict
ingredients_categorized['vegetarian_protein']['measurement'] = vegetarian_protein_ingredients_measurement_dict
ingredients_categorized['meat'] = {}
ingredients_categorized['meat']['ingredients'] = meat_ingredients_freq_dict
ingredients_categorized['meat']['measurement'] = meat_ingredients_measurement_dict
ingredients_categorized['seafood'] = {}
ingredients_categorized['seafood']['ingredients'] = seafood_ingredients_freq_dict
ingredients_categorized['seafood']['measurement'] = seafood_ingredients_measurement_dict

In [176]:
with open('ingredients_categorized.json', 'r') as file:
    ingredients_categorized = json.load(file)
file.closed

FileNotFoundError: [Errno 2] No such file or directory: 'ingredients_categorized.json'