# Ingredients

In [1]:
import re
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# test
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'
url = 'https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/'

In [3]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
times = set([element.text.strip() for element in soup.find_all(class_='prepTime__item')])
# # remove uncessary elements
times.remove('')
times

{'Cook55 m', 'Prep20 m', 'Ready In1 h 15 m'}

In [5]:
for time in times:
    if 'prep' in time.lower():
        prep_time = time[4:]
    if 'cook' in time.lower():
        cook_time = time[4:]
print('prep_time:', prep_time)
print('cook_time:', cook_time)

prep_time: 20 m
cook_time: 55 m


In [6]:
def convert_to_minutes(time):
    if 'h' in cook_time:
        hour_index = cook_time.index('h')
        hours = int(cook_time[:hour_index].strip())
        minutes = int(cook_time[hour_index+1 : -1].strip())
    else:
        hours = 0
        minutes = int(cook_time[: -1].strip())
    return 60*hours + minutes

In [7]:
# extract ingredients section from the webpage
ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
# remove unnecessary elements
ingredients.remove('')
ingredients.remove('Add all ingredients to list')
ingredients

{'1 (14 ounce) package vegetarian ground beef substitute',
 '1 clove garlic, minced, or more to taste',
 '1 large yellow onion, chopped',
 '1 pinch ground black pepper to taste',
 '1 tablespoon vegetable oil',
 '1 teaspoon Italian seasoning',
 '1 tomato, chopped',
 '1/2 cup frozen peas',
 '1/2 cup shredded Cheddar-style soy cheese',
 '1/2 cup soy milk',
 '1/2 cup vegan mayonnaise',
 '1/4 cup olive oil',
 '2 carrots, chopped',
 '2 teaspoons salt',
 '3 stalks celery, chopped',
 '3 tablespoons vegan cream cheese substitute (such as Tofutti ®)',
 '5 russet potatoes, peeled and cut into 1-inch cubes',
 'Bottom layer:',
 'Mashed potato layer:'}

In [8]:
# extract directions section from the webpage
directions = [element.text.strip() for element in soup.find_all(class_='recipe-directions__list--item')]
# remove unnecessary elements
directions.remove('')
directions

['Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat. Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.',
 'Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy. Set the potatoes aside.',
 'Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.',
 'Heat the vegetable oil in a large skillet over medium heat, and cook and stir the onion, carrots, celery, frozen peas, and tomato until softened, about 10 minutes. Stir in the Italian seasoning, garlic, and pepper.',
 'Reduce the heat to medium-low, and crumble the vegetarian ground beef substitute into the skillet with the vegetables. Cook and stir, breaking up the meat substitute, until the mixture is hot, about 5 minutes.',
 'Spread the vegetarian meat substitute mixture into the bottom of the baking dish, and top with

In [9]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

def nouns_only(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if not (pair[1] == "NN" or pair[1] == "NNS"):
            return False
    return True

In [10]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length and 'abc' does not contain brackets
    pattern = re.compile(r'\([^\(\)]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        if line.split()[1] not in match[0]:
            return None
        return match
    
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r'\b, [^\(\)]*')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match
    
def extract_all(line):    
    type_exceptions = ['can', 'tablespoon', 'oz']
    quantity_split = []
    measurement = None
    
    # extract preparation
    line = line.replace(' -', ',')
    preparation = extract_preparation(line)
    if preparation:
#         print('preparation[0]:', preparation[0])
#         print(line)
        line = re.sub(r'{0}'.format(preparation[0]), '', line)
        # remove ', ' prefix
        preparation = preparation[0][2:].strip()
        
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    line_split = line.split()
    # extract quantity from the first word if the word contains a digit
    if any(char.isdigit() for char in line_split[0]):
        quantity_split.append(line_split[0])
    
        # extract quantity from the second word if the word contains a digit
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            # check measurement type
            # to avoid case like '1 large tomato, seeded and chopped'
            if nouns_only(line_split[2]) or line_split[2] in type_exceptions:
                measurement = line_split[2]
        else:
            # check line_split length for case like '1 egg' or '1/2 onion, chopped'
            if len(line_split) > 2 and (nouns_only(line_split[1]) or line_split[1] in type_exceptions):
                measurement = line_split[1]
        line = re.sub(r'{0}'.format(' '.join(quantity_split)), '', line)
    
    if measurement:
        line = re.sub(r'{0}'.format(measurement), '', line)
    
    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)
    quantity = ' '.join(quantity_split)

    # extract ingredient name
    line = re.sub(r'[ ]?®', '', line)
    line = re.sub(r'[ ]?™', '', line)
    ingredient_name = line.strip()
    
    # if 'or to taske' or 'or as needed' in preparation
    if preparation is not None and 'or ' in preparation:
        quantity += ' ' + preparation
        preparation = None    
    
    return quantity, measurement, ingredient_name, preparation

In [11]:
def extract_descriptor(ingredient_name):
    type_exceptions = ['parsley', 'garlic', 'chili']
    descriptor = []
    token_tag_pairs = []
    
    for element in ingredient_name.split():
        # treat compound word with hyphen as an adjective
        if '-' in element:
            token_tag_pairs.append((element, 'JJ'))
        else:
            token_tag_pairs.append([(token.text, token.tag_) for token in nlp(element)][0])
    
    for pair in token_tag_pairs:
        # if the word is an adjective, an adverb, or a past participle of a verb, or exception like 'ground'
        if pair[1] == "JJ" or pair[1] == "RB" or pair[1] == "VBN" or pair[0] == 'ground':
            if pair[0] not in type_exceptions:
                descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)

In [12]:
import spacy

# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [15]:
tokenize('1 tablespoon extra virgin olive oil')
tokenize('5 russet potatoes, peeled and cut into 1-inch cubes')

[('5', 'CD'),
 ('russet', 'NN'),
 ('potatoes', 'NNS'),
 (',', ','),
 ('peeled', 'VBD'),
 ('and', 'CC'),
 ('cut', 'VBN'),
 ('into', 'IN'),
 ('1-inch', 'JJ'),
 ('cubes', 'NNS')]

In [16]:
spacy.explain('VBZ')

'verb, 3rd person singular present'

In [17]:
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def tokenize_nltk(line):
    tokens = nltk.word_tokenize(line)
    token_tag_pairs = nltk.pos_tag(tokens)
    return token_tag_pairs

In [18]:
tokenize_nltk('1 tablespoon extra virgin olive oil')

[('1', 'CD'),
 ('tablespoon', 'NN'),
 ('extra', 'JJ'),
 ('virgin', 'NN'),
 ('olive', 'JJ'),
 ('oil', 'NN')]

In [19]:
def check_noun_num(token_tag_pairs):
    tag_num_dict = {}
    for pair in token_tag_pairs:
        if pair[1] not in tag_num_dict:
            tag_num_dict[pair[1]] = 1
        else:
            tag_num_dict[pair[1]] += 1
    
    criterion_1 = 'NN' in tag_num_dict and tag_num_dict['NN'] >= 2
    criterion_2 = 'NNS' in tag_num_dict and tag_num_dict['NNS'] >= 2
    criterion_3 = 'NN' in tag_num_dict and 'NNS' in tag_num_dict and tag_num_dict['NN'] + tag_num_dict['NNS'] >= 2
    
    if criterion_1 or criterion_2 or criterion_3:
        return True
    else:
        return False

In [20]:
ingredients_nouns = set()
def extract_ingredients_nouns(line):
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        if pair[1] == 'NN' or pair[1] == 'NNS':
            global ingredients_nouns
            ingredients_nouns |= {pair[0]}

In [23]:
for line in ingredients:
    quantity, measurement, ingredient_name, preparation = extract_all(line)
    descriptor = extract_descriptor(ingredient_name)
    extract_ingredients_nouns(ingredient_name)
    
    print(line)
    print('quantity:', quantity)
    print('measurement:', measurement)
    print('ingredient name:', ingredient_name)
    print('descriptor:', descriptor)
    print('preparation:', preparation)
    print()
    
#     print(check_noun_num(token_tag_pairs))
#     print()
print(ingredients_nouns)

1 tomato, chopped
quantity: 1
measurement: None
ingredient name: tomato
descriptor: None
preparation: chopped

1 clove garlic, minced, or more to taste
quantity: 1 minced, or more to taste
measurement: None
ingredient name: clove garlic
descriptor: None
preparation: None

1 (14 ounce) package vegetarian ground beef substitute
quantity: 1 (14 ounce)
measurement: package
ingredient name: vegetarian ground beef substitute
descriptor: vegetarian ground substitute
preparation: None

Bottom layer:
quantity: 
measurement: None
ingredient name: Bottom layer:
descriptor: None
preparation: None

1/4 cup olive oil
quantity: 1/4
measurement: cup
ingredient name: olive oil
descriptor: None
preparation: None

1 large yellow onion, chopped
quantity: 1
measurement: None
ingredient name: large yellow onion
descriptor: large yellow
preparation: chopped

2 carrots, chopped
quantity: 2
measurement: None
ingredient name: carrots
descriptor: None
preparation: chopped

3 tablespoons vegan cream cheese substi

# Tools

In [24]:
from nltk import sent_tokenize

directions_nouns = set()
# adjust for special case
directions_nouns |= {'oven'}

def extract_directions_nouns(line):
    direction_nouns_local = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # avoid case like 'degrees C'
        if len(pair[0]) > 1:
            if pair[1] == 'NN' or pair[1] == 'NNS':
                direction_nouns_local |= {pair[0]}
    global directions_nouns
    directions_nouns |= direction_nouns_local
    return direction_nouns_local

In [25]:
for direction in directions:
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        print(sentence)
        extract_directions_nouns(sentence)
    print('---------')
    
print(directions_nouns)

Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat.
Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.
---------
Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy.
Set the potatoes aside.
---------
Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.
---------
Heat the vegetable oil in a large skillet over medium heat, and cook and stir the onion, carrots, celery, frozen peas, and tomato until softened, about 10 minutes.
Stir in the Italian seasoning, garlic, and pepper.
---------
Reduce the heat to medium-low, and crumble the vegetarian ground beef substitute into the skillet with the vegetables.
Cook and stir, breaking up the meat substitute, until the mixture is hot, about 5 minutes.
---------
Spread the vegetarian meat substitute mixture into the bottom of t

In [26]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# test
stemmer.stem('Sponges')

'spong'

In [27]:
def retrieve_tool_set():
    url = 'https://www.mealime.com/kitchen-essentials-list'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tools = [element.text for element in soup.find_all(class_='anchor-button')]
    print(tools)
    tools = set([stemmer.stem(tool.split()[-1].strip()) for tool in tools])
    print(tools)
    return tools

In [28]:
tools = retrieve_tool_set()

["Chef's Knife", 'Cutting board', 'Can Opener', 'Measuring Cups', 'Measuring Spoons', 'Mixing Bowls', 'Colander', 'Vegetable Peeler', 'Potato Masher', 'Whisk', 'Salad Spinner', 'Grater', 'Shears', 'Citrus Juicer', 'Garlic Press', 'Paring Knife', 'Bread Knife', 'Honing Steel', 'Knife Sharpener', 'Stainless Steel Skillet', 'Sauté Pan', 'Small Saucepan', 'Medium Saucepan', 'Large Pot', 'Cast Iron Skillet', 'Grill Pan', 'Baking Sheet Pan', 'Casserole Dish', 'Broiler Pan', 'Stockpot', 'Spatula', 'Stirring Spoon', 'Tongs', 'Ladle', 'Oven Mitts', 'Trivet', 'Splatter Guard', 'Thermometer', 'Immersion Blender', 'Kitchen Scale', 'Blender', 'Food Storage Containers', 'Aluminum Foil', 'Parchment Paper', 'Towels', 'Sponges', 'Heavy Sponges', 'Dish Rack', 'Ice Cube Tray', 'Small Trash Bags', 'Large Trash Bags', 'Small Trash Bin', 'Large Trash Bin']
{'dish', 'pan', 'thermomet', 'bag', 'coland', 'stockpot', 'board', 'rack', 'skillet', 'masher', 'ladl', 'foil', 'spinner', 'knife', 'shear', 'press', 'tr

In [29]:
def extract_tools(directions_nouns, tools):
    directions_tools = set()
    for noun in directions_nouns:
        if stemmer.stem(noun) in tools:
            directions_tools |= {noun}
    return directions_tools

extract_tools(directions_nouns, tools)

{'dish', 'masher', 'pot', 'skillet'}

# Methods

In [30]:
def retrieve_methods_set():
    url = 'https://www.thedailymeal.com/cook/15-basic-cooking-methods-you-need-know-slideshow/slide-13'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    methods = [element.h2.text for element in soup.find_all(class_='image-title slide-title')]
    print(methods)
    methods = set([stemmer.stem(method.strip()) for method in methods])
    return methods

In [31]:
methods = retrieve_methods_set()
methods

['Grill', 'Pan-Fry', 'Deep-Fry', 'Sauté', 'Boil', 'Roast', 'Bake', 'Sear', 'Poach', 'Simmer', 'Broil  ', 'Steam', 'Blanch', 'Braise', 'Stew']


{'bake',
 'blanch',
 'boil',
 'brais',
 'broil',
 'deep-fri',
 'grill',
 'pan-fri',
 'poach',
 'roast',
 'sauté',
 'sear',
 'simmer',
 'steam',
 'stew'}

In [32]:
def retrieve_other_methods_set():
    url = 'https://en.wikibooks.org/wiki/Cookbook:Cooking_Techniques'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    unwanted = ['Contents', '[', 'edit', ']', '\n']
    other_methods = set()
    dump = soup.find_all(class_='mw-parser-output')
    for i in dump:
        for j in i.contents:
            if hasattr(j, 'contents'):
                for k in j.contents:
                    if hasattr(k, 'contents'):
                        for l in k.contents:
                            if hasattr(l, 'contents'):
                                for method in l:
#                                     print(method.string)
                                    if method.string is not None and method not in unwanted:
                                        other_methods |= {stemmer.stem(method.string.split()[-1])}
    return other_methods

In [33]:
other_methods = retrieve_other_methods_set()
# remove uncessary methods after complexity reduction
other_methods.remove('cook')
other_methods.remove('chocol')

In [34]:
# combine primary cooking methods with other cooking methods
methods |= other_methods
methods

{'bain-mari',
 'bake',
 'barbecu',
 'bast',
 'blanch',
 'boil',
 'bone',
 'brais',
 'brine',
 'broil',
 'can',
 'caramel',
 'chiffonad',
 'chop',
 'cockaign',
 'cream',
 'cube',
 'deep-fri',
 'deglaz',
 'degorg',
 'dredg',
 'dri',
 'ferment',
 'fri',
 'grill',
 'julien',
 'marin',
 'minc',
 'pan-fri',
 'pickl',
 'poach',
 'roast',
 'rub',
 'sauté',
 'scald',
 'sear',
 'shir',
 'simmer',
 'skill',
 'slice',
 'smoke',
 'sous-vid',
 'steam',
 'stew',
 'stir-fri',
 'storag',
 'temper',
 'test'}

In [35]:
directions_verbs = set()
def extract_directions_verbs(line):
    direction_verbs_local = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:    
        if pair[1] == 'VB':
            direction_verbs_local |= {pair[0]}
    global directions_verbs
    directions_verbs |= direction_verbs_local
    return direction_verbs_local

In [36]:
for direction in directions:
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        print(sentence)
        extract_directions_verbs(sentence)
    print('---------')
    
print(directions_verbs)

Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat.
Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.
---------
Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy.
Set the potatoes aside.
---------
Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.
---------
Heat the vegetable oil in a large skillet over medium heat, and cook and stir the onion, carrots, celery, frozen peas, and tomato until softened, about 10 minutes.
Stir in the Italian seasoning, garlic, and pepper.
---------
Reduce the heat to medium-low, and crumble the vegetarian ground beef substitute into the skillet with the vegetables.
Cook and stir, breaking up the meat substitute, until the mixture is hot, about 5 minutes.
---------
Spread the vegetarian meat substitute mixture into the bottom of t

In [37]:
def extract_methods(directions_verbs, methods):
    directions_methods = set()
    for verb in directions_verbs:
        if stemmer.stem(verb) in methods:
            directions_methods |= {verb}
    return directions_methods

extract_methods(directions_verbs, methods)

{'boil'}

# Steps

In [38]:
average_cook_time_per_step = round(convert_to_minutes(cook_time) / (len(directions) - 1))

for i, direction in enumerate(directions):
    print('Step:', i+1)
    print('Direction:', direction)
    if i == 0:
        print('prep time:', prep_time)
    else:
        print('average cook time: {0} m'.format(average_cook_time_per_step))
        
    single_direction_tools = set()
    single_direction_methods = set()
    single_direction_ingredients = set()
    
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        single_direction_tools |= extract_tools(extract_directions_nouns(sentence), tools)
        single_direction_methods |= extract_methods(extract_directions_verbs(sentence), methods)
        single_direction_ingredients |= extract_tools(extract_directions_nouns(sentence), ingredients_nouns)
    
    if len(single_direction_tools) > 0:
        print('tools:', single_direction_tools)
    if len(single_direction_methods) > 0:
        print('methods:', single_direction_methods)
    if len(single_direction_ingredients) > 0:
        print('ingredients:', single_direction_ingredients)
    print('---------')

Step: 1
Direction: Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat. Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.
prep time: 20 m
tools: {'pot'}
methods: {'boil'}
ingredients: {'potatoes'}
---------
Step: 2
Direction: Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy. Set the potatoes aside.
average cook time: 9 m
tools: {'masher'}
ingredients: {'oil', 'potatoes', 'milk', 'potato', 'vegan', 'soy'}
---------
Step: 3
Direction: Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.
average cook time: 9 m
tools: {'dish'}
---------
Step: 4
Direction: Heat the vegetable oil in a large skillet over medium heat, and cook and stir the onion, carrots, celery, frozen peas, and tomato until softened, about 10 minutes. Stir in the Italian seasoning, garlic, and pepper