# Ingredients

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# test
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'

In [3]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# extract ingredients section from the webpage
ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
ingredients

{'',
 '1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles',
 'Add all ingredients to list'}

In [5]:
# remove unnecessary elements
ingredients.remove('')
ingredients.remove('Add all ingredients to list')

In [6]:
ingredients

{'1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles'}

In [7]:
# extract directions section from the webpage
directions = [element.text.strip() for element in soup.find_all(class_='recipe-directions__list--item')]
directions

['Preheat oven to 350 degrees F (175 degrees C).',
 'In a large pot over medium heat saute spinach, onion, oregano, basil and garlic in the olive oil.  Pour in spaghetti sauce and water; simmer 20 minutes.  In a large bowl mix cottage cheese, mozzarella cheese, Parmesan cheese, parsley, salt, pepper and egg.',
 'Place a small amount of sauce in the bottom of a lasagna pan.  Place 4 uncooked noodles on top of sauce and top with layer of sauce.  Add 4 more noodles and layer with 1/2 sauce and 1/2 cheese mixture, noodles and repeat until all is layered, finishing with sauce.',
 'Cover with foil and bake in a preheated oven for 55 minutes. Remove foil and bake another 15 minutes.  Let sit 10 minutes before serving.',
 '']

In [8]:
# remove unnecessary elements
directions.remove('')

In [9]:
directions

['Preheat oven to 350 degrees F (175 degrees C).',
 'In a large pot over medium heat saute spinach, onion, oregano, basil and garlic in the olive oil.  Pour in spaghetti sauce and water; simmer 20 minutes.  In a large bowl mix cottage cheese, mozzarella cheese, Parmesan cheese, parsley, salt, pepper and egg.',
 'Place a small amount of sauce in the bottom of a lasagna pan.  Place 4 uncooked noodles on top of sauce and top with layer of sauce.  Add 4 more noodles and layer with 1/2 sauce and 1/2 cheese mixture, noodles and repeat until all is layered, finishing with sauce.',
 'Cover with foil and bake in a preheated oven for 55 minutes. Remove foil and bake another 15 minutes.  Let sit 10 minutes before serving.']

In [10]:
import re

In [11]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length
    pattern = re.compile(r'\([\w\s]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match

In [12]:
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r', [\w\s]*') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match

In [13]:
def extract_quantity_measurement_preparation(line):
    quantity_split = []
    measurement = None
    
    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
        line = re.sub(r'{0}'.format(preparation[0]), '', line)
        # remove ', ' prefix
        preparation = preparation[0][2:]  
    
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    # extract quantity from the first word
    line_split = line.split()
    quantity_split.append(line_split[0])
    
    # extract quantity from the second word if the word contains a digit
    if any(char.isdigit() for char in line_split[1]):
        quantity_split.append(line_split[1])
        measurement = line_split[2]
    else:
        # to adjust for case like '1 egg' or '1/2 onion, chopped'
        if len(line_split) > 2:
            measurement = line_split[1]
    
    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)

    return ' '.join(quantity_split), measurement, preparation

In [14]:
def extract_ingredient_name(line):
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    if measurement is None:
        measurement = ''
        
    if preparation:
        return line[len(quantity + ' ' + measurement) : -(len(preparation) + 2)].strip()
    else:
        return line[len(quantity + ' ' + measurement):].strip()

In [15]:
extract_ingredient_name('1/2 onion, chopped')

'onion'

In [16]:
import spacy

# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [17]:
tokenize('1 tablespoon extra virgin olive oil')

[('1', 'CD'),
 ('tablespoon', 'VBP'),
 ('extra', 'JJ'),
 ('virgin', 'JJ'),
 ('olive', 'NN'),
 ('oil', 'NN')]

In [18]:
spacy.explain('VBP')

'verb, non-3rd person singular present'

In [19]:
import nltk
# nltk.download('averaged_perceptron_tagger')

def tokenize_nltk(line):
    tokens = nltk.word_tokenize(line)
    token_tag_pairs = nltk.pos_tag(tokens)
    return token_tag_pairs

In [20]:
tokenize('1 tablespoon extra virgin olive oil')

[('1', 'CD'),
 ('tablespoon', 'VBP'),
 ('extra', 'JJ'),
 ('virgin', 'JJ'),
 ('olive', 'NN'),
 ('oil', 'NN')]

In [21]:
def extract_descriptor(ingredient_name):
    descriptor = []
    token_tag_pairs = tokenize(ingredient_name)
    for pair in token_tag_pairs:
        if pair[1] == "JJ" or pair[1] == "VBN":
            descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)

In [22]:
def check_noun_num(token_tag_pairs):
    tag_num_dict = {}
    for pair in token_tag_pairs:
        if pair[1] not in tag_num_dict:
            tag_num_dict[pair[1]] = 1
        else:
            tag_num_dict[pair[1]] += 1
    
    criterion_1 = 'NN' in tag_num_dict and tag_num_dict['NN'] >= 2
    criterion_2 = 'NNS' in tag_num_dict and tag_num_dict['NNS'] >= 2
    criterion_3 = 'NN' in tag_num_dict and 'NNS' in tag_num_dict and tag_num_dict['NN'] + tag_num_dict['NNS'] >= 2
    
    if criterion_1 or criterion_2 or criterion_3:
        return True
    else:
        return False

In [23]:
ingredients_nouns = set()
def extract_ingredients_nouns(line):
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        if pair[1] == 'NN' or pair[1] == 'NNS':
            global ingredients_nouns
            ingredients_nouns |= {pair[0]}

In [24]:
for line in ingredients:
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    ingredient_name = extract_ingredient_name(line)
    descriptor = extract_descriptor(ingredient_name)
    extract_ingredients_nouns(line)
    
    print(line)
    print('ingredient name:', ingredient_name)
    print('descriptor:', descriptor)
    print('quantity:', quantity)
    print('measurement:', measurement)
    print('preparation:', preparation)
    print()
    
#     token_tag_pairs = tokenize(line)
#     print(token_tag_pairs)
#     print(check_noun_num(token_tag_pairs))
#     print()
print(ingredients_nouns)

8 ounces lasagna noodles
ingredient name: lasagna noodles
descriptor: None
quantity: 8
measurement: ounces
preparation: None

2 (10 ounce) packages frozen chopped spinach
ingredient name: frozen chopped spinach
descriptor: frozen
quantity: 2 (10 ounce)
measurement: packages
preparation: None

1 egg
ingredient name: egg
descriptor: None
quantity: 1
measurement: None
preparation: None

2 cups non-fat cottage cheese
ingredient name: non-fat cottage cheese
descriptor: None
quantity: 2
measurement: cups
preparation: None

1 teaspoon salt
ingredient name: salt
descriptor: None
quantity: 1
measurement: teaspoon
preparation: None

1/2 onion, chopped
ingredient name: onion
descriptor: None
quantity: 1/2
measurement: None
preparation: chopped

1/2 teaspoon dried basil
ingredient name: dried basil
descriptor: dried
quantity: 1/2
measurement: teaspoon
preparation: None

1 1/2 cups water
ingredient name: water
descriptor: None
quantity: 1 1/2
measurement: cups
preparation: None

1 (32 ounce) jar sp

# Tools

In [25]:
from nltk import sent_tokenize

directions_nouns = set()
# adjust for special case
directions_nouns |= {'oven'}

def extract_directions_nouns(line):
    direction_nouns_local = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # avoid case like 'degrees C'
        if len(pair[0]) > 1:
            if pair[1] == 'NN' or pair[1] == 'NNS':
                direction_nouns_local |= {pair[0]}
    global directions_nouns
    directions_nouns |= direction_nouns_local
    return direction_nouns_local

In [27]:
for direction in directions:
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        print(sentence)
        extract_directions_nouns(sentence)
    print('---------')
    
print(directions_nouns)

Preheat oven to 350 degrees F (175 degrees C).
---------
In a large pot over medium heat saute spinach, onion, oregano, basil and garlic in the olive oil.
Pour in spaghetti sauce and water; simmer 20 minutes.
In a large bowl mix cottage cheese, mozzarella cheese, Parmesan cheese, parsley, salt, pepper and egg.
---------
Place a small amount of sauce in the bottom of a lasagna pan.
Place 4 uncooked noodles on top of sauce and top with layer of sauce.
Add 4 more noodles and layer with 1/2 sauce and 1/2 cheese mixture, noodles and repeat until all is layered, finishing with sauce.
---------
Cover with foil and bake in a preheated oven for 55 minutes.
Remove foil and bake another 15 minutes.
Let sit 10 minutes before serving.
---------
{'onion', 'oil', 'mozzarella', 'pot', 'cheese', 'Place', 'noodles', 'bake', 'lasagna', 'spinach', 'layer', 'top', 'salt', 'basil', 'spaghetti', 'oven', 'heat', 'saute', 'minutes', 'pepper', 'garlic', 'degrees', 'oregano', 'water', 'sauce', 'pan', 'mixture', 

In [28]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# test
stemmer.stem('Sponges')

'spong'

In [29]:
def retrieve_tool_set():
    url = 'https://www.mealime.com/kitchen-essentials-list'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tools = [element.text for element in soup.find_all(class_='anchor-button')]
    print(tools)
    tools = set([stemmer.stem(tool.split()[-1].strip()) for tool in tools])
    print(tools)
    return tools

In [30]:
tools = retrieve_tool_set()

["Chef's Knife", 'Cutting board', 'Can Opener', 'Measuring Cups', 'Measuring Spoons', 'Mixing Bowls', 'Colander', 'Vegetable Peeler', 'Potato Masher', 'Whisk', 'Salad Spinner', 'Grater', 'Shears', 'Citrus Juicer', 'Garlic Press', 'Paring Knife', 'Bread Knife', 'Honing Steel', 'Knife Sharpener', 'Stainless Steel Skillet', 'Sauté Pan', 'Small Saucepan', 'Medium Saucepan', 'Large Pot', 'Cast Iron Skillet', 'Grill Pan', 'Baking Sheet Pan', 'Casserole Dish', 'Broiler Pan', 'Stockpot', 'Spatula', 'Stirring Spoon', 'Tongs', 'Ladle', 'Oven Mitts', 'Trivet', 'Splatter Guard', 'Thermometer', 'Immersion Blender', 'Kitchen Scale', 'Blender', 'Food Storage Containers', 'Aluminum Foil', 'Parchment Paper', 'Towels', 'Sponges', 'Heavy Sponges', 'Dish Rack', 'Ice Cube Tray', 'Small Trash Bags', 'Large Trash Bags', 'Small Trash Bin', 'Large Trash Bin']
{'dish', 'bag', 'spinner', 'stockpot', 'shear', 'grater', 'board', 'knife', 'thermomet', 'guard', 'peeler', 'saucepan', 'steel', 'pan', 'tong', 'bowl', '

In [31]:
def extract_tools(directions_nouns, tools):
    directions_tools = set()
    for noun in directions_nouns:
        if stemmer.stem(noun) in tools:
            directions_tools |= {noun}
    return directions_tools

extract_tools(directions_nouns, tools)

{'bowl', 'foil', 'pan', 'pot'}

# Methods

In [32]:
def retrieve_method_set():
    url = 'https://www.thedailymeal.com/cook/15-basic-cooking-methods-you-need-know-slideshow/slide-13'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    methods = [element.h2.text for element in soup.find_all(class_='image-title slide-title')]
    print(methods)
    methods = set([stemmer.stem(method.split('-')[-1].strip()) for method in methods])
    print(methods)
    return methods

In [33]:
methods = retrieve_method_set()

['Grill', 'Pan-Fry', 'Deep-Fry', 'Sauté', 'Boil', 'Roast', 'Bake', 'Sear', 'Poach', 'Simmer', 'Broil  ', 'Steam', 'Blanch', 'Braise', 'Stew']
{'fri', 'poach', 'grill', 'sauté', 'bake', 'steam', 'boil', 'sear', 'broil', 'stew', 'simmer', 'roast', 'blanch', 'brais'}


In [34]:
directions_verbs = set()
def extract_directions_verbs(line):
    direction_verbs_local = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:    
        if pair[1] == 'VB':
            direction_verbs_local |= {pair[0]}
    global directions_verbs
    directions_verbs |= direction_verbs_local
    return direction_verbs_local

In [35]:
for direction in directions:
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        print(sentence)
        extract_directions_verbs(sentence)
    print('---------')
    
print(directions_verbs)

Preheat oven to 350 degrees F (175 degrees C).
---------
In a large pot over medium heat saute spinach, onion, oregano, basil and garlic in the olive oil.
Pour in spaghetti sauce and water; simmer 20 minutes.
In a large bowl mix cottage cheese, mozzarella cheese, Parmesan cheese, parsley, salt, pepper and egg.
---------
Place a small amount of sauce in the bottom of a lasagna pan.
Place 4 uncooked noodles on top of sauce and top with layer of sauce.
Add 4 more noodles and layer with 1/2 sauce and 1/2 cheese mixture, noodles and repeat until all is layered, finishing with sauce.
---------
Cover with foil and bake in a preheated oven for 55 minutes.
Remove foil and bake another 15 minutes.
Let sit 10 minutes before serving.
---------
{'Cover', 'Pour', 'Place', 'bake', 'Let', 'Add', 'repeat', 'Remove', 'simmer', 'sit', 'Preheat'}


In [36]:
def extract_methods(directions_verbs, methods):
    directions_methods = set()
    for verb in directions_verbs:
        if stemmer.stem(verb) in methods:
            directions_methods |= {verb}
    return directions_methods

extract_methods(directions_verbs, methods)

{'bake', 'simmer'}

# Steps

In [37]:
for i, direction in enumerate(directions):
    print('Step:', i+1)
    print('Direction:', direction)

    single_direction_tools = set()
    single_direction_methods = set()
    single_direction_ingredients = set()
    
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        single_direction_tools |= extract_tools(extract_directions_nouns(sentence), tools)
        single_direction_methods |= extract_methods(extract_directions_verbs(sentence), methods)
        single_direction_ingredients |= extract_tools(extract_directions_nouns(sentence), ingredients_nouns)
    if len(single_direction_tools) > 0:
        print('tools:', single_direction_tools)
    if len(single_direction_methods) > 0:
        print('methods:', single_direction_methods)
    if len(single_direction_ingredients) > 0:
        print('ingredients:', single_direction_ingredients)
    print('---------')

Step: 1
Direction: Preheat oven to 350 degrees F (175 degrees C).
---------
Step: 2
Direction: In a large pot over medium heat saute spinach, onion, oregano, basil and garlic in the olive oil.  Pour in spaghetti sauce and water; simmer 20 minutes.  In a large bowl mix cottage cheese, mozzarella cheese, Parmesan cheese, parsley, salt, pepper and egg.
tools: {'bowl', 'pot'}
methods: {'simmer'}
ingredients: {'onion', 'oil', 'pepper', 'mozzarella', 'salt', 'water', 'oregano', 'spinach', 'egg', 'basil', 'spaghetti'}
---------
Step: 3
Direction: Place a small amount of sauce in the bottom of a lasagna pan.  Place 4 uncooked noodles on top of sauce and top with layer of sauce.  Add 4 more noodles and layer with 1/2 sauce and 1/2 cheese mixture, noodles and repeat until all is layered, finishing with sauce.
tools: {'pan'}
ingredients: {'lasagna'}
---------
Step: 4
Direction: Cover with foil and bake in a preheated oven for 55 minutes. Remove foil and bake another 15 minutes.  Let sit 10 minute