# Ingredients

In [195]:
import re
import pickle

import spacy
import requests
from bs4 import BeautifulSoup

In [196]:
# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [197]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# test
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'
url = 'https://www.allrecipes.com/recipe/180735/traditional-style-vegan-shepherds-pie/'

In [198]:
def extract_time(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    times = set([element.text.strip() for element in soup.find_all(class_='prepTime__item')])
    # remove uncessary elements
    times.remove('')
    for time in times:
        if 'prep' in time.lower():
            prep_time = time[4:]
        if 'cook' in time.lower():
            cook_time = time[4:]
    return prep_time, cook_time

# test
print('prep_time, cook_time:', extract_time(url)) 

prep_time, cook_time: ('20 m', '55 m')


In [199]:
def convert_to_minutes(time):
    if 'h' in cook_time:
        hour_index = cook_time.index('h')
        hours = int(cook_time[:hour_index].strip())
        minutes = int(cook_time[hour_index+1 : -1].strip())
    else:
        hours = 0
        minutes = int(cook_time[: -1].strip())
    return 60*hours + minutes

In [200]:
def get_recipe_name(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find_all("h1", {"class": "recipe-summary__h1"})[0].text

# test
get_recipe_name(url)

"Traditional Style Vegan Shepherd's Pie"

In [201]:
def get_ingredient_list_and_directions(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # extract ingredients section from the webpage
    ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
    # remove unnecessary elements
    unnecessary = ['', 'Add all ingredients to list']
    for i in unnecessary:
        if i in ingredients:
            ingredients.remove(i)
            
    # extract directions section from the webpage
    directions = [element.text.strip() for element in soup.find_all(class_='recipe-directions__list--item')]
    # remove unnecessary elements
    directions.remove('')
    return ingredients, directions

# test
ingredients, directions = get_ingredient_list_and_directions(url)
print(directions)
ingredients

['Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat. Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.', 'Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy. Set the potatoes aside.', 'Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.', 'Heat the vegetable oil in a large skillet over medium heat, and cook and stir the onion, carrots, celery, frozen peas, and tomato until softened, about 10 minutes. Stir in the Italian seasoning, garlic, and pepper.', 'Reduce the heat to medium-low, and crumble the vegetarian ground beef substitute into the skillet with the vegetables. Cook and stir, breaking up the meat substitute, until the mixture is hot, about 5 minutes.', 'Spread the vegetarian meat substitute mixture into the bottom of the baking dish, and top with the 

{'1 (14 ounce) package vegetarian ground beef substitute',
 '1 clove garlic, minced, or more to taste',
 '1 large yellow onion, chopped',
 '1 pinch ground black pepper to taste',
 '1 tablespoon vegetable oil',
 '1 teaspoon Italian seasoning',
 '1 tomato, chopped',
 '1/2 cup frozen peas',
 '1/2 cup shredded Cheddar-style soy cheese',
 '1/2 cup soy milk',
 '1/2 cup vegan mayonnaise',
 '1/4 cup olive oil',
 '2 carrots, chopped',
 '2 teaspoons salt',
 '3 stalks celery, chopped',
 '3 tablespoons vegan cream cheese substitute (such as Tofutti ®)',
 '5 russet potatoes, peeled and cut into 1-inch cubes',
 'Bottom layer:',
 'Mashed potato layer:'}

In [202]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

def nouns_only(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if not (pair[1] == "NN" or pair[1] == "NNS"):
            return False
    return True

In [310]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length and 'abc' does not contain brackets
    pattern = re.compile(r'\([^\(\)]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        # if no numerical value or line_split length > 3 
        if not any(char.isdigit() for char in match[0]) or len(match[0].split()) > 3:
            return None
        return match
    
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r'[\b]?, [^\(\)]*')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match
    
def extract_all(line):
#     print(line)
    type_exceptions = ['can', 'tablespoon', 'oz', 'clove']
    quantity_split = []
    measurement = None
    
    # extract preparation
    line = line.replace(' -', ',')
    preparation = extract_preparation(line)
    if preparation:
#         print('preparation[0]: {0}|'.format(preparation[0]))
        preparation = preparation[0].strip()
        line = re.sub(r'{0}'.format(preparation), '', line)
        # remove ', ' prefix
        preparation = preparation[2:]
        
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    line_split = line.split()
    # extract quantity from the first word if the word contains a digit
    if any(char.isdigit() for char in line_split[0]):
        quantity_split.append(line_split[0])
    
        # extract quantity from the second word if the word contains a digit
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            # check measurement type
            # to avoid case like '1 large tomato, seeded and chopped'
            if nouns_only(line_split[2]) or line_split[2] in type_exceptions:
                measurement = line_split[2]
        else:
            # check line_split length for case like '1 egg' or '1/2 onion, chopped'
            if len(line_split) > 2 and (nouns_only(line_split[1]) or line_split[1] in type_exceptions):
                measurement = line_split[1]
        line = re.sub(r'{0}'.format(' '.join(quantity_split)), '', line)
    
    if measurement:
        line = re.sub(r'{0}'.format(measurement), '', line)
    
    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)
    quantity = ' '.join(quantity_split)
    
    # extract ingredient name
    line = re.sub(r'[ ]?®', '', line)
    line = re.sub(r'[ ]?™', '', line)
    ingredient_name = line.strip()
    
    # extract descriptor
    descriptor = extract_descriptor(ingredient_name)
    
    # remove descriptor if not None
    ingredient = ingredient_name.replace(descriptor, '').strip() if descriptor else ingredient_name
    # if ingredient is empty after removing descriptor
    if ingredient == '':
        ingredient = ingredient_name
    # remove ' to taste' in ingredient if any
    ingredient = re.sub(r'(or)? to taste', '', ingredient)
    ingredient = ' '.join(ingredient.split())
    
    # if 'or to taste' or 'or as needed' in preparation
    if preparation is not None and 'or ' in preparation:
        quantity += ' ' + preparation
        preparation = None    
    
    return quantity, measurement, descriptor, ingredient, preparation

In [311]:
def extract_descriptor(ingredient_name):
    type_exceptions = ['parsley', 'garlic', 'chili']
    descriptor = []
    token_tag_pairs = []
    
    for element in ingredient_name.split():
        # treat compound word with hyphen as an adjective
        if '-' in element:
            token_tag_pairs.append((element, 'JJ'))
        else:
            token_tag_pairs.append([(token.text, token.tag_) for token in nlp(element)][0])
    
    for pair in token_tag_pairs:
        # if the word is an adjective, an adverb, or a past participle of a verb, or exception like 'ground'
        if pair[1] == "JJ" or pair[1] == "RB" or pair[1] == "VBN" or pair[0] == 'ground':
            if pair[0] not in type_exceptions:
                descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)

In [312]:
tokenize('1 tablespoon extra virgin olive oil')
tokenize('5 russet potatoes, peeled and cut into 1-inch cubes')

[('5', 'CD'),
 ('russet', 'NN'),
 ('potatoes', 'NNS'),
 (',', ','),
 ('peeled', 'VBD'),
 ('and', 'CC'),
 ('cut', 'VBN'),
 ('into', 'IN'),
 ('1-inch', 'JJ'),
 ('cubes', 'NNS')]

In [313]:
spacy.explain('VBZ')

'verb, 3rd person singular present'

In [314]:
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def tokenize_nltk(line):
    tokens = nltk.word_tokenize(line)
    token_tag_pairs = nltk.pos_tag(tokens)
    return token_tag_pairs

In [316]:
def decompose_ingredients(ingredients):
    for line in ingredients:
        # exceptions like "topping:"
        if ':' in line:
            continue
            
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
            
        print(line)
        print('\t quantity   :', quantity)
        print('\t measurement:', measurement)
        print('\t descriptor :', descriptor)
        print('\t ingredient :', ingredient)
        print('\t preparation:', preparation)
        print()

# test
decompose_ingredients(ingredients)

3 tablespoons vegan cream cheese substitute (such as Tofutti ®)
	 quantity   : 3
	 measurement: tablespoons
	 descriptor : substitute
	 ingredient : vegan cream cheese (such as Tofutti)
	 preparation: None

1 clove garlic, minced, or more to taste
	 quantity   : 1 minced, or more to taste
	 measurement: clove
	 descriptor : None
	 ingredient : garlic
	 preparation: None

1/4 cup olive oil
	 quantity   : 1/4
	 measurement: cup
	 descriptor : None
	 ingredient : olive oil
	 preparation: None

1/2 cup shredded Cheddar-style soy cheese
	 quantity   : 1/2
	 measurement: cup
	 descriptor : shredded Cheddar-style
	 ingredient : soy cheese
	 preparation: None

2 teaspoons salt
	 quantity   : 2
	 measurement: teaspoons
	 descriptor : None
	 ingredient : salt
	 preparation: None

1 pinch ground black pepper to taste
	 quantity   : 1
	 measurement: pinch
	 descriptor : ground black
	 ingredient : pepper
	 preparation: None

5 russet potatoes, peeled and cut into 1-inch cubes
	 quantity   : 5
	 me

# Tools

In [317]:
from nltk import sent_tokenize

In [318]:
def extract_directions_nouns(directions):
    directions_nouns = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
            # check for special cases where spaCy cannot recognize well
            if ' oven' in sentence:
                directions_nouns |= {'oven'}
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                # avoid case like 'degrees C'
                if len(pair[0]) > 1:
                    if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                        directions_nouns |= {pair[0]}
#         print('---------')
    return directions_nouns

    
extract_directions_nouns(directions)

{'Bake',
 'baking',
 'beef',
 'boil',
 'bottom',
 'carrots',
 'casserole',
 'celery',
 'cheese',
 'cooking',
 'cream',
 'degrees',
 'dish',
 'heat',
 'layer',
 'masher',
 'mayonnaise',
 'meat',
 'medium',
 'milk',
 'minutes',
 'mixture',
 'oil',
 'olive',
 'onion',
 'oven',
 'peas',
 'pepper',
 'pot',
 'potato',
 'potatoes',
 'seasoning',
 'skillet',
 'soy',
 'spray',
 'substitute',
 'tender',
 'tomato',
 'top',
 'vegan',
 'vegetable',
 'vegetables',
 'water'}

In [319]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# test
stemmer.stem('Sponges')

'spong'

In [320]:
def retrieve_tools_set():
    try:
        with open('tools.pickle', 'rb') as file:
            tools = pickle.load(file)
            print('loaded tools set successfully')
    except:
        url = 'https://www.mealime.com/kitchen-essentials-list'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tools = [element.text for element in soup.find_all(class_='anchor-button')]
        # reduce each tool to its last word
        tools = set([stemmer.stem(tool.split()[-1].strip()) for tool in tools])
        
        # save retrieved data
        with open('tools.pickle', 'wb') as file:
            pickle.dump(tools, file, protocol=pickle.HIGHEST_PROTOCOL)
    
    return tools

In [321]:
def extract_tools(directions_nouns):
    tools = retrieve_tools_set()
    directions_tools = set()
    for noun in directions_nouns:
        if stemmer.stem(noun) in tools:
            directions_tools |= {noun}
    return directions_tools

extract_tools(directions_nouns)

loaded tools set successfully


{'dish', 'masher', 'pot', 'skillet'}

# Methods

In [322]:
def retrieve_cooking_methods_set():
    try:
        with open('cooking_methods.pickle', 'rb') as file:
            cooking_methods = pickle.load(file)
            print('loaded cooking_methods set successfully')
    except:
        url = 'https://www.thedailymeal.com/cook/15-basic-cooking-methods-you-need-know-slideshow/slide-13'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        cooking_methods = [element.h2.text for element in soup.find_all(class_='image-title slide-title')]
        cooking_methods = set([stemmer.stem(method.strip()) for method in cooking_methods])
        
        # save retrieved data
        with open('cooking_methods.pickle', 'wb') as file:
            pickle.dump(cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return cooking_methods

# test
methods = retrieve_cooking_methods_set()
methods

loaded cooking_methods set successfully


{'bake',
 'blanch',
 'boil',
 'brais',
 'broil',
 'deep-fri',
 'grill',
 'pan-fri',
 'poach',
 'roast',
 'sauté',
 'sear',
 'simmer',
 'steam',
 'stew'}

In [323]:
def retrieve_other_cooking_methods_set():
    try:
        with open('other_cooking_methods.pickle', 'rb') as file:
            other_cooking_methods = pickle.load(file)
            print('loaded other_cooking_methods set successfully')
    except:
        url = 'https://en.wikibooks.org/wiki/Cookbook:Cooking_Techniques'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        unwanted = ['Contents', '[', 'edit', ']', '\n']
        other_cooking_methods = set()
        dump = soup.find_all(class_='mw-parser-output')
        for i in dump:
            for j in i.contents:
                if hasattr(j, 'contents'):
                    for k in j.contents:
                        if hasattr(k, 'contents'):
                            for l in k.contents:
                                if hasattr(l, 'contents'):
                                    for method in l:
    #                                     print(method.string)
                                        if method.string is not None and method not in unwanted:
                                            other_cooking_methods |= {stemmer.stem(method.string.split()[-1])}

        # remove uncessary methods after complexity reduction
        other_cooking_methods.remove('cook')
        other_cooking_methods.remove('chocol')
        
        # save retrieved data
        with open('other_cooking_methods.pickle', 'wb') as file:
            pickle.dump(other_cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return other_cooking_methods

# test
other_methods = retrieve_other_cooking_methods_set()
other_methods

loaded other_cooking_methods set successfully


{'bain-mari',
 'bake',
 'barbecu',
 'bast',
 'blanch',
 'boil',
 'bone',
 'brine',
 'can',
 'caramel',
 'chiffonad',
 'chop',
 'cockaign',
 'cream',
 'cube',
 'deglaz',
 'degorg',
 'dredg',
 'dri',
 'ferment',
 'fri',
 'grill',
 'julien',
 'marin',
 'minc',
 'pan-fri',
 'pickl',
 'poach',
 'roast',
 'rub',
 'sauté',
 'scald',
 'shir',
 'simmer',
 'skill',
 'slice',
 'smoke',
 'sous-vid',
 'steam',
 'stew',
 'stir-fri',
 'storag',
 'temper',
 'test'}

In [324]:
def extract_directions_verbs(directions):
    directions_verbs = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                if pair[1] == 'VB':
                    directions_verbs |= {pair[0]}
#         print('---------')
    return directions_verbs

extract_directions_verbs(directions)

{'Cook',
 'Heat',
 'Place',
 'Preheat',
 'Reduce',
 'Set',
 'Spread',
 'Sprinkle',
 'Stir',
 'Turn',
 'boil',
 'bring',
 'cook',
 'cover',
 'drain',
 'mash',
 'salt',
 'spray',
 'stir'}

In [325]:
def extract_methods(directions_verbs):
    methods = retrieve_cooking_methods_set()
    other_methods = retrieve_other_cooking_methods_set()
    methods |= other_methods
    directions_methods = set()
    for verb in directions_verbs:
        if stemmer.stem(verb) in methods:
            directions_methods |= {verb}
    return directions_methods

extract_methods(directions_verbs)

loaded cooking_methods set successfully
loaded other_cooking_methods set successfully


{'boil'}

# Steps

In [326]:
def extract_directions_ingredients(ingredients):
    ingredients_nouns = set()
    for line in ingredients:
        if ':' in line:
            continue
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
        ingredients_nouns |= {ingredient}
        # for better granularity, in case full name is not mentioned
        token_tag_pairs = tokenize(ingredient)
        for pair in token_tag_pairs:    
            if len(pair[0]) > 1:
                if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                    ingredients_nouns |= {pair[0]}
    # start from the longest
    return sorted((list(ingredients_nouns)), key=len)[::-1]

extract_directions_ingredients(ingredients)

['vegan cream cheese (such as Tofutti)',
 'vegetarian ground beef substitute',
 'vegan mayonnaise',
 'soy cheese',
 'substitute',
 'mayonnaise',
 'seasoning',
 'olive oil',
 'potatoes',
 'soy milk',
 'carrots',
 'pepper',
 'tomato',
 'garlic',
 'cheese',
 'celery',
 'vegan',
 'cream',
 'onion',
 'olive',
 'milk',
 'salt',
 'peas',
 'beef',
 'oil',
 'soy']

In [308]:
def extract_ingredients(direction):
    ingredients_set = extract_directions_ingredients(ingredients)
    direction_ingredients = set()
    used = set()
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        for i in ingredients_set:
            if i in sentence and i not in used:
                direction_ingredients |= {i}
                for word in i.split():
                    used |= {word}
    return direction_ingredients

In [309]:
def decompose_steps():
    prep_time, cook_time = extract_time(url)
    average_cook_time_per_step = round(convert_to_minutes(cook_time) / (len(directions) - 1))

    for i, direction in enumerate(directions):
        print('Step:', i+1)
        print('Direction:', direction)
        if i == 0:
            print('\tprep time:', prep_time)
        else:
            print('\taverage cook time: {0} m'.format(average_cook_time_per_step))

        single_direction_tools = extract_tools(extract_directions_nouns(direction))
        single_direction_methods = extract_methods(extract_directions_verbs(direction))
        single_direction_ingredients = extract_ingredients(direction)

        if len(single_direction_tools) > 0:
            print('\ttools:', single_direction_tools)
        if len(single_direction_methods) > 0:
            print('\tmethods:', single_direction_methods)
        if len(single_direction_ingredients) > 0:
            print('\tingredients:', single_direction_ingredients)
        print('---------')

decompose_steps()

Step: 1
Direction: Place the potatoes in a pot, cover with cold water, and bring to a boil over medium-high heat. Turn the heat to medium-low, and boil the potatoes until tender, about 25 minutes; drain.
	prep time: 20 m
loaded tools set successfully
loaded cooking_methods set successfully
loaded other_cooking_methods set successfully
	tools: {'pot'}
	methods: {'boil'}
	ingredients: {'potatoes', 'oil'}
---------
Step: 2
Direction: Stir the vegan mayonnaise, soy milk, olive oil, vegan cream cheese, and salt into the potatoes, and mash with a potato masher until smooth and fluffy. Set the potatoes aside.
	average cook time: 9 m
loaded tools set successfully
loaded cooking_methods set successfully
loaded other_cooking_methods set successfully
	tools: {'masher'}
	ingredients: {'soy milk', 'salt', 'olive oil', 'cheese', 'potatoes', 'vegan mayonnaise'}
---------
Step: 3
Direction: Preheat oven to 400 degrees F (200 degrees C), and spray a 2-quart baking dish with cooking spray.
	average cook