# Ingredients

In [1]:
import re
import pickle

import nltk
# nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem import PorterStemmer
import spacy
import requests
import unidecode
from bs4 import BeautifulSoup

In [2]:
# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [3]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# test
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'
# url = 'https://www.allrecipes.com/recipe/246141/pad-thai-with-tofu/'
# url = 'https://www.allrecipes.com/recipe/221286/traditional-mexican-guacamole'
url = 'https://www.allrecipes.com/recipe/59661/spinach-enchiladas/'

In [4]:
def extract_time(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    times = set([element.text.strip() for element in soup.find_all(class_='prepTime__item')])
    # remove uncessary elements
    times.remove('')
    prep_time = None
    cook_time = None
    for time in times:
        if 'prep' in time.lower():
            prep_time = time[4:]
        if 'cook' in time.lower():
            cook_time = time[4:]
    return prep_time, cook_time

# test
prep_time, cook_time = extract_time(url)
print(prep_time)
print(cook_time)

20 m
20 m


In [5]:
def convert_to_minutes(cook_time):
    if cook_time is None:
        return 0
    if 'h' in cook_time:
        hour_index = cook_time.index('h')
        print(hour_index)
        hours = int(cook_time[:hour_index].strip())
        if 'm' in cook_time:
            minutes = int(cook_time[hour_index+1 : -1].strip())
        else:
            minutes = 0
    else:
        hours = 0
        minutes = int(cook_time[: -1].strip())
    return 60*hours + minutes

# test
convert_to_minutes(cook_time)

20

In [6]:
def get_recipe_name(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find_all("h1", {"class": "recipe-summary__h1"})[0].text

# test
get_recipe_name(url)

'Spinach Enchiladas'

In [7]:
def get_ingredient_list_and_directions(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # extract ingredients section from the webpage
    ingredients = [element.label.text.strip() for element in soup.find_all(class_='checkList__line')]
    
    # remove exceptions like 'topping:'
    for i in ingredients:
        if ':' in i:
            ingredients.remove(i)
    ingredients = set(ingredients)
    
    # remove unnecessary elements
    unnecessary = ['', 'Add all ingredients to list']
    for i in unnecessary:
        if i in ingredients:
            ingredients.remove(i)
            
    # extract directions section from the webpage
    directions = [element.text.strip() for element in soup.find_all(class_='recipe-directions__list--item')]
    # remove unnecessary elements
    directions.remove('')
    return ingredients, directions

# test
ingredients, directions = get_ingredient_list_and_directions(url)
print(directions)
ingredients

['Preheat the oven to 375 degrees F (190 degrees C).', 'Melt butter in a saucepan over medium heat. Add garlic and onion; cook for a few minutes until fragrant, but not brown. Stir in spinach, and cook for about 5 more minutes. Remove from the heat, and mix in ricotta cheese, sour cream, and 1 cup of Monterey Jack cheese.', 'In a skillet over medium heat, warm tortillas one at a time until flexible, about 15 seconds. Spoon about 1/4 cup of the spinach mixture onto the center of each tortilla. Roll up, and place seam side down in a 9x13 inch baking dish. Pour enchilada sauce over the top, and sprinkle with the remaining cup of Monterey Jack.', 'Bake for 15 to 20 minutes in the preheated oven, until sauce is bubbling and cheese is lightly browned at the edges.']


{'1 (10 ounce) package frozen chopped spinach , thawed, drained and squeezed dry',
 '1 (19 ounce) can enchilada sauce',
 '1 cup ricotta cheese',
 '1 tablespoon butter',
 '1/2 cup sliced green onions',
 '1/2 cup sour cream',
 '10 (6 inch) corn tortillas',
 '2 cloves garlic, minced',
 '2 cups shredded Monterey Jack cheese'}

In [8]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

def nouns_only(line):
    adjective_type_exceptions = ['ground', 'skinless', 'boneless']
    noun_type_exceptions = ['parsley', 'garlic', 'chili', 'chile', 'substitute', 'cream', 'flanken', 'cilantro', 'such']
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if (not (pair[1] == "NN" or pair[1] == "NNS") or pair[0] in adjective_type_exceptions) and pair[0] not in noun_type_exceptions:
            return False
    return True

In [9]:
def extract_brackets(line):
    # find '(abc)' where 'abc' is in arbitrary length and 'abc' does not contain brackets
    pattern = re.compile(r'\([^\(\)]*\)') 
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match
    
def extract_preparation(line):
    # find ', abc' or ' - abc' where 'abc' is in arbitrary length
#     match = re.findall(re.compile(r'\b[,-] [^\(\)]*'), line)
    match = re.findall(re.compile(r'[^.], .*| - .*'), line)
    if len(match) != 0:
        if match[-1][-1] == ')':
            return match[-1][1:-1]
        else:
            return match[-1][1:]
    
def extract_descriptor(ingredient_name):
    noun_type_exceptions = ['parsley', 'garlic', 'chili', 'chile', 'substitute', 'cream', 'flanken', 'such']
    adjective_type_exceptions = ['ground', 'skinless', 'boneless']
    descriptor = []
    token_tag_pairs = []
    
    for element in ingredient_name.split():
        # treat compound word with hyphen as an adjective
        if '-' in element:
            token_tag_pairs.append((element, 'JJ'))
        else:
            token_tag_pairs.append([(token.text, token.tag_) for token in nlp(element)][0])
            
    for pair in token_tag_pairs:
        # if the word is an adjective, an adverb, or a past participle of a verb, or exception like 'ground'
        if pair[1] == "JJ" or pair[1] == "RB" or pair[1] == "VBN" or pair[0] in adjective_type_exceptions:
            if pair[0] not in noun_type_exceptions:
                descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)
        
def extract_all(line):
    noun_type_exceptions = ['can', 'tablespoon', 'oz', 'clove']
    not_measurements = ['jalapeno', 'roma']
    measurement = None
    quantity_in_brackets = None
    quantity_split = []
    pre_preparation = []
    
    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
        line = line.replace(preparation, '')
        # remove 'x, ' prefix
        preparation = preparation[2:].strip()
    
    # extract backets
    brackets = extract_brackets(line)
    if brackets:
        # check the first bracket
        # if no numerical value or line_split length > 3 
        if not any(char.isdigit() for char in brackets[0]) or len(brackets[0].split()) > 3:
            pre_preparation.append(brackets[0][1:-1])
        else:
            quantity_in_brackets = brackets[0]
        # check the rest brackets if any
        if len(brackets) > 1:
            for b in brackets[1:]:
                pre_preparation.append(b[1:-1])
        for b in brackets:
            line = re.sub(r'\({0}\)'.format(b), '', line)  
        
    line_split = line.split()
    # extract quantity from the first word if the word contains a digit
    if any(char.isdigit() for char in line_split[0]):
        quantity_split.append(line_split[0])
    
        # extract quantity from the second word if the word contains a digit
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            # measurement index
            i = 2
            # check for special case
            if line_split[2] == 'oz':
                quantity_split.append('oz')
                i = 3
            # check measurement type
            if (nouns_only(line_split[i]) or line_split[i] in noun_type_exceptions) and line_split[i] not in not_measurements:
                measurement = line_split[i]
        else:
            # check line_split length and measurement type for cases like '1 egg' or '1/2 onion, chopped' or '1 large tomato, seeded and chopped'
            if len(line_split) > 2 and (nouns_only(line_split[1]) or line_split[1] in noun_type_exceptions) and line_split[1] not in not_measurements:
                measurement = line_split[1]
        line = re.sub(r'{0}'.format(' '.join(quantity_split)), '', line)
    
    if measurement:
        line = re.sub(r'{0}'.format(measurement), '', line)
    
    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)
    
    ingredient_name = line.strip()

    # extract descriptor from ingredient_name
    descriptor = extract_descriptor(ingredient_name)

    # extract ingredient
    ingredient = ingredient_name
    if descriptor:
        for i in descriptor.split():
            ingredient = re.sub(r'[ ]?\b{0}\b'.format(i), '', ingredient).strip()
    if ingredient == '':
        ingredient = ingredient_name

    # add prepreparation to descriptor or preparation
    if pre_preparation:
        if descriptor is None:
            descriptor = ', '.join(pre_preparation)
        else:
            descriptor += ', ' + ', '.join(pre_preparation)
    
    # add 'to taste' to quantity if any
    if 'to taste' in ingredient:
        quantity_split.append('to taste')
    quantity = ' '.join(quantity_split)
    if quantity == '':
        quantity = None
    
    # remove ' to taste' in ingredient if any
    ingredient = re.sub(r'(or)? to taste', '', ingredient)
    ingredient = ' '.join(ingredient.split())
    
    # if the extracted ingredient is not noun
    if not nouns_only(ingredient):
        ingredient_name = ingredient
        if preparation:
            ingredient_name += ' ' + preparation
        ingredient_name = ingredient_name.replace(' -', ',')
        preparation = extract_preparation(ingredient_name)
        ingredient_name = re.sub(r'{0}'.format(preparation), '', ingredient_name)
        if preparation:
            preparation = preparation[2:].strip()
        descriptor = extract_descriptor(ingredient_name)
        ingredient = ingredient_name
        if descriptor:
            for i in descriptor.split():
                ingredient = re.sub(r'[ ]?\b{0}\b'.format(i), '', ingredient).strip()
            if ingredient == '':
                ingredient = ingredient_name
    
    return quantity, measurement, descriptor, ingredient, preparation

In [10]:
def decompose_ingredients(ingredients):
    for line in ingredients:
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
        print(line)
        print('\t quantity   :', quantity)
        print('\t measurement:', measurement)
        print('\t descriptor :', descriptor)
        print('\t ingredient :', ingredient)
        print('\t preparation:', preparation)
        print()

# test
decompose_ingredients(ingredients)

1 cup ricotta cheese
	 quantity   : 1
	 measurement: cup
	 descriptor : ricotta
	 ingredient : cheese
	 preparation: None

1 tablespoon butter
	 quantity   : 1
	 measurement: tablespoon
	 descriptor : None
	 ingredient : butter
	 preparation: None

2 cloves garlic, minced
	 quantity   : 2
	 measurement: cloves
	 descriptor : None
	 ingredient : garlic
	 preparation: minced

2 cups shredded Monterey Jack cheese
	 quantity   : 2
	 measurement: cups
	 descriptor : None
	 ingredient : Monterey Jack cheese
	 preparation: None

1 (10 ounce) package frozen chopped spinach , thawed, drained and squeezed dry
	 quantity   : 1 (10 ounce)
	 measurement: package
	 descriptor : frozen chopped
	 ingredient : spinach
	 preparation: thawed, drained and squeezed dry

10 (6 inch) corn tortillas
	 quantity   : 10 (6 inch)
	 measurement: corn
	 descriptor : None
	 ingredient : tortillas
	 preparation: None

1/2 cup sour cream
	 quantity   : 1/2
	 measurement: cup
	 descriptor : sour
	 ingredient : cream
	 

# Tools

In [11]:
def extract_directions_nouns(directions):
    directions_nouns = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
            # check for special cases where spaCy cannot recognize well
            if ' oven' in sentence:
                directions_nouns |= {'oven'}
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                # avoid case like 'degrees C'
                if len(pair[0]) > 1:
                    if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                        directions_nouns |= {pair[0]}
#         print('---------')
    return directions_nouns
    
directions_nouns = extract_directions_nouns(directions)
directions_nouns

{'Bake',
 'baking',
 'butter',
 'center',
 'cheese',
 'cream',
 'cup',
 'degrees',
 'dish',
 'edges',
 'enchilada',
 'garlic',
 'heat',
 'inch',
 'minutes',
 'mixture',
 'onion',
 'oven',
 'sauce',
 'saucepan',
 'seam',
 'seconds',
 'side',
 'skillet',
 'spinach',
 'time',
 'top',
 'tortilla',
 'tortillas'}

In [12]:
stemmer = PorterStemmer()

# test
stemmer.stem('Sponges')

'spong'

In [13]:
def retrieve_tools_set():
    try:
        with open('data/tools.pickle', 'rb') as file:
            tools = pickle.load(file)
#             print('loaded tools set successfully')
    except:
        url = 'https://www.mealime.com/kitchen-essentials-list'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tools = [element.text for element in soup.find_all(class_='anchor-button')]
        # reduce each tool to its last word
        print(tools)
        tools = set([stemmer.stem(tool.split()[-1].strip()) for tool in tools])
        
        # save retrieved data
        with open('data/tools.pickle', 'wb') as file:
            pickle.dump(tools, file, protocol=pickle.HIGHEST_PROTOCOL)
    
    return tools

retrieve_tools_set()

{'bag',
 'bin',
 'blender',
 'board',
 'bowl',
 'coland',
 'contain',
 'cup',
 'dish',
 'foil',
 'grater',
 'guard',
 'juicer',
 'knife',
 'ladl',
 'masher',
 'mitt',
 'open',
 'pan',
 'paper',
 'peeler',
 'pot',
 'press',
 'rack',
 'saucepan',
 'scale',
 'sharpen',
 'shear',
 'skillet',
 'spatula',
 'spinner',
 'spong',
 'spoon',
 'steel',
 'stockpot',
 'thermomet',
 'tong',
 'towel',
 'tray',
 'trivet',
 'whisk'}

In [14]:
def extract_tools(directions_nouns):
    tools = retrieve_tools_set()
    directions_tools = set()
    for noun in directions_nouns:
        if stemmer.stem(noun) in tools:
            directions_tools |= {noun}
    return directions_tools

extract_tools(directions_nouns)

{'cup', 'dish', 'saucepan', 'skillet'}

# Methods

In [15]:
# test
accented_string = 'sauté'
unidecode.unidecode(accented_string)

'saute'

In [16]:
def retrieve_cooking_methods_set():
    try:
        with open('data/cooking_methods.pickle', 'rb') as file:
            cooking_methods = pickle.load(file)
#             print('loaded cooking_methods set successfully')
    except:
        url = 'https://www.thedailymeal.com/cook/15-basic-cooking-methods-you-need-know-slideshow/slide-13'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        cooking_methods = [element.h2.text for element in soup.find_all(class_='image-title slide-title')]
        cooking_methods = set([stemmer.stem(unidecode.unidecode(method.strip())) for method in cooking_methods])
        
        # save retrieved data
        with open('data/cooking_methods.pickle', 'wb') as file:
            pickle.dump(cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return cooking_methods

# test
methods = retrieve_cooking_methods_set()
methods

{'bake',
 'blanch',
 'boil',
 'brais',
 'broil',
 'deep-fri',
 'grill',
 'pan-fri',
 'poach',
 'roast',
 'saut',
 'sear',
 'simmer',
 'steam',
 'stew'}

In [17]:
def retrieve_other_cooking_methods_set():
    try:
        with open('data/other_cooking_methods.pickle', 'rb') as file:
            other_cooking_methods = pickle.load(file)
#             print('loaded other_cooking_methods set successfully')
    except:
        url = 'https://en.wikibooks.org/wiki/Cookbook:Cooking_Techniques'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        unwanted = ['Contents', '[', 'edit', ']', '\n']
        other_cooking_methods = set()
        dump = soup.find_all(class_='mw-parser-output')
        for i in dump:
            for j in i.contents:
                if hasattr(j, 'contents'):
                    for k in j.contents:
                        if hasattr(k, 'contents'):
                            for l in k.contents:
                                if hasattr(l, 'contents'):
                                    for method in l:
    #                                     print(method.string)
                                        if method.string is not None and method not in unwanted:
                                            other_cooking_methods |= {stemmer.stem(method.string.split()[-1])}

        # remove uncessary methods after complexity reduction
        other_cooking_methods.remove('cook')
        other_cooking_methods.remove('chocol')
        
        # save retrieved data
        with open('data/other_cooking_methods.pickle', 'wb') as file:
            pickle.dump(other_cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return other_cooking_methods

# test
other_methods = retrieve_other_cooking_methods_set()
other_methods

{'bain-mari',
 'bake',
 'barbecu',
 'bast',
 'blanch',
 'boil',
 'bone',
 'brine',
 'can',
 'caramel',
 'chiffonad',
 'chop',
 'cockaign',
 'cream',
 'cube',
 'deglaz',
 'degorg',
 'dredg',
 'dri',
 'ferment',
 'fri',
 'grill',
 'julien',
 'marin',
 'minc',
 'pan-fri',
 'pickl',
 'poach',
 'roast',
 'rub',
 'sauté',
 'scald',
 'shir',
 'simmer',
 'skill',
 'slice',
 'smoke',
 'sous-vid',
 'steam',
 'stew',
 'stir-fri',
 'storag',
 'temper',
 'test'}

In [18]:
def extract_directions_verbs(directions):
    directions_verbs = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                if pair[1] == 'VB':
                    directions_verbs |= {pair[0].lower()}
#         print('---------')
    return directions_verbs

directions_verbs = extract_directions_verbs(directions)
directions_verbs

{'add',
 'cook',
 'mix',
 'place',
 'pour',
 'preheat',
 'remove',
 'roll',
 'spoon',
 'sprinkle',
 'stir'}

In [19]:
def extract_methods(directions_verbs):
    methods = retrieve_cooking_methods_set()
    methods |= retrieve_other_cooking_methods_set()
    directions_methods = set()
    for verb in directions_verbs:
        if stemmer.stem(verb) in methods:
            directions_methods |= {verb}
    return directions_methods

extract_methods(directions_verbs)

set()

# Steps

In [20]:
def extract_directions_ingredients(ingredients):
    ingredients_nouns = set()
    for line in ingredients:
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
        ingredients_nouns |= {ingredient}
        # for better granularity, in case full name is not mentioned
        token_tag_pairs = tokenize(ingredient)
        for pair in token_tag_pairs:    
            if len(pair[0]) > 1:
                if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                    ingredients_nouns |= {pair[0]}
    # start from the longest
    return sorted((list(ingredients_nouns)), key=len)[::-1]

extract_directions_ingredients(ingredients)

['Monterey Jack cheese',
 'tortillas',
 'spinach',
 'garlic',
 'butter',
 'onions',
 'cheese',
 'cream',
 'sauce']

In [21]:
def extract_ingredients(direction):
    ingredients_set = extract_directions_ingredients(ingredients)
    direction_ingredients = set()
    used = set()
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        for i in ingredients_set:
            if i in sentence and i not in used:
                direction_ingredients |= {i}
                # store used partial word in used
                for word in i.split():
                    used |= {word}
    return direction_ingredients

In [22]:
def extract_direction_time(direction):
    times = []
    for sentence in sent_tokenize(direction):
        match = re.findall(re.compile(r'for .* minute[s]?\b|\d+ minute[s]?\b'), sentence)
        if len(match) != 0:
            for m in match:
                times.append(m.replace('for ', ''))
    if len(times) == 0:
        return None
    return ' + '.join(times)

# test
d = 'Add garlic to the onions and cook an additional 1 minute. Add chicken soup base, water, and potatoes, simmer 15 minutes.'
# d = 'Melt butter in a saucepan over medium heat. Add garlic and onion; cook for a few minutes until fragrant, but not brown. Stir in spinach, and cook for about 5 more minutes. Remove from the heat, and mix in ricotta cheese, sour cream, and 1 cup of Monterey Jack cheese.'
extract_direction_time(d)

'1 minute + 15 minutes'

In [23]:
def decompose_steps():
    prep_time, cook_time = extract_time(url)
    if len(directions) > 1:
            average_cook_time_per_step = round(convert_to_minutes(cook_time) / (len(directions) - 1))

    for i, direction in enumerate(directions):
        print('Step:', i+1)
        print('Direction:', direction)
        if i == 0:
            print('\tprep time:', prep_time)
        else:
            if extract_direction_time(direction):
                print('\testimated cook time: ' + extract_direction_time(direction))
            else:
                print('\testimated cook time: {0} minutes'.format(average_cook_time_per_step))

        single_direction_tools = extract_tools(extract_directions_nouns(direction))
        single_direction_methods = extract_methods(extract_directions_verbs(direction))
        single_direction_ingredients = extract_ingredients(direction)

        if len(single_direction_tools) > 0:
            print('\ttools:', ', '.join(single_direction_tools))
        if len(single_direction_methods) > 0:
            print('\tmethods:', ', '.join(single_direction_methods))
        if len(single_direction_ingredients) > 0:
            print('\tingredients:', ' '.join(single_direction_ingredients))
        print('---------')

decompose_steps()

Step: 1
Direction: Preheat the oven to 375 degrees F (190 degrees C).
	prep time: 20 m
---------
Step: 2
Direction: Melt butter in a saucepan over medium heat. Add garlic and onion; cook for a few minutes until fragrant, but not brown. Stir in spinach, and cook for about 5 more minutes. Remove from the heat, and mix in ricotta cheese, sour cream, and 1 cup of Monterey Jack cheese.
	estimated cook time: a few minutes + about 5 more minutes
	tools: saucepan, cup
	ingredients: Monterey Jack cheese sauce cream butter spinach garlic
---------
Step: 3
Direction: In a skillet over medium heat, warm tortillas one at a time until flexible, about 15 seconds. Spoon about 1/4 cup of the spinach mixture onto the center of each tortilla. Roll up, and place seam side down in a 9x13 inch baking dish. Pour enchilada sauce over the top, and sprinkle with the remaining cup of Monterey Jack.
	estimated cook time: 7 minutes
	tools: skillet, cup, dish
	ingredients: sauce spinach tortillas
---------
Step: 4
