# Ingredients

In [1]:
import re
import pickle

import nltk
# nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem import PorterStemmer
import spacy
import requests
import unidecode
from bs4 import BeautifulSoup

In [2]:
# python3 -m spacy download en
nlp = spacy.load('en')

def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

In [3]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# test
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'
# url = 'https://www.allrecipes.com/recipe/246141/pad-thai-with-tofu/'
# url = 'https://www.allrecipes.com/recipe/221286/traditional-mexican-guacamole'
# url = 'https://www.allrecipes.com/recipe/245863/chicken-stuffed-baked-avocados'
url = 'https://www.allrecipes.com/recipe/228240/bibimbap-korean-rice-with-mixed-vegetables/'

In [4]:
def extract_time(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    times = set([element.text.strip() for element in soup.find_all(class_='prepTime__item')])
    # remove uncessary elements
    times.remove('')
    prep_time = None
    cook_time = None
    for time in times:
        if 'prep' in time.lower():
            prep_time = time[4:]
        if 'cook' in time.lower():
            cook_time = time[4:]
    return prep_time, cook_time

# test
prep_time, cook_time = extract_time(url)
print(prep_time)
print(cook_time)

30 m
30 m


In [5]:
def convert_to_minutes(cook_time):
    if cook_time is None:
        return 0
    if 'h' in cook_time:
        hour_index = cook_time.index('h')
        print(hour_index)
        hours = int(cook_time[:hour_index].strip())
        if 'm' in cook_time:
            minutes = int(cook_time[hour_index+1 : -1].strip())
        else:
            minutes = 0
    else:
        hours = 0
        minutes = int(cook_time[: -1].strip())
    return 60*hours + minutes

# test
convert_to_minutes(cook_time)

30

In [6]:
def get_recipe_name(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find_all("h1", {"class": "recipe-summary__h1"})[0].text

# test
get_recipe_name(url)

'Bibimbap (Korean Rice With Mixed Vegetables)'

In [7]:
def get_ingredient_list_and_directions(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # extract ingredients section from the webpage
    ingredients = [element.label.text.strip() for element in soup.find_all(class_='checkList__line')]
    
    # remove exceptions like 'topping:'
    for i in ingredients:
        if ':' in i:
            ingredients.remove(i)
    ingredients = set(ingredients)
    
    # remove unnecessary elements
    unnecessary = ['', 'Add all ingredients to list']
    for i in unnecessary:
        if i in ingredients:
            ingredients.remove(i)
            
    # extract directions section from the webpage
    directions = [element.text.strip() for element in soup.find_all(class_='recipe-directions__list--item')]
    # remove unnecessary elements
    directions.remove('')
    return ingredients, directions

# test
ingredients, directions = get_ingredient_list_and_directions(url)
print(directions)
ingredients

['Stir cucumber pieces with 1/4 cup gochujang paste in a bowl; set aside.', 'Bring about 2 cups of water to a boil in a large nonstick skillet and stir in spinach; cook until spinach is bright green and wilted, 2 to 3 minutes. Drain spinach and squeeze out as much moisture as possible; set spinach aside in a bowl and stir soy sauce into spinach.', 'Heat 1 teaspoon olive oil in large nonstick skillet and cook and stir carrots until softened, about 3 minutes; stir in garlic and cook just until fragrant, about 1 more minute. Stir in cucumber pieces with gochujang paste; sprinkle with red pepper flakes, and set the mixture aside in a bowl.', 'Brown beef in a clean nonstick skillet over medium heat, about 5 minutes per side, and set aside. In a separate nonstick skillet, heat 1 more teaspoon olive oil over medium-low heat and fry the eggs just on one side until yolks are runny but whites are firm, 2 to 4 minutes each.', 'To assemble the dish, divide cooked rice into 4 large serving bowls; t

{'1 English cucumber, cut into matchsticks',
 '1 bunch fresh spinach, cut into thin strips',
 '1 clove garlic, minced',
 '1 pinch red pepper flakes',
 '1 pound thinly-sliced beef top round steak',
 '1 tablespoon soy sauce',
 '1 teaspoon olive oil',
 '1 teaspoon sesame seeds',
 '1/4 cup gochujang (Korean hot pepper paste)',
 '2 carrots, cut into matchsticks',
 '2 teaspoons gochujang (Korean hot pepper paste), divided (optional)',
 '4 cups cooked white rice',
 '4 large eggs',
 '4 teaspoons toasted sesame oil, divided'}

In [8]:
def numerical(line):
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not numerical
        if not pair[1] == "CD":
            return False
    return True

def nouns_only(line):
    noun_types = ['NN', 'NNS', 'NNP', 'NNPS']
    adjective_type_exceptions = ['ground', 'skinless', 'boneless']
    noun_type_exceptions = ['parsley', 'garlic', 'chili', 'chile', 'substitute', 'cream', 'flanken', 'cilantro', 'such']
    # replace everything to '' except whitespace, alphanumeric character
    line = re.sub(r'[^\w\s]', '', line)
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        # if the word is not a noun or cardinal number
        if (not (pair[1] in noun_types) or pair[0] in adjective_type_exceptions) and pair[0] not in noun_type_exceptions:
            return False
    return True

In [25]:
    return quantity, measurement, descriptor, ingredient, preparation

In [26]:
def decompose_ingredients(ingredients):
    for line in ingredients:
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
        print(line)
        print('\t quantity   :', quantity)
        print('\t measurement:', measurement)
        print('\t descriptor :', descriptor)
        print('\t ingredient :', ingredient)
        print('\t preparation:', preparation)
        print()

# test
decompose_ingredients(ingredients)

1 clove garlic, minced
	 quantity   : 1
	 measurement: clove
	 descriptor : None
	 ingredient : garlic
	 preparation: minced

2 teaspoons gochujang (Korean hot pepper paste), divided (optional)
	 quantity   : 2
	 measurement: teaspoons
	 descriptor : gochujang, Korean hot pepper paste
	 ingredient : )
	 preparation: divided (optional

2 carrots, cut into matchsticks
	 quantity   : 2
	 measurement: None
	 descriptor : None
	 ingredient : carrots
	 preparation: cut into matchsticks

1 tablespoon soy sauce
	 quantity   : 1
	 measurement: tablespoon
	 descriptor : None
	 ingredient : soy sauce
	 preparation: None

1 bunch fresh spinach, cut into thin strips
	 quantity   : 1
	 measurement: bunch
	 descriptor : fresh
	 ingredient : spinach
	 preparation: cut into thin strips

1 English cucumber, cut into matchsticks
	 quantity   : 1
	 measurement: None
	 descriptor : English
	 ingredient : cucumber
	 preparation: cut into matchsticks

1 teaspoon sesame seeds
	 quantity   : 1
	 measurement: t

# Tools

In [11]:
def extract_directions_nouns(directions):
    directions_nouns = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
            # check for special cases where spaCy cannot recognize well
            if ' oven' in sentence:
                directions_nouns |= {'oven'}
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                # avoid case like 'degrees C'
                if len(pair[0]) > 1:
                    if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                        directions_nouns |= {pair[0]}
#         print('---------')
    return directions_nouns
    
directions_nouns = extract_directions_nouns(directions)
directions_nouns

{'Heat',
 'Place',
 'amount',
 'beef',
 'boil',
 'bowl',
 'bowls',
 'carrots',
 'cucumber',
 'cup',
 'cups',
 'dish',
 'egg',
 'eggs',
 'flakes',
 'garlic',
 'gochujang',
 'heat',
 'minute',
 'minutes',
 'mixture',
 'moisture',
 'nonstick',
 'oil',
 'olive',
 'paste',
 'pepper',
 'pieces',
 'rice',
 'sauce',
 'seeds',
 'sesame',
 'side',
 'skillet',
 'soy',
 'spinach',
 'sprinkle',
 'teaspoon',
 'water',
 'whites',
 'yolks'}

In [12]:
stemmer = PorterStemmer()

# test
stemmer.stem('Sponges')

'spong'

In [13]:
def retrieve_tools_set():
    try:
        with open('data/tools.pickle', 'rb') as file:
            tools = pickle.load(file)
#             print('loaded tools set successfully')
    except:
        url = 'https://www.mealime.com/kitchen-essentials-list'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tools = [element.text for element in soup.find_all(class_='anchor-button')]
        # reduce each tool to its last word
        print(tools)
        tools = set([stemmer.stem(tool.split()[-1].strip()) for tool in tools])
        
        # save retrieved data
        with open('data/tools.pickle', 'wb') as file:
            pickle.dump(tools, file, protocol=pickle.HIGHEST_PROTOCOL)
    
    return tools

retrieve_tools_set()

{'bag',
 'bin',
 'blender',
 'board',
 'bowl',
 'coland',
 'contain',
 'cup',
 'dish',
 'foil',
 'grater',
 'guard',
 'juicer',
 'knife',
 'ladl',
 'masher',
 'mitt',
 'open',
 'pan',
 'paper',
 'peeler',
 'pot',
 'press',
 'rack',
 'saucepan',
 'scale',
 'sharpen',
 'shear',
 'skillet',
 'spatula',
 'spinner',
 'spong',
 'spoon',
 'steel',
 'stockpot',
 'thermomet',
 'tong',
 'towel',
 'tray',
 'trivet',
 'whisk'}

In [14]:
def extract_tools(directions_nouns):
    tools = retrieve_tools_set()
    directions_tools = set()
    for noun in directions_nouns:
        if stemmer.stem(noun) in tools:
            directions_tools |= {noun}
    return directions_tools

extract_tools(directions_nouns)

{'bowl', 'bowls', 'cup', 'cups', 'dish', 'skillet'}

# Methods

In [15]:
# test
accented_string = 'sauté'
unidecode.unidecode(accented_string)

'saute'

In [16]:
def retrieve_cooking_methods_set():
    try:
        with open('data/cooking_methods.pickle', 'rb') as file:
            cooking_methods = pickle.load(file)
#             print('loaded cooking_methods set successfully')
    except:
        url = 'https://www.thedailymeal.com/cook/15-basic-cooking-methods-you-need-know-slideshow/slide-13'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        cooking_methods = [element.h2.text for element in soup.find_all(class_='image-title slide-title')]
        cooking_methods = set([stemmer.stem(unidecode.unidecode(method.strip())) for method in cooking_methods])
        
        # save retrieved data
        with open('data/cooking_methods.pickle', 'wb') as file:
            pickle.dump(cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return cooking_methods

# test
methods = retrieve_cooking_methods_set()
methods

{'bake',
 'blanch',
 'boil',
 'brais',
 'broil',
 'deep-fri',
 'grill',
 'pan-fri',
 'poach',
 'roast',
 'saut',
 'sear',
 'simmer',
 'steam',
 'stew'}

In [17]:
def retrieve_other_cooking_methods_set():
    try:
        with open('data/other_cooking_methods.pickle', 'rb') as file:
            other_cooking_methods = pickle.load(file)
#             print('loaded other_cooking_methods set successfully')
    except:
        url = 'https://en.wikibooks.org/wiki/Cookbook:Cooking_Techniques'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        unwanted = ['Contents', '[', 'edit', ']', '\n']
        other_cooking_methods = set()
        dump = soup.find_all(class_='mw-parser-output')
        for i in dump:
            for j in i.contents:
                if hasattr(j, 'contents'):
                    for k in j.contents:
                        if hasattr(k, 'contents'):
                            for l in k.contents:
                                if hasattr(l, 'contents'):
                                    for method in l:
    #                                     print(method.string)
                                        if method.string is not None and method not in unwanted:
                                            other_cooking_methods |= {stemmer.stem(method.string.split()[-1])}

        # remove uncessary methods after complexity reduction
        other_cooking_methods.remove('cook')
        other_cooking_methods.remove('chocol')
        
        # save retrieved data
        with open('data/other_cooking_methods.pickle', 'wb') as file:
            pickle.dump(other_cooking_methods, file, protocol=pickle.HIGHEST_PROTOCOL)
            
    return other_cooking_methods

# test
other_methods = retrieve_other_cooking_methods_set()
other_methods

{'bain-mari',
 'bake',
 'barbecu',
 'bast',
 'blanch',
 'boil',
 'bone',
 'brine',
 'can',
 'caramel',
 'chiffonad',
 'chop',
 'cockaign',
 'cream',
 'cube',
 'deglaz',
 'degorg',
 'dredg',
 'dri',
 'ferment',
 'fri',
 'grill',
 'julien',
 'marin',
 'minc',
 'pan-fri',
 'pickl',
 'poach',
 'roast',
 'rub',
 'sauté',
 'scald',
 'shir',
 'simmer',
 'skill',
 'slice',
 'smoke',
 'sous-vid',
 'steam',
 'stew',
 'stir-fri',
 'storag',
 'temper',
 'test'}

In [18]:
def extract_directions_verbs(directions):
    directions_verbs = set()
    if isinstance(directions, str):
        directions = [directions]
    for direction in directions:
        sentences = sent_tokenize(direction)
        for sentence in sentences:
#             print(sentence)
            token_tag_pairs = tokenize(sentence)
            for pair in token_tag_pairs:    
                if pair[1] == 'VB':
                    directions_verbs |= {pair[0].lower()}
#         print('---------')
    return directions_verbs

directions_verbs = extract_directions_verbs(directions)
directions_verbs

{'assemble',
 'bring',
 'cook',
 'divide',
 'drain',
 'drizzle',
 'fry',
 'set',
 'sprinkle',
 'squeeze',
 'stir',
 'top'}

In [19]:
def extract_methods(directions_verbs):
    methods = retrieve_cooking_methods_set()
    methods |= retrieve_other_cooking_methods_set()
    directions_methods = set()
    for verb in directions_verbs:
        if stemmer.stem(verb) in methods:
            directions_methods |= {verb}
    return directions_methods

extract_methods(directions_verbs)

{'fry'}

# Steps

In [20]:
def extract_directions_ingredients(ingredients):
    ingredients_nouns = set()
    for line in ingredients:
        quantity, measurement, descriptor, ingredient, preparation = extract_all(line)
        ingredients_nouns |= {ingredient}
        # for better granularity, in case full name is not mentioned
        token_tag_pairs = tokenize(ingredient)
        for pair in token_tag_pairs:    
            if len(pair[0]) > 1:
                if (pair[1] == 'NN' or pair[1] == 'NNS') and pair[0] != 'ground':
                    ingredients_nouns |= {pair[0]}
    # start from the longest
    return sorted((list(ingredients_nouns)), key=len)[::-1]

extract_directions_ingredients(ingredients)

toasted sesame oil
gochujang


['toasted sesame oil',
 'pepper flakes',
 'sesame seeds',
 'beef steak',
 'gochujang',
 'soy sauce',
 'olive oil',
 'cucumber',
 'carrots',
 'spinach',
 'pepper',
 'sesame',
 'flakes',
 'garlic',
 'olive',
 'steak',
 'seeds',
 'sauce',
 'beef',
 'eggs',
 'rice',
 'oil',
 'soy',
 ')']

In [21]:
def extract_ingredients(direction):
    ingredients_set = extract_directions_ingredients(ingredients)
    direction_ingredients = set()
    used = set()
    sentences = sent_tokenize(direction)
    for sentence in sentences:
        for i in ingredients_set:
            if i in sentence and i not in used:
                direction_ingredients |= {i}
                # store used partial word in used
                for word in i.split():
                    used |= {word}
    return direction_ingredients

In [22]:
def extract_maximum_minutes(time):
    match = re.findall(re.compile(r'[1-9]*[0-9]'), time)
    if 'minute' in time:
        return int(match[-1])
    if 'second' in time:
        return round(int(match[-1])/60, 2)
    return 0
     
# test
print(extract_maximum_minutes('21 minutes'))
print(extract_maximum_minutes('2 or 4 seconds'))

21
0.07


In [35]:
def extract_direction_time(direction):
    total_time = 0
    times = []
    for sentence in sent_tokenize(direction):
        match = re.findall(re.compile(r'\d+[^,.;]*second[s]?\b|\d+[^,.;]*minute[s]?\b'), sentence)
        if len(match) != 0:
            for m in match:
                times.append(m)
    
    if len(times) == 0:
        return None
    
    if len(times) == 1:
        if 'minute' in times[-1]:
            return str(extract_maximum_minutes(times[-1])) + ' minutes'
        return times[-1].replace('more ', '')
    
    if len(times) > 1:
        for time in times:
            total_time += extract_maximum_minutes(time)
        return str(total_time) + ' minutes'
    
# test
# d = 'Add garlic to the onions and cook an additional 1 minute. Add chicken soup base, water, and potatoes, \
#     simmer 15 minutes.'
d = 'Melt butter in a saucepan over medium heat. Add garlic and onion; cook for a few minutes until fragrant, \
    but not brown. Stir in spinach, and cook for about 5 more minutes. Remove from the heat, and mix in ricotta cheese, \
    sour cream, and 1 cup of Monterey Jack cheese.'
d = 'Bring about 2 cups of water to a boil in a large nonstick skillet and stir in spinach; cook until spinach \
is bright green and wilted, 2 to 3 minutes. Drain spinach and squeeze out as much moisture as possible;\
set spinach aside in a bowl and stir soy sauce into spinach.'

extract_direction_time(d)

'3 minutes'

In [36]:
def decompose_steps():
    prep_time, cook_time = extract_time(url)
    if len(directions) > 1:
            average_cook_time_per_step = round(convert_to_minutes(cook_time) / (len(directions) - 1))

    for i, direction in enumerate(directions):
        print('Step:', i+1)
        print('Direction:', direction)
        if i == 0:
            print('\tprep time:', prep_time)
        else:
            if extract_direction_time(direction):
                print('\testimated cook time: about {0}'.format(extract_direction_time(direction)))
            else:
                print('\testimated cook time: {0} minutes'.format(average_cook_time_per_step))

        single_direction_tools = extract_tools(extract_directions_nouns(direction))
        single_direction_methods = extract_methods(extract_directions_verbs(direction))
        single_direction_ingredients = extract_ingredients(direction)

        if len(single_direction_tools) > 0:
            print('\ttools:', ', '.join(single_direction_tools))
        if len(single_direction_methods) > 0:
            print('\tmethods:', ', '.join(single_direction_methods))
        if len(single_direction_ingredients) > 0:
            print('\tingredients:', ' '.join(single_direction_ingredients))
        print('---------')

decompose_steps()

Step: 1
Direction: Stir cucumber pieces with 1/4 cup gochujang paste in a bowl; set aside.
	prep time: 30 m
	tools: bowl, cup
	ingredients: gochujang cucumber
---------
Step: 2
Direction: Bring about 2 cups of water to a boil in a large nonstick skillet and stir in spinach; cook until spinach is bright green and wilted, 2 to 3 minutes. Drain spinach and squeeze out as much moisture as possible; set spinach aside in a bowl and stir soy sauce into spinach.
	estimated cook time: about 3 minutes
	tools: bowl, cups, skillet
	ingredients: soy sauce oil spinach
---------
Step: 3
Direction: Heat 1 teaspoon olive oil in large nonstick skillet and cook and stir carrots until softened, about 3 minutes; stir in garlic and cook just until fragrant, about 1 more minute. Stir in cucumber pieces with gochujang paste; sprinkle with red pepper flakes, and set the mixture aside in a bowl.
	estimated cook time: about 4 minutes
	tools: bowl, skillet
	ingredients: gochujang cucumber garlic pepper flakes oli