In [1]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'


In [36]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [63]:
ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])
ingredients

{'',
 '1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles',
 'Add all ingredients to list'}

In [64]:
ingredients.remove('')
ingredients.remove('Add all ingredients to list')

In [65]:
ingredients

{'1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles'}

In [76]:
import re

In [224]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length
    pattern = re.compile(r'\([\w\s]*\)') 
    match = re.findall(pattern, line)
    return match

In [279]:
def extract_quantity(line):
    quantity = []
    quantity_in_brackets = None
    # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub('\(' + quantity_in_brackets[0] + '\)', '', line)
        quantity_in_brackets = quantity_in_brackets[0]
    
    # extract quantity from the first word
    line_split = line.split()
    quantity.append(line_split[0])
    
    # extract quantity from the second word if the word contains a digit
    if any(char.isdigit() for char in line_split[1]):
        quantity.append(line_split[1])
        quantity.append(line_split[2])
    else:
        quantity.append(line_split[1])
    
    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity.append(quantity_in_brackets)

    return ' '.join(quantity)

In [307]:
def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r', [\w\s]*') 
    match = re.findall(pattern, line)
    return match

In [308]:
import spacy
# python3 -m spacy download en
nlp = spacy.load('en')

def identify_entities(text):
    tags = {}
    for ent in nlp(text).ents:
        entity = ent.text.strip()
        if entity not in tags and len(entity) > 1:
            # remove stopword 'the'
            if len(entity.split()) == 1 and entity.lower() == 'the':
                pass
            else:
                tags[entity]=[ent.label_]
    return tags

In [309]:
import nltk
# nltk.download('averaged_perceptron_tagger')

In [310]:
def tokenize(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [311]:
tokenize('2 (10 ounce) packages frozen chopped spinach')

[('2', 'CD'),
 ('(', '('),
 ('10', 'CD'),
 ('ounce', 'NN'),
 (')', ')'),
 ('packages', 'VBZ'),
 ('frozen', 'JJ'),
 ('chopped', 'VBD'),
 ('spinach', 'NN')]

In [312]:
for element in ingredients:
    print(element)
    print('quantity:', extract_quantity(element))
    print(identify_entities(element))
    print('preparation:', extract_preparation(element))
    print()

1 tablespoon extra virgin olive oil
quantity: 1 tablespoon
{}
[]

8 ounces lasagna noodles
quantity: 8 ounces
{'8 ounces': ['QUANTITY']}
[]

2 cloves garlic, crushed
quantity: 2 cloves
{}
[', crushed']

1/2 teaspoon dried oregano
quantity: 1/2 teaspoon
{'1/2': ['CARDINAL']}
[]

1/2 teaspoon dried basil
quantity: 1/2 teaspoon
{'1/2': ['CARDINAL']}
[]

1 teaspoon salt
quantity: 1 teaspoon
{}
[]

1 egg
quantity: 1 egg
{}
[]

1/2 onion, chopped
quantity: 1/2 onion,
{'1/2': ['CARDINAL']}
[', chopped']

1 (32 ounce) jar spaghetti sauce
quantity: 1 jar (32 ounce)
{'32': ['CARDINAL']}
[]

1 1/2 cups water
quantity: 1 1/2 cups
{'1 1/2': ['CARDINAL']}
[]

2 cups non-fat cottage cheese
quantity: 2 cups
{}
[]

1 (8 ounce) package part skim mozzarella cheese, shredded
quantity: 1 package (8 ounce)
{'8 ounce': ['QUANTITY']}
[', shredded']

1/4 cup grated Parmesan cheese
quantity: 1/4 cup
{'1/4': ['CARDINAL'], 'Parmesan': ['ORG']}
[]

1/2 cup chopped fresh parsley
quantity: 1/2 cup
{}
[]

1/8 teaspoo

In [67]:
def extract_ingredient_names(ingredients):
    return [element.split()[-1] for element in ingredients]

extract_ingredient_names(ingredients)

['oil',
 'noodles',
 'crushed',
 'oregano',
 'basil',
 'salt',
 'egg',
 'chopped',
 'sauce',
 'water',
 'cheese',
 'shredded',
 'cheese',
 'parsley',
 'pepper',
 'spinach']