In [1]:
import requests
from bs4 import BeautifulSoup

In [None]:
# recipe page to scrape
url = 'https://www.allrecipes.com/recipe/23988/simple-spinach-lasagna/?internalSource=streams&referringId=87&referringContentType=Recipe%20Hub&clickId=st_trending_s'

# alternative recipe page for testing (uncomment to use it instead)
# url = 'https://www.allrecipes.com/recipe/235874/copycat-panera-broccoli-cheddar-soup/?clickId=right%20rail1&internalSource=rr_feed_recipe_sb&referringId=23988%20referringContentType%3Drecipe'

In [36]:
# Fetch the recipe page. requests has no default timeout, so set one to avoid
# hanging the kernel, and raise on HTTP errors instead of parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [63]:
# collect the unique, whitespace-trimmed label text of every checklist line
ingredients = {
    entry.label.text.strip()
    for entry in soup.find_all(class_='checkList__line')
}
ingredients

{'',
 '1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles',
 'Add all ingredients to list'}

In [64]:
# remove unnecessary elements
# discard() is a no-op when the entry is absent, so re-running this cell (or
# scraping a page without these placeholder labels) does not raise KeyError
ingredients.discard('')
ingredients.discard('Add all ingredients to list')

In [65]:
# display the ingredient set after the placeholder entries were removed
ingredients

{'1 (32 ounce) jar spaghetti sauce',
 '1 (8 ounce) package part skim mozzarella cheese, shredded',
 '1 1/2 cups water',
 '1 egg',
 '1 tablespoon extra virgin olive oil',
 '1 teaspoon salt',
 '1/2 cup chopped fresh parsley',
 '1/2 onion, chopped',
 '1/2 teaspoon dried basil',
 '1/2 teaspoon dried oregano',
 '1/4 cup grated Parmesan cheese',
 '1/8 teaspoon black pepper',
 '2 (10 ounce) packages frozen chopped spinach',
 '2 cloves garlic, crushed',
 '2 cups non-fat cottage cheese',
 '8 ounces lasagna noodles'}

In [76]:
import re

In [507]:
def extract_quantity_in_backets(line):
    """Return every parenthesised quantity found in ``line``.

    Parameters
    ----------
    line : str
        One ingredient line, e.g. ``'1 (32 ounce) jar spaghetti sauce'``.

    Returns
    -------
    list of str or None
        The matched substrings including their parentheses, in order of
        appearance, or ``None`` when the line contains no such group.
    """
    # find '(abc)' where 'abc' has arbitrary length; besides word characters
    # and whitespace, allow '.', '/' and '-' so sizes such as '(14.5 ounce)'
    # or '(1/2 inch)' are recognised too
    match = re.findall(r'\([\w\s./-]*\)', line)
    return match or None

In [508]:
def extract_preparation(line):
    """Return the comma-introduced clauses of ``line``.

    Each clause is a ``', '`` followed by any run of word characters and
    whitespace (for example ``', chopped'``). The clauses are returned as a
    list, still carrying their ``', '`` prefix; ``None`` is returned when the
    line has no such clause.
    """
    found = re.findall(r', [\w\s]*', line)
    return found if found else None

In [545]:
def extract_quantity_measurement_preparation(line):
    """Split an ingredient line into quantity, measurement and preparation.

    The first comma clause (via ``extract_preparation``) and the first
    bracketed quantity (via ``extract_quantity_in_backets``) are removed from
    the line; the remaining text is split on whitespace. The first word is
    taken as the quantity, and the second word joins it if it contains a
    digit (e.g. ``'1 1/2'``). The following word, if any, is the measurement.

    Parameters
    ----------
    line : str
        One ingredient line, e.g. ``'2 cloves garlic, crushed'``.

    Returns
    -------
    tuple
        ``(quantity, measurement, preparation)`` where ``quantity`` is a str
        (the bracketed quantity, if any, is appended at the end),
        ``measurement`` is a str or ``None``, and ``preparation`` is the
        first comma clause without its ``', '`` prefix, or ``None``.
    """
    quantity_split = []
    measurement = None

    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
        # remove the matched text literally; it is data, not a regex pattern
        line = line.replace(preparation[0], '')
        # remove ', ' prefix
        preparation = preparation[0][2:]

    # extract quantity in brackets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        quantity_in_brackets = quantity_in_brackets[0]
        # the match already includes its parentheses, so drop it verbatim
        line = line.replace(quantity_in_brackets, '')

    line_split = line.split()

    # extract quantity from the first word (guarded: the line may be empty)
    if line_split:
        quantity_split.append(line_split[0])

    # extract quantity from the second word if the word contains a digit
    if len(line_split) > 1 and any(char.isdigit() for char in line_split[1]):
        quantity_split.append(line_split[1])
        # a measurement only exists if something follows the two-part quantity
        if len(line_split) > 2:
            measurement = line_split[2]
    elif len(line_split) > 2:
        # to adjust for case like '1 egg' or '1/2 onion, chopped'
        measurement = line_split[1]

    # append quantity in brackets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)

    return ' '.join(quantity_split), measurement, preparation

In [547]:
def extract_ingredient_name(line):
    """Return the ingredient name contained in an ingredient line.

    The name is the text that follows the leading quantity and measurement
    (as reported by ``extract_quantity_measurement_preparation``) and that
    precedes the preparation clause, if one was found.

    Parameters
    ----------
    line : str
        One ingredient line, e.g. ``'1/2 onion, chopped'``.

    Returns
    -------
    str
        The stripped ingredient name, e.g. ``'onion'``.
    """
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    if measurement is None:
        measurement = ''

    # the name starts right after the leading "<quantity> <measurement>" text
    start = len(quantity + ' ' + measurement)

    if preparation:
        # Cut where the preparation clause begins rather than counting back
        # from the end of the line: the clause is not guaranteed to be the
        # final suffix (more text may follow it).
        end = line.find(', ' + preparation)
        if end == -1:
            end = len(line)
        return line[start:end].strip()
    return line[start:].strip()

In [548]:
# spot-check on a line with a fractional quantity and a trailing preparation clause
extract_ingredient_name('1/2 onion, chopped')

'onion'

In [558]:
import spacy

# python3 -m spacy download en_core_web_sm
# Load the model by its package name: the 'en' shortcut link was removed in
# spaCy 3, whereas the full name also loads on 2.x once the package is installed.
nlp = spacy.load('en_core_web_sm')

def tokenize(line):
    """Return ``(token text, fine-grained POS tag)`` pairs for ``line`` using spaCy."""
    return [(token.text, token.tag_) for token in nlp(line)]

In [532]:
# inspect the tags spaCy assigns to a sample ingredient line
tokenize('1 tablespoon extra virgin olive oil')

[('1', 'CD'),
 ('tablespoon', 'VBP'),
 ('extra', 'JJ'),
 ('virgin', 'JJ'),
 ('olive', 'NN'),
 ('oil', 'NN')]

In [573]:
# look up the human-readable description of the 'VBP' tag
spacy.explain('VBP')

'verb, non-3rd person singular present'

In [559]:
import nltk
# nltk.download('averaged_perceptron_tagger')

def tokenize_nltk(line):
    """Tokenize ``line`` with ``nltk.word_tokenize`` and return the ``nltk.pos_tag`` result."""
    return nltk.pos_tag(nltk.word_tokenize(line))

In [560]:
# NOTE(review): this calls the spaCy-based tokenize(), not the tokenize_nltk()
# defined in the previous cell - confirm whether the NLTK tagger was meant here
tokenize('1 tablespoon extra virgin olive oil')

[('1', 'CD'),
 ('tablespoon', 'VBP'),
 ('extra', 'JJ'),
 ('virgin', 'JJ'),
 ('olive', 'NN'),
 ('oil', 'NN')]

In [572]:
def extract_descriptor(ingredient_name):
    """Return the descriptor words of an ingredient name.

    The name is tagged with ``tokenize``; every token tagged ``JJ`` or ``VBN``
    is kept, in order, and the kept tokens are joined with single spaces.
    ``None`` is returned when no token carries one of those tags.
    """
    wanted_tags = ("JJ", "VBN")
    words = [pair[0] for pair in tokenize(ingredient_name) if pair[1] in wanted_tags]
    if words:
        return ' '.join(words)
    return None

In [529]:
def check_noun_num(token_tag_pairs):
    """Return True when the tagged tokens contain at least two nouns.

    ``token_tag_pairs`` is an iterable of ``(token, tag)`` pairs. A pair
    counts as a noun when its tag is ``'NN'`` or ``'NNS'``. Two ``NN``, two
    ``NNS``, or one of each all satisfy the check.
    """
    # "NN >= 2", "NNS >= 2" and "both present" together mean NN + NNS >= 2
    noun_total = sum(1 for pair in token_tag_pairs if pair[1] in ('NN', 'NNS'))
    return noun_total >= 2

In [574]:
# run every extraction step on each scraped line and print the fields one per row
for line in ingredients:
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    ingredient_name = extract_ingredient_name(line)
    descriptor = extract_descriptor(ingredient_name)

    report = [
        ('ingredient name:', ingredient_name),
        ('descriptor:', descriptor),
        ('quantity:', quantity),
        ('measurement:', measurement),
        ('preparation:', preparation),
    ]

    print(line)
    for label, value in report:
        print(label, value)
    print()

# disabled noun-count experiment (belongs inside the loop when re-enabled):
#     token_tag_pairs = tokenize(line)
#     print(token_tag_pairs)
#     print(check_noun_num(token_tag_pairs))
#     print()

1 tablespoon extra virgin olive oil
ingredient name: extra virgin olive oil
descriptor: extra virgin
quantity: 1
measurement: tablespoon
preparation: None

8 ounces lasagna noodles
ingredient name: lasagna noodles
descriptor: None
quantity: 8
measurement: ounces
preparation: None

2 cloves garlic, crushed
ingredient name: garlic
descriptor: garlic
quantity: 2
measurement: cloves
preparation: crushed

1/2 teaspoon dried oregano
ingredient name: dried oregano
descriptor: dried
quantity: 1/2
measurement: teaspoon
preparation: None

1/2 teaspoon dried basil
ingredient name: dried basil
descriptor: dried
quantity: 1/2
measurement: teaspoon
preparation: None

1 teaspoon salt
ingredient name: salt
descriptor: None
quantity: 1
measurement: teaspoon
preparation: None

1 egg
ingredient name: egg
descriptor: None
quantity: 1
measurement: None
preparation: None

1/2 onion, chopped
ingredient name: onion
descriptor: None
quantity: 1/2
measurement: None
preparation: chopped

1 (32 ounce) jar spaghet