In [1]:
import requests
from bs4 import BeautifulSoup
import re
import spacy
import json
import time
import urllib3
import urllib3.contrib.pyopenssl
urllib3.contrib.pyopenssl.inject_into_urllib3()
http = urllib3.PoolManager()

headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
headers['Accept'] = 'application/json, text/javascript'
# python3 -m spacy download en
nlp = spacy.load('en')

In [2]:
def tokenize(line):
    return [(token.text, token.tag_) for token in nlp(line)]

import nltk
# nltk.download('averaged_perceptron_tagger')

def tokenize_nltk(line):
    tokens = nltk.word_tokenize(line)
    token_tag_pairs = nltk.pos_tag(tokens)
    return token_tag_pairs

In [3]:
def extract_quantity_in_backets(line):
    # find '(abc)' where 'abc' is in arbitrary length
    pattern = re.compile(r'\([\w\s]*\)')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match

def extract_preparation(line):
    # find ', abc' where 'abc' is in arbitrary length
    pattern = re.compile(r', [\w\s]*')
    match = re.findall(pattern, line)
    if len(match) != 0:
        return match


def extract_quantity_measurement_preparation(line):
    quantity_split = []
    measurement = None

    # extract preparation
    preparation = extract_preparation(line)
    if preparation:
        line = re.sub(r'{0}'.format(preparation[0]), '', line)
        # remove ', ' prefix
        preparation = preparation[0][2:]

        # extract quantity in backets
    quantity_in_brackets = extract_quantity_in_backets(line)
    if quantity_in_brackets:
        line = re.sub(r'\({0}\)'.format(quantity_in_brackets[0]), '', line)
        quantity_in_brackets = quantity_in_brackets[0]

    # extract quantity from the first word
    line_split = line.split()
    quantity_split.append(line_split[0])

    # extract quantity from the second word if the word contains a digit
    if len(line_split) > 1:
        if any(char.isdigit() for char in line_split[1]):
            quantity_split.append(line_split[1])
            measurement = line_split[2]
        else:
            # to adjust for case like '1 egg' or '1/2 onion, chopped'
            if len(line_split) > 2:
                measurement = line_split[1]
    else:
        measurement = None

    # append quantity in backets at the end
    if quantity_in_brackets:
        quantity_split.append(quantity_in_brackets)

    return ' '.join(quantity_split), measurement, preparation


def extract_ingredient_name(line):
    quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
    if measurement is None:
        measurement = ''

    if preparation:
        return line[len(quantity + ' ' + measurement): -(len(preparation) + 2)].strip()
    else:
        return line[len(quantity + ' ' + measurement):].strip()

def extract_descriptor(ingredient_name):
    descriptor = []
    token_tag_pairs = tokenize(ingredient_name)
    for pair in token_tag_pairs:
        if pair[1] == "JJ" or pair[1] == "VBN":
            descriptor.append(pair[0])
    if len(descriptor) != 0:
        return ' '.join(descriptor)


def check_noun_num(token_tag_pairs):
    tag_num_dict = {}
    for pair in token_tag_pairs:
        if pair[1] not in tag_num_dict:
            tag_num_dict[pair[1]] = 1
        else:
            tag_num_dict[pair[1]] += 1

    criterion_1 = 'NN' in tag_num_dict and tag_num_dict['NN'] >= 2
    criterion_2 = 'NNS' in tag_num_dict and tag_num_dict['NNS'] >= 2
    criterion_3 = 'NN' in tag_num_dict and 'NNS' in tag_num_dict and tag_num_dict['NN'] + tag_num_dict['NNS'] >= 2

    if criterion_1 or criterion_2 or criterion_3:
        return True
    else:
        return False
def extract_ingredients_nouns(line):
    ingredients_nouns = set()
    token_tag_pairs = tokenize(line)
    for pair in token_tag_pairs:
        if pair[1] == 'NN' or pair[1] == 'NNS':
            # global ingredients_nouns
            ingredients_nouns |= {pair[0]}

    return list(ingredients_nouns)

def get_ingredient_lines_from_url(url):
    # page = http.request('GET', url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # extract ingredients section from the webpage
    ingredients = set([element.label.text.strip() for element in soup.find_all(class_='checkList__line')])

    # remove unnecessary elements
    ingredients.remove('')
    ingredients.remove('Add all ingredients to list')

    return ingredients

def get_ingredient_from_url(url):
    ingredients = get_ingredient_lines_from_url(url)

    ingredient_list = []
    seasonings = []

    for line in ingredients:
        quantity, measurement, preparation = extract_quantity_measurement_preparation(line)
        if ':' in line:
            # exceptions like "topping:"
            continue
        ingredient_name = extract_ingredient_name(line)
        descriptor = extract_descriptor(ingredient_name)

        # ingredient_list.extend(extract_ingredients_nouns(ingredient_name))
        if measurement is not None and ('teaspoon' in measurement or 'tablespoon' in measurement):
           seasonings.extend(extract_ingredients_nouns(ingredient_name))
        else:
            ingredient_list.append(ingredient_name)

    return ingredient_list, seasonings

def get_recipe_urls(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    tags = soup.find_all('div', class_='fixed-recipe-card__info')

    # pages = '?page=2'

    recipe_urls = []
    for tag in tags:
        url = tag.find('a')['href']
        print(url)
        recipe_urls.append(url)
    return recipe_urls


def get_all_ingredient(url, page = 1):
    ingredients = []
    seasonings = []
    recipe_urls = get_recipe_urls(url)
    for v_url in recipe_urls:
        temp, season= get_ingredient_from_url(v_url)
        ingredients += temp
        seasonings += season
        print(temp)
        time.sleep(3)
    print(ingredients)
    return ingredients, seasonings

In [4]:
# with open('healthy_scrape_ingredient.json', 'r') as f:
#     ingredients = json.load(f)
#     print(ingredients.keys())

In [5]:
ingredients = dict()
ingredients['healthy'] = []
ingredients['seasonings'] = []
seasonings = ingredients['seasonings']
page = '3'
temp = ingredients['healthy']
healthy_url = 'https://www.allrecipes.com/recipes/84/healthy-recipes/?page='+page
ingre, season = get_all_ingredient(healthy_url)
seasonings += season
temp += ingre
ingredients['healthy'] = temp
ingredients['seasonings'] = seasonings

https://www.allrecipes.com/recipe/16259/ds-famous-salsa/
https://www.allrecipes.com/recipe/16729/old-fashioned-potato-salad/
https://www.allrecipes.com/recipe/142614/grilled-fish-tacos-with-chipotle-lime-dressing/
https://www.allrecipes.com/recipe/229733/simple-roasted-butternut-squash/
https://www.allrecipes.com/recipe/73757/penne-pasta-with-spinach-and-bacon/
https://www.allrecipes.com/recipe/52635/low-fat-blueberry-bran-muffins/
https://www.allrecipes.com/recipe/57783/emilys-famous-hash-browns/
https://www.allrecipes.com/recipe/44975/easy-pizza-sauce-iii/
https://www.allrecipes.com/recipe/23336/fantastic-black-bean-chili/
https://www.allrecipes.com/recipe/7159/morning-glory-muffins-i/
https://www.allrecipes.com/recipe/93666/spinach-and-feta-pita-bake/
https://www.allrecipes.com/recipe/89195/zucchini-and-potato-bake/
https://www.allrecipes.com/recipe/46982/pesto-pasta-with-chicken/
https://www.allrecipes.com/recipe/54165/balsamic-bruschetta/
https://www.allrecipes.com/recipe/78023/be

In [9]:
while '' in ingredients['healthy']:
    ingredients['healthy'].remove('')

In [10]:
ingredients['healthy']

['canned sliced green chiles',
 'cans stewed tomatoes',
 'lime',
 'onion',
 'pepper to taste',
 'eggs',
 'sweet pickle relish',
 'potatoes',
 'chopped onion',
 'mayonnaise',
 'chopped celery',
 'tilapia fillets',
 'extra virgin olive oil',
 'tortillas',
 'cilantro',
 'light sour cream',
 'tomatoes',
 'pepper to taste',
 'limes',
 'adobo sauce from chipotle peppers',
 'head cabbage',
 'garlic',
 'squash - peeled, seeded, and cut into 1-in',
 'ground black pepper to taste',
 'garlic',
 'penne pasta',
 'fresh spinach, rinsed and',
 'bacon',
 'can diced tomatoes',
 'brown sugar',
 'blueberries',
 'whole wheat flour',
 'nonfat milk',
 'all-purpose flour',
 'egg',
 'unsweetened applesauce',
 'wheat bran',
 'pepper to taste',
 'onion',
 'egg',
 'russet potatoes',
 'all-purpose flour',
 'oil for frying',
 'tomato sauce',
 'tomato paste',
 'ground turkey',
 'can crushed tomatoes',
 'black beans',
 'garlic',
 'onion',
 'all-purpose flour',
 'apple butter',
 'vegetable oil',
 'white sugar',
 'rai

In [13]:
import json

In [14]:
file_name = 'healthy_ingredient.json'
with open(file_name,'w') as file_object:
    json.dump(ingredients,file_object)