In [77]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import time

###  Helper functions for http requests

In [2]:
def get_raw_html(url):
    """
    Use HTTP GET request to get raw html content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if response is succesful (status 200) and looks like html content,
    False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    Keep track of errors
    TODO: expand this function to handle issues
    """
    print(e)

### Get recipe's html content

In [3]:
url = 'https://www.allrecipes.com/recipe/60598/vegetarian-korma/?internalSource=hub%20recipe&referringContentType=Search'
raw_html = simple_get(url)
# apply html parser
soup = BeautifulSoup(raw_html, 'html.parser')

### Extract values

#### Get tittle

In [57]:
title = soup.title.text[:-17] # -17 to remove site's name
title

'Vegetarian Korma Recipe'

#### Get ingredients

In [13]:
li_checklist = soup.find_all('li', attrs={"class": "checkList__line"})
ingredients = [li.span.text for li in li_checklist]

#### Get time info

In [50]:
t_prep = soup.find('time', attrs={"itemprop": "prepTime"})
t_cook = soup.find('time', attrs={"itemprop": "prepTime"})
t_total = soup.find('time', attrs={"itemprop": "prepTime"})

time_info = {
    # Transform to int and remove ' m' at the end (with -2)
    "prep_time": int(t_prep.text[:-2]),
    "cook_time": int(t_cook.text[:-2]),
    "total_time": int(t_total.text[:-2]),
}
time_info

{'prep_time': 25, 'cook_time': 25, 'total_time': 25}

##  Scrapper func for All recipes

In [92]:
def scrap_allrecipes(url):
    """
    Takes a URL for a recipe from All Rrecipes and extracts title, time info, and ingredients
    
    Sample URL: https://www.allrecipes.com/recipe/60598/vegetarian-korma/?internalSource=hub%20recipe&referringContentType=Search
    """
    t_start = time.time()
    print("Scrapping begins...")
    print(url)
    
    # Get Recipes html content 
    raw_html = simple_get(url)
    soup = BeautifulSoup(raw_html, 'html.parser')

    # get title
    title = soup.title.text[:-17] # -17 to remove site's name

    # get ingredients
    li_checklist = soup.find_all('li', attrs={"class": "checkList__line"})
    ingredients = [li.span.text for li in li_checklist]

    # get time info
    t_prep = soup.find('time', attrs={"itemprop": "prepTime"})
    t_cook = soup.find('time', attrs={"itemprop": "prepTime"})
    t_total = soup.find('time', attrs={"itemprop": "prepTime"})

    time_info = {
        # Transform to int and remove ' m' at the end (with -2)
        "prep_time": int(t_prep.text[:-2]),
        "cook_time": int(t_cook.text[:-2]),
        "total_time": int(t_total.text[:-2]),
    }

    # Output
    recipe_info = {
        'title': title,
        'ingredients': ingredients[:-3], # remove last 3 items, non-ingredients
        'time_info': time_info,
        'url': url
    }
    print("Finished! (%s)" %(time.time()-t_start))
    return recipe_info

In [72]:
recipes_url = [
    'https://www.allrecipes.com/recipe/244204/miso-oatmeal-bowl/',
    'https://www.allrecipes.com/recipe/16641/red-lentil-curry/',
    'https://www.allrecipes.com/recipe/60598/vegetarian-korma/',
    'https://www.allrecipes.com/recipe/257938/spicy-thai-basil-chicken-pad-krapow-gai/',
    'https://www.allrecipes.com/recipe/246628/spaghetti-cacio-e-pepe'
]

In [93]:
top_5 = [scrap_allrecipes(recipe) for recipe in recipes_url]
len(top_5)

Scrapping begins...
https://www.allrecipes.com/recipe/244204/miso-oatmeal-bowl/
Finished! (1.2146224975585938)
Scrapping begins...
https://www.allrecipes.com/recipe/16641/red-lentil-curry/
Finished! (1.2144250869750977)
Scrapping begins...
https://www.allrecipes.com/recipe/60598/vegetarian-korma/
Finished! (1.2152290344238281)
Scrapping begins...
https://www.allrecipes.com/recipe/257938/spicy-thai-basil-chicken-pad-krapow-gai/
Finished! (1.172544240951538)
Scrapping begins...
https://www.allrecipes.com/recipe/246628/spaghetti-cacio-e-pepe
Finished! (1.2995188236236572)


5

In [95]:
top_5[2]

{'title': 'Vegetarian Korma Recipe',
 'ingredients': ['1 1/2 tablespoons vegetable oil',
  '1 small onion, diced',
  '1 teaspoon minced fresh ginger root',
  '4 cloves garlic, minced',
  '2 potatoes, cubed',
  '4 carrots, cubed',
  '1 fresh jalapeno pepper, seeded and sliced',
  '3 tablespoons ground unsalted cashews',
  '1 (4 ounce) can tomato sauce',
  '2 teaspoons salt',
  '1 1/2 tablespoons curry powder',
  '1 cup frozen green peas',
  '1/2 green bell pepper, chopped',
  '1/2 red bell pepper, chopped',
  '1 cup heavy cream',
  '1 bunch fresh cilantro for garnish'],
 'time_info': {'prep_time': 25, 'cook_time': 25, 'total_time': 25},
 'url': 'https://www.allrecipes.com/recipe/60598/vegetarian-korma/'}