In [9]:
import pandas as pd
import requests
import bs4
import time
import numpy as np

In [10]:
# Download the "mains" category listing page; the recipe links are pulled
# out of it in the following cells.
url = 'https://www.jamieoliver.com/recipes/category/course/mains/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url=url, timeout=30)
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed in this environment.
parser = bs4.BeautifulSoup(response.text, 'html.parser')

In [11]:
# Display the HTTP status code of the listing request (bare expression, so the notebook renders its value).
response.status_code


200

In [12]:
# Every anchor on the listing page (not used by the cells visible here).
links = parser.find_all("a")
# The <div> whose anchors are read as recipe links in the next cell.
recipes = parser.find('div', attrs={'class': 'row recipe-row infinite-scroll'})
# find() returns None when nothing matches; fail here with a clear message
# instead of an opaque AttributeError in the next cell.
if recipes is None:
    raise ValueError(
        "Recipe container 'row recipe-row infinite-scroll' was not found on the listing page"
    )

In [13]:
# Gather the href of every anchor inside the recipe container, then wrap
# the values in a Series so the string accessors can be used on them.
anchor_hrefs = []
for anchor in recipes.find_all("a"):
    anchor_hrefs.append(anchor.get("href"))
recipe_urls = pd.Series(anchor_hrefs)
recipe_urls

0         /recipes/vegetable-recipes/asparagus-stir-fry/
1          /recipes/fish-recipes/sweet-potato-fishcakes/
2          /recipes/chicken-recipes/spring-chicken-stew/
3              /recipes/chicken-recipes/chicken-goujons/
4      /recipes/vegetable-recipes/sweet-and-sour-stir...
                             ...                        
898    /recipes/chicken-recipes/chicken-squash-caccia...
899    /recipes/vegetables-recipes/apple-amp-lancashi...
900    /recipes/chicken-recipes/all-in-one-rice-amp-c...
901          /recipes/vegetables-recipes/aubergine-daal/
902    https://www.jamieoliver.com/recipes/category/c...
Length: 903, dtype: object

In [14]:
# Keep only the site-relative recipe paths (the listing also contains at
# least one absolute "https..." link) and drop duplicates.
# Wrapping in pd.Series makes the cell safe to re-run: after the first run
# recipe_urls is a NumPy array, which has no .str accessor.
recipe_urls = pd.Series(recipe_urls)
# na=True marks missing hrefs as "starts with https" so the negation drops
# them, matching the previous `== False` comparison.
recipe_urls = recipe_urls[~recipe_urls.str.startswith("https", na=True)].unique()

In [15]:
# Display how many entries recipe_urls currently holds.
len(recipe_urls)

901

In [16]:
# Turn the relative paths into absolute URLs by prefixing the site root,
# and hold them in a one-column DataFrame named "recipe_urls".
relative_paths = pd.Series(recipe_urls, name="recipe_urls").astype('str')
recipe_url_df = ("https://www.jamieoliver.com" + relative_paths).to_frame()

In [17]:
# Persist the URL table as a tab-separated file, without the index column.
recipe_urls_csv = r"/Users/arina/study/ds/project/food_recommendation/input/oliver/recipe_urls.csv"
recipe_url_df.to_csv(recipe_urls_csv, index=False, sep="\t")

In [19]:
# Display the DataFrame of absolute recipe URLs.
recipe_url_df

Unnamed: 0,recipe_urls
0,https://www.jamieoliver.com/recipes/vegetable-...
1,https://www.jamieoliver.com/recipes/fish-recip...
2,https://www.jamieoliver.com/recipes/chicken-re...
3,https://www.jamieoliver.com/recipes/chicken-re...
4,https://www.jamieoliver.com/recipes/vegetable-...
...,...
896,https://www.jamieoliver.com/recipes/vegetables...
897,https://www.jamieoliver.com/recipes/chicken-re...
898,https://www.jamieoliver.com/recipes/vegetables...
899,https://www.jamieoliver.com/recipes/chicken-re...


In [None]:
# Fetch one recipe page and print the text of its first <h1>.
url = 'https://www.jamieoliver.com/recipes/pasta-recipes/beautiful-courgette-penne-carbonara/'
# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
# Name the parser explicitly so the result does not depend on which
# optional parser libraries are installed.
soup = bs4.BeautifulSoup(page.content, 'html.parser')
print(soup.find("h1").text.strip())

<h1 class="hidden-xs">Beautiful courgette carbonara</h1>


In [None]:
# Read each ingredient list item and collapse any run of whitespace in its
# text to a single space.
ingredients = [
    " ".join(item.text.split())
    for item in soup.select(".ingred-list li")
]
print(ingredients)

['6 medium green and yellow courgettes', '500 g penne', '4 large eggs', '100 ml single cream', '1 small handful of Parmesan cheese', 'olive oil', '6 slices of back bacon', '½ a bunch of fresh thyme , (15g)', 'a few courgette flowers , (optional)']


In [24]:
class JamieOliver():
    """Scrape selected attributes from a single recipe page.

    The page is downloaded and parsed once, when the object is created; each
    accessor then reads from the parsed document. Accessors return ``np.nan``
    when the element they look for is missing, or when its text does not
    contain the label they split on.
    """

    # Failures that mean "this field is not on the page": find() returned
    # None (AttributeError) or the split produced no second part (IndexError).
    # Anything else is a real error and is allowed to propagate.
    _MISSING = (AttributeError, IndexError)

    def __init__(self, url, timeout=30, parser='html.parser'):
        """Download ``url`` and parse the response body.

        Args:
            url: Address of the recipe page.
            timeout: Seconds to wait for the server; without a timeout a
                stalled connection would block indefinitely.
            parser: Name of the BeautifulSoup parser. Given explicitly so the
                result does not depend on which parsers are installed.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.soup = bs4.BeautifulSoup(response.content, parser)

    def recipe_name(self):
        """Return the stripped text of the first ``<h1>``, or ``np.nan``."""
        try:
            return self.soup.find('h1').text.strip()
        except self._MISSING:
            return np.nan

    def serves(self):
        """Return the text after the first space in the "serves" block, or ``np.nan``."""
        try:
            return self.soup.find('div', {'class': 'recipe-detail serves'}).text.split(' ', 1)[1]
        except self._MISSING:
            return np.nan

    def cooking_time(self):
        """Return the text following 'In' in the "time" block, or ``np.nan``."""
        try:
            return self.soup.find('div', {'class': 'recipe-detail time'}).text.split('In')[1]
        except self._MISSING:
            return np.nan

    def difficulty(self):
        """Return the text following 'Difficulty' in the details column, or ``np.nan``."""
        try:
            details = self.soup.find(
                'div',
                {'class': 'col-md-12 recipe-details-col remove-left-col-padding-md'},
            )
            return details.text.split('Difficulty')[1]
        except self._MISSING:
            return np.nan

    def ingredients(self):
        """Return the ingredient lines as a list of strings.

        Whitespace runs inside each item are collapsed to single spaces. The
        list is empty when the page has no ``.ingred-list li`` elements.
        """
        try:
            return [' '.join(li.text.split()) for li in self.soup.select('.ingred-list li')]
        except AttributeError:
            return np.nan

In [28]:

# Read the CSV of recipe URLs. The file was written with sep="\t", so read
# it back with the same separator.
recipe_df = pd.read_csv(
    "/Users/arina/study/ds/project/food_recommendation/input/oliver/recipe_urls.csv",
    sep="\t",
)
# The recipe attributes to scrape; each name is a zero-argument method of
# the JamieOliver scraper.
attribs = ['recipe_name', 'serves', 'cooking_time', 'difficulty', 'ingredients']

# Scrape each URL into one record. Records are collected in a list and turned
# into a DataFrame once at the end, rather than growing a frame row by row.
records = []
for i, url in enumerate(recipe_df['recipe_urls']):
    try:
        recipe_scraper = JamieOliver(url)
    except requests.RequestException as error:
        # One unreachable page should not discard everything scraped so far;
        # record an all-NaN row for it and carry on.
        print(f'Step {i} failed for {url}: {error}')
        record = dict.fromkeys(attribs, np.nan)
    else:
        record = {attrib: getattr(recipe_scraper, attrib)() for attrib in attribs}
    records.append(record)
    if i % 25 == 0:
        print(f'Step {i} completed')
    # Short pause between requests.
    time.sleep(0.005)

temp = pd.DataFrame(records, columns=attribs)

# Attach the source URL and move it to the first column.
temp['recipe_urls'] = recipe_df['recipe_urls']
columns = ['recipe_urls'] + attribs
temp = temp[columns]

Step 0 completed
Step 25 completed
Step 50 completed
Step 75 completed
Step 100 completed
Step 125 completed
Step 150 completed
Step 175 completed
Step 200 completed
Step 225 completed
Step 250 completed
Step 275 completed
Step 300 completed
Step 325 completed
Step 350 completed
Step 375 completed
Step 400 completed
Step 425 completed
Step 450 completed
Step 475 completed
Step 500 completed
Step 525 completed
Step 550 completed
Step 575 completed
Step 600 completed
Step 625 completed
Step 650 completed
Step 675 completed
Step 700 completed
Step 725 completed
Step 750 completed
Step 775 completed
Step 800 completed
Step 825 completed
Step 850 completed
Step 875 completed
Step 900 completed


In [29]:
# Save the scraped recipe table as CSV, leaving out the index column.
scraped_recipes_csv = r"/Users/arina/study/ds/project/food_recommendation/input/oliver/oliver_ds.csv"
temp.to_csv(scraped_recipes_csv, index=False)