In [7]:
import pandas as pd
from recipe_scrapers import scrape_me
import time
import random
import json
import pymongo

In [13]:
def wait(min_seconds=2, max_seconds=8):
    """Sleep for a random whole number of seconds.

    Called before each page fetch in the scraping loops of this notebook.

    Args:
        min_seconds: Shortest pause in seconds, inclusive. Defaults to 2.
        max_seconds: Upper bound in seconds, exclusive. Defaults to 8, so a
            bare ``wait()`` sleeps between 2 and 7 seconds.

    Raises:
        ValueError: If ``max_seconds`` is not greater than ``min_seconds``
            (raised by ``random.randrange`` for an empty range).
    """
    delay = random.randrange(min_seconds, max_seconds)
    # '\r' moves the cursor back to the start of the line so the next print
    # overwrites this status message instead of adding a new line.
    print(f"Waiting {delay}", end='\r')
    time.sleep(delay)

In [9]:
# MongoDB handles: local server -> 'food_analysis' database -> 'urls' collection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database('food_analysis')
urls = db.get_collection('urls')

# Ongoing Sources
Sites whose recipe databases are too large to scrape in one or two sessions.

### AllRecipes

In [6]:
# Exploratory fetch: inspect the link dicts found on one search-results page
search_url = 'https://www.epicurious.com/search/?content=recipe&sort=mostReviewed&page=2'
page = scrape_me(search_url)
page.links()

[{'href': '/', 'itemprop': 'url', 'title': 'Epicurious', 'data-reactid': '5'},
 {'href': '/recipes/food/views/peach-cobbler-102277', 'data-reactid': '71'},
 {'aria-label': 'Peach Cobbler',
  'class': ['photo-link'],
  'href': '/recipes/food/views/peach-cobbler-102277',
  'data-reactid': '88'},
 {'class': ['view-complete-item'],
  'href': '/recipes/food/views/peach-cobbler-102277',
  'itemprop': 'url',
  'title': 'Peach Cobbler',
  'data-reactid': '91'},
 {'class': ['view-complete-item'],
  'href': '/recipes/food/views/peach-cobbler-102277',
  'data-reactid': '96'},
 {'class': ['show-quick-view'],
  'href': '/recipes/food/views/peach-cobbler-102277',
  'title': 'Peach Cobbler',
  'data-reactid': '98'},
 {'href': '/recipes/food/views/beef-stroganoff-102134', 'data-reactid': '104'},
 {'aria-label': 'Beef Stroganoff',
  'class': ['photo-link'],
  'href': '/recipes/food/views/beef-stroganoff-102134',
  'data-reactid': '121'},
 {'class': ['view-complete-item'],
  'href': '/recipes/food/views

### Food.com

In [None]:
# Build Topic List
# Load the cached topic map if it exists; otherwise scrape food.com's A-Z
# topic index pages and cache the result as JSON.
# Shape: {letter: {topic_href: 1}} -- every topic starts with the value 1.
try:
    # `with` guarantees the file handle is closed once the JSON is parsed.
    with open('./scraped_urls/food_com_topics.json') as topic_file:
        topics = json.load(topic_file)
except (OSError, ValueError):
    # OSError: cache file missing or unreadable.
    # ValueError: cache is not valid JSON (json.JSONDecodeError subclasses it).
    # Anything else (e.g. KeyboardInterrupt) propagates instead of being swallowed.
    topics = {}
    alphabet = list('abcdefghijklmnopqrstuvwxyz')

    for letter in alphabet:
        path = 'https://www.food.com/topic/'+letter
        topics[letter] = {}
        wait()

        # Keep every anchor whose href points at a topic page.
        for link in scrape_me(path).links():
            if 'topic/' in link['href']:
                topics[letter][link['href']] = 1

    with open('./scraped_urls/food_com_topics.json', 'w') as json_file:
        json.dump(topics, json_file)

In [None]:
# Easy Page structure
# For every food.com topic, resume from its saved page counter and scrape at
# most `num_pages` result pages, recording each recipe link in `urls`.
# A topic's counter is replaced by 'end' once a page looks empty.
num_pages = 12
# A page with fewer links than this is treated as past the last results page.
# NOTE(review): 40 is a heuristic carried over unchanged; confirm it still
# matches the site's layout.
min_links = 40

# Ids already stored for this source. Rebuilt from the database so this cell
# runs on a fresh kernel instead of relying on a leftover `recipes` variable.
recipes = [doc["_id"] for doc in urls.find({"source": "food_com"}, {"_id": 1})]

for letter in topics:
    for topic in topics[letter]:

        c_name = topic.split('www.food.com/topic/')[1]
        if topics[letter][topic] == 'end':
            print(topic + " is Complete.")
            continue

        start = topics[letter][topic]
        stop = start + num_pages

        print("Scraping "+topic + " From " + str(start) + " to " + str(stop))
        for page_number in range(start, stop):
            wait()
            page = scrape_me(topic+"?pn="+str(page_number)).links()

            if len(page) < min_links:
                topics[letter][topic] = 'end'
            else:
                for link in page:
                    if 'recipe/' not in link['href']:
                        continue
                    recipe = link['href']

                    if recipe in recipes:
                        # $addToSet appends the topic only when it is not
                        # already present, in one server-side operation; a
                        # missing document is simply a no-op.
                        urls.update_one({"_id": recipe},
                                        {"$addToSet": {"type": c_name}})
                    else:
                        # Last path segment looks like "<slug>-<id>": the slug
                        # becomes the name, the trailing token the website id.
                        slug_parts = recipe.split("/")[-1].split("-")
                        urls.insert_one({'_id': recipe,
                                         'name': "-".join(slug_parts[:-1]),
                                         'read': False,
                                         'type': [c_name],
                                         'source': 'food_com',
                                         'website_id': slug_parts[-1]})
                        recipes.append(recipe)

                topics[letter][topic] = page_number + 1

            # Save after every page -- including when the topic was just
            # marked 'end' -- so an interrupted run loses no progress.
            with open('./scraped_urls/food_com_topics.json', 'w') as json_file:
                json.dump(topics, json_file)

            if topics[letter][topic] == 'end':
                break

        print(f"Total Scraped at {len(recipes)}.")
        # count_documents counts server-side instead of loading every document.
        print(f"Total in Database at {urls.count_documents({'source':'food_com'})}.")

# One-off Sites
Sites with few enough recipes to scrape in one or two sessions.

### Epicurious

In [14]:
# Preload the ids already stored for this source so the scrape loop can skip them
source = 'epicurious'
recipe_list = [doc["_id"] for doc in urls.find({"source":source}, {"_id":1})]

In [18]:
# Walk a window of Epicurious search-result pages and store every recipe URL
# that is not already in `recipe_list`.
first_page, stop_page = 2030, 2050  # stop_page is exclusive, as in range()
base_url = "https://www.epicurious.com"

for i in range(first_page, stop_page):
    # The bound comes from the same variable as the range, so the progress
    # line cannot drift out of sync with the pages actually requested.
    print(f"Page {i}/{stop_page}")
    wait()

    page = scrape_me(base_url+'/search/?content=recipe&sort=mostReviewed&page='+str(i)).links()
    for link in page:
        # Recipe anchors carry `itemprop` and an href under /recipes/. The
        # site-logo anchor (href '/') also carries `itemprop`, so checking
        # `itemprop` alone would store the home page as a recipe.
        if 'itemprop' not in link or not link['href'].startswith('/recipes/'):
            continue

        recipe = base_url+link['href']
        if recipe in recipe_list:
            continue

        # .get() keeps one anchor lacking an optional attribute from aborting
        # the whole run with a KeyError; the field is stored as None instead.
        urls.insert_one({"_id":recipe,
                         "name":link.get('title'),
                         'read':False,
                         'type':[],
                         'source':source,
                         'website_id':link['href'].split("/")[-1].split("-")[-1],
                         'react_id':link.get('data-reactid')})
        recipe_list.append(recipe)
    print(f"Recipes Scraped: {len(recipe_list)}")

Page 2000/1800
Recipes Scraped: 36001
Page 2001/1800
Recipes Scraped: 36019
Page 2002/1800
Recipes Scraped: 36037
Page 2003/1800
Recipes Scraped: 36055
Page 2004/1800
Recipes Scraped: 36073
Page 2005/1800
Recipes Scraped: 36091
Page 2006/1800
Recipes Scraped: 36109
Page 2007/1800
Recipes Scraped: 36127
Page 2008/1800
Recipes Scraped: 36145
Page 2009/1800
Recipes Scraped: 36163
Page 2010/1800
Recipes Scraped: 36181
Page 2011/1800
Recipes Scraped: 36199
Page 2012/1800
Recipes Scraped: 36217
Page 2013/1800
Recipes Scraped: 36235
Page 2014/1800
Recipes Scraped: 36253
Page 2015/1800
Recipes Scraped: 36271
Page 2016/1800
Recipes Scraped: 36289
Page 2017/1800
Recipes Scraped: 36307
Page 2018/1800
Recipes Scraped: 36325
Page 2019/1800
Recipes Scraped: 36343
Page 2020/1800
Recipes Scraped: 36361
Page 2021/1800
Recipes Scraped: 36379
Page 2022/1800
Recipes Scraped: 36397
Page 2023/1800
Recipes Scraped: 36415
Page 2024/1800
Recipes Scraped: 36433
Page 2025/1800
Recipes Scraped: 36451
Page 2026/18

KeyboardInterrupt: 

### Great British Chefs