In [12]:
import pandas as pd
from recipe_scrapers import scrape_me
import time
import random
import json
import pymongo
from tqdm.notebook import tqdm

In [5]:
def wait():
    
    x = random.randrange(1, 200, 1)/100
    print(f"Waiting for {x} Seconds.", end='\r')
    time.sleep(x)

In [6]:
# Db Information
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['food_analysis']
urls = db['urls']
recipes = db['recipes']

In [14]:
list(urls.find({"read":True}))

[{'_id': 'https://www.food.com/recipe/elephants-mudbath-217537',
  'name': 'elephants-mudbath',
  'read': True,
  'type': ['african'],
  'source': 'food_com',
  'website_id': '217537'},
 {'_id': 'https://www.food.com/recipe/olive-garden-asparagus-with-lemon-and-minced-onions-351063',
  'name': 'olive-garden-asparagus-with-lemon-and-minced-onions',
  'read': True,
  'type': ['asparagus', 'asparagus-side-dish'],
  'source': 'food_com',
  'website_id': '351063'},
 {'_id': 'https://www.food.com/recipe/quick-pizza-sauce-with-fennel-409930',
  'name': 'quick-pizza-sauce-with-fennel',
  'read': True,
  'type': ['pizza-sauce'],
  'source': 'food_com',
  'website_id': '409930'}]

# Scraper
Take the url database and start retrieving the actual recipe contents to store in a new database dedicated to the recipes. Given that this database will form the basis for an actual app and analysis the key unique id will be generated by Mongo and keep close tabs with the url database to ensure no read recipes are reread.

In [8]:
eligible_urls = [x["_id"] for x in list(urls.find({"read":False}, {"_id":1}))]
eligible_urls = sorted(eligible_urls, key = lambda x: random.random())

In [13]:
for url in tqdm(eligible_urls):
    
    # Obtain existing data
    print(url)
    row = list(urls.find({"_id":url}))[0]
    
    # Pause and then scrape
    wait()
    scraper = scrape_me(url)
    
    # Attempt each piece of data
    
    # Title
    try:
        title = scraper.title()
    except:
        try:
            title = row['name']
        except:
            title = None
            
    # Total Time
    try:
        total_time = scraper.total_time()
    except:
        total_time = None
        
    # Yields
    try:
        yields = scraper.yields()
    except:
        yields = None
        
    # Ingredients
    try:
        ingredients = scraper.ingredients()
    except:
        ingredients = []
        
    # Instructions
    try:
        instructions = scraper.instructions()
    except:
        instructions = []
        
    # Image
    try:
        image = scraper.image()
    except:
        image = None
        
    # Ratings
    try:
        rating = scraper.ratings()
    except:
        rating = None
        
    # Author
    try:
        author = scraper.author()
    except:
        try:
            author = row['author']
        except:
            author = None
            
    # Reviews
    try:
        reviews = scraper.reviews()
    except:
        reviews = None
    
    # Insert new data
    recipes.insert_one({"title":title,
                        "total_time":total_time,
                        "yields":yields,
                        "ingredients":ingredients,
                        "instructions":instructions,
                        "image":image,
                        "rating":rating,
                        "author":author,
                        "reviews":reviews,
                        "source":row['source'],
                        "url":row["_id"]})
    
    # Mark as read
    query = {"_id":url}
    newvalues = {"$set":{"read":True}}
    urls.update_one(query, newvalues)

HBox(children=(FloatProgress(value=0.0, max=220842.0), HTML(value='')))

https://www.food.com/recipe/quick-pizza-sauce-with-fennel-409930
https://www.food.com/recipe/elephants-mudbath-217537
https://www.food.com/recipe/olive-garden-asparagus-with-lemon-and-minced-onions-351063
//www.foodnetwork.com/recipes/aaron-mccargo-jr/apple-cranberry-bread-pudding-recipe-1946155
Waiting for 0.66 Seconds.

WebsiteNotImplementedError: Website () is not supported