In [1]:
import pandas as pd
from recipe_scrapers import scrape_me
import time
import random
import json
import pymongo
from tqdm.notebook import tqdm
import numpy as np

In [2]:
def wait():
    """Pause for a short, randomly chosen interval and announce its length.

    The interval is picked in hundredths of a second, from 0.50 up to 1.99.
    The message ends with a carriage return instead of a newline, so the next
    thing printed starts at the beginning of the same line.
    """
    pause = random.randrange(50, 200) / 100
    message = f"Waiting for {pause} Seconds."
    print(message, end='\r')
    time.sleep(pause)

In [3]:
# Db Information
# One client for the local MongoDB server, plus handles to the two
# collections the rest of the notebook reads from and writes to.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database('food_analysis')
urls = db.get_collection('urls')
recipes = db.get_collection('recipes')

# Scraper
Take the URL collection and retrieve the actual recipe contents, storing them in a separate collection dedicated to recipes. Because the recipe collection will form the basis for an actual app and analysis, its unique key (`_id`) is generated by Mongo. The URL collection is kept closely in sync with it so that recipes that have already been read are not read again.

In [None]:
def _safe_call(getter, default=None):
    """Return ``getter()``, or ``default`` if the call raises.

    Used for the individual recipe fields, where a missing or unparsable
    field should not abort the whole recipe. Only ``Exception`` subclasses
    are swallowed, so interrupting the kernel (KeyboardInterrupt) still
    stops the run instead of being recorded as a missing field.
    """
    try:
        return getter()
    except Exception:
        return default


# Ids of the URL documents not yet marked as read, visited in random order.
eligible_urls = [doc["_id"] for doc in urls.find({"read": False}, {"_id": 1})]
random.shuffle(eligible_urls)

for url_id in tqdm(eligible_urls):

    # Obtain existing data
    print(url_id)
    row = urls.find_one({"_id": url_id})
    if row is None:
        # The document disappeared after the id list was built; there is
        # nothing left to scrape or to flag.
        continue

    try:
        # Build the address to fetch WITHOUT rebinding url_id: the stored
        # _id must stay untouched so the read/error updates below still
        # match the document. Ids that do not start with 'h' get "https:"
        # prepended.
        scrape_url = url_id if url_id.startswith('h') else "https:" + url_id

        # Pause and then scrape
        wait()
        scraper = scrape_me(scrape_url)

        # Attempt each piece of data; a field that cannot be extracted falls
        # back to its default instead of failing the whole recipe.

        # Title / Author fall back to the values stored with the URL, if any
        title = _safe_call(lambda: scraper.title(), row.get('name'))
        author = _safe_call(lambda: scraper.author(), row.get('author'))

        total_time = _safe_call(lambda: scraper.total_time())
        yields = _safe_call(lambda: scraper.yields())
        ingredients = _safe_call(lambda: scraper.ingredients(), [])

        # Instructions are stored as UTF-8 bytes; 'surrogatepass' lets lone
        # surrogate code points be encoded instead of raising.
        instructions = _safe_call(
            lambda: scraper.instructions().encode('utf-8', 'surrogatepass'))

        image = _safe_call(lambda: scraper.image())
        rating = _safe_call(lambda: scraper.ratings())
        reviews = _safe_call(lambda: scraper.reviews())

        # Insert new data
        recipes.insert_one({"title": title,
                            "total_time": total_time,
                            "yields": yields,
                            "ingredients": ingredients,
                            "instructions": instructions,
                            "image": image,
                            "rating": rating,
                            "author": author,
                            "reviews": reviews,
                            "source": row['source'],
                            "url": row["_id"]})

        # Mark as read
        urls.update_one({"_id": url_id}, {"$set": {"read": True}})

    except Exception as exc:

        # Mark as Error (the URL stays unread, so a later run retries it)
        print(f"Error encountered. {type(exc).__name__}: {exc}")
        urls.update_one({"_id": url_id}, {"$set": {"error": True}})

HBox(children=(FloatProgress(value=0.0, max=6884.0), HTML(value='')))

https://www.epicurious.com/recipes/food/views/sauteed-halibut-with-arugula-roasted-beets-and-horseradish-creme-fraiche-358351
https://www.cookstr.com/recipes/pappardelle-with-escarole
https://www.epicurious.com/recipes/food/views/chicken-cutlets-marinated-in-lime-juice-106259
https://www.food.com/recipe/goat-cheese-cakes-to-top-your-salad-307129
https://www.food.com/recipe/izakaya-sakura-sesame-avocado-brown-rice-495768
https://www.foodnetwork.com/recipes/robin-miller/reinvented-green-beans-with-orange-vinaigrette-recipe-1944999
https://www.food.com/recipe/quakers-ham-salad-223868
https://www.foodnetwork.com/recipes/alton-brown/coffee-granita-recipe2-1909813
https://www.food.com/recipe/crispy-chicken-thighs-56403
https://www.epicurious.com/recipes/food/views/sesame-tuna-burgers-with-fried-shoestring-zucchini-235476
https://www.lecremedelacrumb.com/summer-melon-fruit-salad/
https://www.food.com/recipe/jalapeno-corn-casserole-475267
https://www.food.com/recipe/sherried-butter-nut-drops-4

https://www.food.com/recipe/one-pot-chicken-broccoli-in-white-wine-sauce-130969
https://www.food.com/recipe/spinach-salad-dressing-241559
https://www.epicurious.com/recipes/food/views/seafood-salad-352250
https://www.food.com/recipe/mashed-potato-salad-33925
https://www.foodnetwork.com/recipes/sunny-anderson/sunnys-tater-tot-pie-2737790
https://www.southernliving.com/recipes/chocolate-sugar-cookie-bars
https://www.southernliving.com/recipes/carrot-cake-cheesecake
https://www.epicurious.com/recipes/food/views/fresh-strawberry-granita-232106
https://www.food.com/recipe/black-bean-salad-307203
https://www.epicurious.com/recipes/food/views/cauliflower-carbonara
https://www.food.com/recipe/sour-cream-blue-cheese-dip-230399
https://www.food.com/recipe/chicken-pot-pie-with-flaky-crust-472538
https://www.food.com/recipe/gingery-plum-jam-16169
https://www.food.com/recipe/apple-wheat-bread-bread-machine-357191
https://www.food.com/recipe/balsamic-roast-mushrooms-301344
https://www.food.com/recip

https://www.epicurious.com/recipes/food/views/fish-house-punch-ii-200760
https://www.closetcooking.com/cajun-bbq-shrimp-scampi-linguine/
https://www.food.com/recipe/winter-cold-chaser-chicken-stew-slow-cooker-402684
https://www.food.com/recipe/phyllo-topped-apple-pie-73026
https://www.epicurious.com/recipes/food/views/fresh-coconut-cake-103085
https://www.food.com/recipe/incredible-banana-coffee-cake-36374
https://www.food.com/recipe/slow-cooker-indian-rice-pudding-482859
https://www.food.com/recipe/chicken-n-pepper-stir-fry-243914
https://www.food.com/recipe/picante-chili-139943
https://www.cookstr.com/recipes/wild-rice-and-smoked-turkey-salad-with-dried-cranberries-and-toasted-hazelnuts
https://www.food.com/recipe/broiled-swordfish-steaks-with-horseradish-sauce-155728
https://www.food.com/recipe/simple-marinara-sauce-108317
https://www.epicurious.com/recipes/food/views/yellow-gazpacho-243271
https://www.epicurious.com/recipes/food/views/barbecued-pork-burgers-with-slaw-239296
https:/

https://www.food.com/recipe/ranch-salad-dressing-381074
https://www.jamieoliver.com/recipes/cheese-recipes/mozzarella-winter-fruit-salad/
https://www.foodnetwork.com/recipes/aarti-sequeira/chocolate-chip-date-french-toast-recipe-2042260
https://www.epicurious.com/recipes/food/views/albondigas-soup-25
https://www.foodnetwork.com/recipes/giada-de-laurentiis/spinach-and-mushroom-ravioli-recipe2-1953189
https://www.food.com/recipe/rachael-rays-yakitori-noodle-bowls-remixed-349562
https://www.gonnawantseconds.com/classic-pumpkin-pie/
https://www.food.com/recipe/the-worlds-best-bread-machine-pizza-dough-recipe-131607
https://www.food.com/recipe/alfredo-potato-casserole-71188
https://www.food.com/recipe/eye-of-round-crock-pot-roast-108277
https://www.cookstr.com/recipes/garlic-roasted-asparagus-with-almonds
https://www.food.com/recipe/crock-pot-corned-beef-cabbage-292159
https://www.food.com/recipe/beef-stroganoff-339049
https://copykat.com/cracker-barrel-cucumbers-tomatoes-and-onions/
https:

https://www.epicurious.com/recipes/food/views/raspberry-champagne-cocktail-102809
https://www.food.com/recipe/halloween-cookie-bark-488456
https://www.food.com/recipe/american-chop-suey-11088
https://www.food.com/recipe/wild-rice-in-vermouth-457196
https://www.food.com/recipe/natasha-a-refreshing-drink-475686
https://www.food.com/recipe/asparagus-sauteed-in-butter-and-mustard-478829
https://www.food.com/recipe/irish-mist-bars-278027
https://www.epicurious.com/recipes/food/views/xiao-jianmings-spareribs-with-chiles-107983
https://www.epicurious.com/recipes/food/views/south-side-201040
https://www.food.com/recipe/creamy-lemon-chicken-rice-524076
https://www.food.com/recipe/fruit-kebabs-with-yogurt-and-honey-dip-173221
https://www.food.com/recipe/buffalo-chicken-and-noodle-ramekins-rsc-494609
https://www.food.com/recipe/swiss-meringue-buttercream-frosting-538063
https://www.food.com/recipe/mincemeat-apple-pie-192882
https://www.southernliving.com/recipes/make-ahead-yeast-rolls-recipe
http

https://www.food.com/recipe/pesto-baked-chicken-with-fresh-asparagus-523224
https://www.epicurious.com/recipes/food/views/braised-italian-style-pot-roast-51119610
https://www.food.com/recipe/mixed-greens-salad-with-grilled-halloumi-and-anaheim-chilis-432744
https://www.epicurious.com/recipes/food/views/scallops-and-cauliflower-with-caper-raisin-sauce-15422
https://www.food.com/recipe/jalapeno-popper-grilled-cheese-sandwich-471266
https://www.food.com/recipe/awesome-artichoke-dip-90079
https://www.food.com/recipe/limoncello-syllabub-with-crushed-amaretti-cookies-305662
https://www.food.com/recipe/bacon-and-sweet-corn-pasta-461613
https://www.101cookbooks.com/pasta-with-smashed-zucchini-cream/
https://www.food.com/recipe/roasted-asparagus-with-dijon-lemon-sauce-512508
https://copykat.com/north-woods-inn-garlic-cheese-bread-this-goes-so-well-with-a-bowl-of-home-made-soup/
https://www.food.com/recipe/fresh-ginger-relish-29522
https://www.epicurious.com/recipes/food/views/chili-chicken-wing

https://www.food.com/recipe/simple-hot-roast-beef-sandwiches-with-mushroom-au-jus-277128
https://www.epicurious.com/recipes/food/views/classic-french-fries-104163
https://www.epicurious.com/recipes/food/views/mashed-potatoes-363285
https://www.foodnetwork.com/recipes/patrick-and-gina-neely/ginas-citrus-wine-spritzer-recipe-1919554
https://www.foodnetwork.com/recipes/anne-burrell/olive-oil-cake-with-blueberries-and-mascarpone-recipe-1946850
https://www.mybakingaddiction.com/perfect-party-cupcakes/
https://www.food.com/recipe/sour-cream-zucchini-bread-385375
https://www.food.com/recipe/sauteed-vegetables-with-pan-grilled-chicken-and-eggplant-396570
https://www.cookstr.com/recipes/endive-salad-with-meyer-lemon-fava-beans-and-oil-cured-olives
Error encountered.econds.
https://www.food.com/recipe/sowf-calina-bbq-chicken-34995
https://www.food.com/recipe/orange-pumpkin-bread-25654
https://www.food.com/recipe/susies-bleu-cheese-ball-with-a-holiday-presentation-70072
https://www.food.com/recip

https://www.epicurious.com/recipes/food/views/hard-boiled-eggs-and-parmesan-on-toasted-sourdough-106956
https://www.food.com/recipe/1-2-3-cheddar-broccoli-casserole-348374
https://www.food.com/recipe/quick-and-easy-chicken-enchiladas-320929
https://www.food.com/recipe/three-meat-calzone-337423
https://www.food.com/recipe/baked-apple-for-one-101103
https://www.mybakingaddiction.com/banana-bars/
Error encountered.econds.
https://www.food.com/recipe/tomato-stew-lecso-496069
https://www.epicurious.com/recipes/food/views/dijon-mustard-vinaigrette-1859
https://www.food.com/recipe/homemade-pimiento-cheese-39091
https://hostthetoast.com/cheesesteak-potato-skins/
https://www.food.com/recipe/president-obamas-american-kumara-mash-sweet-potatoes-482343
https://www.epicurious.com/recipes/food/views/ancho-pasilla-sauce-14146
https://www.epicurious.com/recipes/food/views/spicy-broccoli-rabe-with-parmesan-and-pine-nuts-51243810
https://www.food.com/recipe/roasted-vegetable-couscous-salad-with-harissa-

https://www.food.com/recipe/pheasant-on-rice-278949
https://www.food.com/recipe/orange-beef-with-noodles-98262
https://www.food.com/recipe/cream-cheese-filled-cupcakes-353687
https://www.food.com/recipe/cheesy-chicken-enchiladas-betty-crocker-279223
https://www.food.com/recipe/strawberry-rhubarb-pineapple-fruit-leather-dehydrator-314104
https://minimalistbaker.com/vegan-cheddar-jalapeno-biscuits/
https://www.epicurious.com/recipes/food/views/roasted-asparagus-with-balsamic-vinegar-12074
https://www.food.com/recipe/easy-chutney-chicken-kebabs-22736
https://www.food.com/recipe/porcupine-meatballs-18434
https://www.food.com/recipe/chicken-kale-soup-294525
https://www.foodnetwork.com/recipes/michael-chiarello/raw-corn-arugula-and-pecorino-salad-with-grilled-chicken-breast-recipe-1945386
https://www.foodrepublic.com/recipes/hedgehog-hot-dogs/
https://www.food.com/recipe/octoberfest-german-potato-salad-239063
https://www.food.com/recipe/cheesy-chicken-casserole-147083
https://www.epicurious.

https://www.southernliving.com/recipes/pumpkin-spice-heath-cake
https://www.food.com/recipe/nanas-flu-remedy-208947
https://www.food.com/recipe/cheesy-drop-biscuits-85494
https://www.foodnetwork.com/recipes/giada-de-laurentiis/parmesan-and-prosciutto-spiced-plums-3118969
https://www.food.com/recipe/pineapple-upside-down-cake-easy-way-427333
https://www.epicurious.com/recipes/food/views/chili-cornmeal-muffins-15669
https://www.food.com/recipe/gluten-free-pinwheel-cookies-348010
https://www.food.com/recipe/stir-fried-sirloin-steak-w-brown-rice-218325
https://www.food.com/recipe/easy-brisket-for-holidays-or-anyday-333441
https://www.mybakingaddiction.com/rolo-dream-bars-recipe/
Error encountered.econds.
https://www.epicurious.com/recipes/food/views/smoked-scotch-eggs-395691
https://www.food.com/recipe/zuries-chicken-liver-pat-195826
https://www.food.com/recipe/baihian-halibut-138428
https://www.food.com/recipe/red-cabbage-apple-and-caraway-soup-10438
https://www.food.com/recipe/gluten-fre

https://www.foodnetwork.com/recipes/rachael-ray/pinwheel-steaks-recipe-2013571
https://www.food.com/recipe/mixed-berry-shortcakes-for-low-carb-diets-81527
https://www.cookstr.com/recipes/onion-rice
https://www.food.com/recipe/banana-cream-cake-252100
https://www.food.com/recipe/cabbage-apple-and-almond-slaw-475994
https://www.food.com/recipe/sakinas-banana-cake-with-with-semolina-194258
https://www.epicurious.com/recipes/food/views/radicchio-salad-with-oranges-and-olives-241089
https://www.food.com/recipe/breakfast-couscous-194482
https://www.epicurious.com/recipes/food/views/ruffled-pasta-with-herbed-ricotta-and-pine-nut-brown-butter-107414
https://www.food.com/recipe/slow-cooker-chicken-curry-497341
https://www.food.com/recipe/danish-blackberry-jam-cake-427785
https://www.food.com/recipe/layered-ground-beef-casserole-274490
https://www.food.com/recipe/blackened-portobello-mushroom-salad-380946
https://copykat.com/panera-bread-honey-walnut-cream-cheese-spread/
https://www.food.com/rec

https://www.food.com/recipe/chipotles-guacamole-recipe-306376
https://www.food.com/recipe/nanas-banana-bread-97591
https://www.food.com/recipe/kidney-bean-chili-indian-style-rajma-in-masala-47936
https://www.food.com/recipe/cheeseburger-paradise-soup-206915
https://www.food.com/recipe/moms-most-requested-chocolate-pie-86306
https://www.epicurious.com/recipes/food/views/charred-corn-salad-with-basil-and-tomatoes-51104300
https://www.food.com/recipe/mint-basting-sauce-for-grilling-meats-232543
https://www.epicurious.com/recipes/food/views/cilantro-chipotle-tilapia-351295
https://www.epicurious.com/recipes/food/views/coconut-date-nut-balls-11611
https://www.food.com/recipe/grilled-steaks-with-berry-sauce-62520
https://www.food.com/recipe/cracked-canoe-danish-fondue-441266
https://www.gimmesomeoven.com/mexican-macaroni-cheese-recipe/
https://www.food.com/recipe/halibut-with-vegetable-confetti-22950
https://www.food.com/recipe/mashed-cheesy-cauliflower-169067
https://www.southernliving.com/

https://www.food.com/recipe/french-onion-soup-413148
https://www.food.com/recipe/paula-deens-brown-sugar-bacon-392970
https://www.food.com/recipe/halloween-candy-cookies-447954
https://www.foodnetwork.com/recipes/dave-lieberman/arugula-and-pear-salad-with-dijon-sherry-vinaigrette-recipe-1917096
https://www.food.com/recipe/pretty-quick-cajun-chicken-and-pasta-189656
https://www.food.com/recipe/super-moist-cake-mix-lemon-pound-cake-362231
https://www.epicurious.com/recipes/food/views/miso-carbonara-with-broccoli-rabe-and-red-pepper-flakes-51261020
https://www.food.com/recipe/easy-baked-whitefish-356347
https://www.food.com/recipe/sams-caesar-salad-278389
https://www.food.com/recipe/mojito-chicken-breast-476006
https://minimalistbaker.com/easy-muhammara-dip/
https://www.food.com/recipe/futomaki-big-sushi-roll-393029
https://www.epicurious.com/recipes/food/views/smashed-fingerling-potatoes-233409
https://www.mybakingaddiction.com/chocolate-eggnog-waffles/
https://www.food.com/recipe/cherry

https://www.epicurious.com/recipes/food/views/green-onion-casserole-13469
https://www.food.com/recipe/ez-chicken-pot-pie-465095
https://www.epicurious.com/recipes/food/views/winter-white-salad
https://www.food.com/recipe/carnival-melting-chocolate-cake-539695
https://www.food.com/recipe/jam-tarts-416658
https://www.lecremedelacrumb.com/trillionaire-bars/
https://www.closetcooking.com/reuben-cauliflower-corned-beef-hash/
https://www.food.com/recipe/chili-under-pressure-476590
https://www.southernliving.com/syndication/lemon-bar-cheesecake
https://www.food.com/recipe/amys-creamy-jalape-o-pimiento-cheese-spread-167435
https://www.epicurious.com/recipes/food/views/cranberry-and-celery-relish-356089
https://www.closetcooking.com/black-and-blue-steak-salad-with/
https://www.food.com/recipe/southern-style-chicken-and-dumplings-for-beginners-415845
https://www.food.com/recipe/shoepeg-corn-casserole-359064
https://www.food.com/recipe/cucumber-and-onion-salad-399388
https://www.food.com/recipe/s

https://www.mybakingaddiction.com/nutella-chocolate-chip-pumpkin-bread/
https://www.cookstr.com/recipes/curried-chicken-salad-with-apples-and-currants
https://www.food.com/recipe/turkish-meatballs-kofta-83793
https://www.epicurious.com/recipes/food/views/saffron-quinoa-with-dried-cherries-and-almonds
https://www.foodnetwork.com/recipes/anne-burrell/sauteed-mustard-greens-recipe-1922016
https://www.food.com/recipe/pammys-ground-beef-stroganoff-101648
https://www.epicurious.com/recipes/food/views/zucchini-wrapped-red-snapper-with-tomato-cumin-and-orange-sauce-10045
https://www.food.com/recipe/a-bit-of-italy-lemon-drop-cookies-481585
https://www.food.com/recipe/hot-cheese-beef-dip-27213
https://www.food.com/recipe/pizza-pot-pie-82278
https://www.food.com/recipe/lower-carb-pancakes-for-one-89493
https://www.food.com/recipe/chicken-pasta-salad-94805
https://www.food.com/recipe/chicken-peppers-stir-fry-4-ppv-494354
https://www.food.com/recipe/banana-chip-muffins-224321
https://www.epicurious

# Database Cleaning and Maintenance

In [4]:
# Print Overview
# count_documents has the server do the counting, instead of pulling every
# matching document into memory just to take the length of the list.
total = urls.count_documents({})
read = urls.count_documents({"read": True})
togo = urls.count_documents({"read": False})
errors = urls.count_documents({"error": True})
in_recipes = recipes.count_documents({})

print(f"URL Database: {total}")
print(f"Read: {read}")
print(f"To Read: {togo}")
print(f"Errors: {errors}")
print(f"Recipes Database: {in_recipes}")

URL Database: 226844
Read: 219960
To Read: 6884
Errors: 101
Recipes Database: 226630


### Duplicate Cleaning
At this stage these should be true duplicates. If a URL is found more than once, the first document is kept; which one comes first is arbitrary.

In [53]:
# Remove Duplicates (true dups, everything is identical)
# Only _id and url are needed to find duplicates, so project them instead of
# loading every full recipe document. Naming the columns explicitly keeps
# the frame well-formed when the collection is empty.
url_index = pd.DataFrame(list(recipes.find({}, {"url": 1})), columns=["_id", "url"])
url_counts = url_index.groupby("url")["_id"].count()
dups = url_counts[url_counts > 1].index.to_numpy()
for url in dups:
    # Keep the first document returned for this url and drop the rest.
    delete = [doc["_id"] for doc in recipes.find({"url": url}, {"_id": 1})][1:]
    if delete:
        recipes.delete_many({"_id": {"$in": delete}})

### Marking Read
Depending on certain issues, some documents don't get marked as read. If they are in recipes, they should be updated; this quick fix cleans that up.

In [66]:
# Mark everything as read
# Only the url of each recipe is needed, so project it rather than
# transferring every full recipe document.
read_urls = [doc["url"] for doc in recipes.find({}, {"_id": 0, "url": 1})]
for url in tqdm(read_urls):
    # A URL that has a stored recipe is read and is no longer an error.
    urls.update_one({"_id": url}, {"$set": {"read": True, "error": False}})

HBox(children=(FloatProgress(value=0.0, max=29245.0), HTML(value='')))


