In [1]:
# External Libraries
import pandas as pd
from recipe_scrapers import scrape_me
import time
import random
import json
import pymongo
from tqdm.notebook import tqdm
import numpy as np

In [2]:
# Scripts
from scripts import scrapers, parsers, db_funcs

In [3]:
# Db Information
# Unpack the two handles returned by db_funcs.get_scraper_dbs(); later cells
# pass `urls` and `recipes` to the scraper and maintenance helpers.
# NOTE(review): `source_ref`, which every RecipeCollector cell passes, is not
# assigned in any cell visible in this notebook -- confirm where it is defined,
# otherwise a fresh-kernel "Run All" would stop with a NameError.
urls, recipes = db_funcs.get_scraper_dbs()

# Scraper
Take the URL database and retrieve the actual recipe contents, storing them in a separate database dedicated to recipes. Because this database will form the basis for an actual app and for analysis, the unique key id is generated by Mongo, and the recipe database is kept closely in step with the URL database to ensure that recipes which have already been read are not read again.

## Retriever Workflow

In [6]:
# Hand both database handles to scrapers.scrape_unread_urls.
# The saved output of the last run was "No URL's to Read.".
# presumably this fetches URLs not yet marked as read and stores the parsed
# recipes -- verify against scripts/scrapers.
scrapers.scrape_unread_urls(urls, recipes)

No URL's to Read.


## Collector Workflow
Below is a set of crawlers dedicated to different recipe sources.

In [120]:
# Sally's Baking Addiction
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://sallysbakingaddiction.com/"

# NOTE(review): `source_ref` is not assigned in any visible cell -- confirm it
# is defined before this cell runs.
crawler = scrapers.RecipeCollector(
    seed_url,
    "sallysbakingaddiction.com",
    "sallys_baking_addiction",
    recipes,
    source_ref,
    utm_pages=True,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 1620 | Scraping https://sallysbakingaddiction.com/

In [5]:
# Real Simple Good
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://realsimplegood.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "realsimplegood.com",
    "real_simple_good",
    recipes,
    source_ref,
    utm_pages=True,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 542 | Scraping https://realsimplegood.com/wprm_print/recipe/27858                                                                                                                        

In [19]:
# Pinch of Yum
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://pinchofyum.com"

crawler = scrapers.RecipeCollector(
    seed_url,
    "pinchofyum.com",
    "pinch_of_yum",
    recipes,
    source_ref,
    utm_pages=True,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 8238 | Scraping https://pinchofyum.com                                                                                                                                                   

In [6]:
# Two Peas and Their Pod
# NOTE(review): the first three collector cells pass a bare host name as the
# second positional argument, while this one passes a full URL -- confirm that
# RecipeCollector treats both forms the same way.
crawler = scrapers.RecipeCollector(
    "https://www.twopeasandtheirpod.com",
    "https://www.twopeasandtheirpod.com",
    "two_peas_and_their_pod",
    recipes,
    source_ref,
    utm_pages=True,
)

# Listing pages are addressed through the `fwp_paged` query parameter.
# range(2, 147) yields pages 2 through 146; page 1 is not requested here
# (presumably on purpose -- verify).
listing_prefix = "https://www.twopeasandtheirpod.com/recipes/?fwp_paged="
pages = [listing_prefix + str(page_no) for page_no in range(2, 147)]

# Each listing page is crawled in turn; a rerun resumes more or less where
# the previous crawl stopped.
for page in pages:
    crawler.recursive_page_scrape(page)

Recipes Found: 1723 | Scraping https://www.twopeasandtheirpod.com/recipes/?fwp_paged=146                                                                                                                

In [5]:
# Fox and Briar
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://www.foxandbriar.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://www.foxandbriar.com/",
    "fox_and_briar",
    recipes,
    source_ref,
    utm_pages=True,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 346 | Scraping https://www.foxandbriar.com/page/46/                                                                                                                                      

In [6]:
# Salt and Baker
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://saltandbaker.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://saltandbaker.com/",
    "salt_and_baker",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 301 | Scraping https://saltandbaker.com/page/29/                                                                                                                                         

In [7]:
# The Clean Eating Couple
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://thecleaneatingcouple.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://thecleaneatingcouple.com/",
    "clean_eating_couple",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 271 | Scraping https://thecleaneatingcouple.com/category/recipes/breakfast/page/13/                                                                                                      

In [9]:
# Dinner then Dessert
# Unlike most collector cells, the crawl starts from the recipe index page
# rather than from the URL handed to the collector.
seed_url = "https://dinnerthendessert.com/recipe-index"

crawler = scrapers.RecipeCollector(
    "https://dinnerthendessert.com/",
    "https://dinnerthendessert.com/",
    "dinner_then_dessert",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 1468 | Scraping https://dinnerthendessert.com/recent-recipes/page/2                                                                                                                      

In [9]:
# The Salty Marshmallow
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://thesaltymarshmallow.com/"

# NOTE(review): the third argument is spelled "marshmellow"; it is a runtime
# value and is deliberately left as-is here.
crawler = scrapers.RecipeCollector(
    seed_url,
    "https://thesaltymarshmallow.com/",
    "the_salty_marshmellow",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 499 | Scraping https://thesaltymarshmallow.com/category/dinner/page/20/                                                                                                                  

In [10]:
# A Farm Girl's Dabbles
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://www.afarmgirlsdabbles.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://www.afarmgirlsdabbles.com/",
    "a_farm_girl_dabbles",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
# NOTE(review): the saved output shows the crawl visiting .pdf download links
# and emitting decode warnings -- consider whether such URLs should be skipped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 145 | Scraping https://www.afarmgirlsdabbles.com/downloads/MerryChristmasTags.pdf                                                                                                        

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Recipes Found: 268 | Scraping https://www.afarmgirlsdabbles.com/downloads/FourthOfJulyTags.pdf                                                                                                          

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Recipes Found: 407 | Scraping https://www.afarmgirlsdabbles.com/tag/zucchini/                                                                                                                           

In [42]:
# Cakes Cottage
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://cakescottage.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://cakescottage.com/",
    "cakes_cottage",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 17 | Scraping https://cakescottage.com/                                                                                                                                                  

In [48]:
# Your Cup of Cake
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://www.yourcupofcake.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://www.yourcupofcake.com/",
    "your_cup_of_cake",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 874 | Scraping https://www.yourcupofcake.com/category/yeast-breads                                                                                                                       entines-dayhttps:/www.yourcupofcake.com/category/holiday/valentines-dayhttps:/www.yourcupofcake.com/category/holiday/valentines-day/page/3/

In [55]:
# How Sweet Eats
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://www.howsweeteats.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://www.howsweeteats.com/",
    "how_sweet_eats",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 535 | Scraping https://www.howsweeteats.com/                                                                                                                                             

In [None]:
# Baker by Nature
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://bakerbynature.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://bakerbynature.com/",
    "baker_by_nature",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 278 | Scraping https://bakerbynature.com/nutella-cinnamon-sugar-doughnuts/                                                                                                               

In [None]:
# Feels Like Home Blog
# The crawl is seeded from the home-cooking category page, which is also the
# first argument handed to the collector.
seed_url = "https://feelslikehomeblog.com/category/home-cooking"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://feelslikehomeblog.com/",
    "feels_like_home_blog",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
# NOTE(review): the saved output under this cell shows a foxandbriar.com URL,
# so it appears to be stale or from a different run -- rerun to confirm.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 19 | Scraping https://www.foxandbriar.com/slow-cooker-beef-burrito-bowls/                                                                                                                

In [8]:
# Carlsbad Cravings
# The same URL is given to the collector and used as the crawl entry point.
seed_url = "https://carlsbadcravings.com/"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://carlsbadcravings.com/",
    "carlsbad_cravings",
    recipes,
    source_ref,
    utm_pages=False,
)

# A rerun resumes more or less where the previous crawl stopped.
# NOTE(review): the saved run of this cell ended with an uncaught
# JSONDecodeError after 222 recipes -- the failing page likely needs handling
# inside the scraper; confirm before relying on this source being complete.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 222 | Scraping https://carlsbadcravings.com/slow-cooker-salsa-verde-honey-lime-chicken-recipe/                                                                                           

JSONDecodeError: Expecting ',' delimiter: line 1 column 2109 (char 2108)

In [None]:
# The Kitchn
# The crawl is seeded from the sitemap page, which is also the first argument
# handed to the collector.
seed_url = "https://www.thekitchn.com/sitemap"

crawler = scrapers.RecipeCollector(
    seed_url,
    "https://www.thekitchn.com",
    "the_kitchn",
    recipes,
    source_ref,
    utm_pages=True,
)

# A rerun resumes more or less where the previous crawl stopped.
# NOTE(review): the saved output under this cell shows a foxandbriar.com URL,
# so it appears to be stale or from a different run -- rerun to confirm.
crawler.recursive_page_scrape(seed_url)

Recipes Found: 19 | Scraping https://www.foxandbriar.com/slow-cooker-beef-burrito-bowls/                                                                                                                

# Database Cleaning and Maintenance

In [5]:
# Print Overview
# Calls db_funcs.print_db_metadata with both handles. The saved output lists
# URL-database totals (overall, read, to read, errors) followed by the size of
# the recipes database.
db_funcs.print_db_metadata(urls, recipes)

URL Database: 226844
Read: 226825
To Read: 19
Errors: 105
Recipes Database: 235817


### Duplicate Cleaning
At this stage any remaining duplicates should be true duplicates. If a URL is found more than once, only the first occurrence is kept; which one comes first is arbitrary.

In [7]:
# Run the duplicate clean-up helper on the recipes handle; no output was saved
# for this cell.
# presumably the helper keeps the first document per URL, as the section text
# describes -- verify in scripts/db_funcs.
db_funcs.recipes_clean_duplicates(recipes)

### Marking Read
Depending on certain issues, some documents don't get marked as read. If they are already in the recipes database, their URL entries should be updated; this quick fix cleans that up.

In [66]:
# Mark everything as read
# The saved output is a progress-bar widget with a maximum of 29245.
# NOTE(review): this helper is called without arguments, unlike the other
# db_funcs helpers in this notebook, which receive `urls` and/or `recipes` --
# presumably it opens its own database handles; confirm.
db_funcs.clean_read_recipes_urls()

HBox(children=(FloatProgress(value=0.0, max=29245.0), HTML(value='')))


