In [1]:
import pandas as pd
import pymongo
import json
import numpy as np
import extruct
import requests
import re
from w3lib.html import get_base_url
from bs4 import BeautifulSoup
import random
import time
from tqdm.notebook import tqdm

In [2]:
# Database setup
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['food_analysis']
recipes = db['recipes']
source_ref = db['source_ref']

In [3]:
# Helper functions
def split_time(t):
    clean_t = t[2:]
    if 'H' in clean_t:
        split = clean_t.split("H")
        hours = split[0]
        minutes = split[1][:-1]
        return (60*int(hours)) + int(minutes)
    else:
        minutes = clean_t[:-1]
        return int(minutes)

def return_as_list(x):
    if type(x) == type(list()):
        return x
    else:
        return [x]
def wait():
    
    x = random.randrange(2, 4, 1)
    #print(f"Waiting {x}", end='\r')
    time.sleep(x)
    
def clean_url(x, utm_pages=False):
    
    clean = x
    if "#" in clean:
        clean = clean.split("#")[0]
    if not utm_pages:
        if "?" in clean:
            clean = clean.split("?")[0]
            return clean
    return clean

def check_link(link, domain):
    
    stopwords = [".jpg", ".png", "wprm_print", "wprm-print", "wp-content", "comment-page"]
    
    if not link.startswith(domain):
        return False
    for word in stopwords:
        if word in link:
            return False
    return True

In [4]:
class RecipeCollector:
    
    def __init__(self, url, domain, source, rdb, sdb, utm_pages=False):
        
        # User Params
        self.base_url = url
        self.domain = domain
        self.source = source
        self.rdb = rdb
        self.sdb = sdb
        self.utm_pages = utm_pages
        
        # Scraping Defaults
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
        
        # Dictionaries for checking links
        if len(list(sdb.find({"_id":source}))) > 0:
            src_ref = list(sdb.find({"_id":source}))[0]
            self.link_library =  src_ref["link_library"]
            self.scraped_recipes = src_ref["scraped_recipes"]
        else:
            self.link_library = {}
            self.scraped_recipes = []
            self.sdb.insert_one({"_id":source,
                                 "link_library":self.link_library,
                                 "scraped_recipes":self.scraped_recipes})
            
        
    # Helper Functions
    def recursive_page_scrape(self, page):
        """
        Crawls across website and scrapes recipes as discovered.
        """
        # Mark as being read
        print_message = f"Recipes Found: {len(self.scraped_recipes)} | Scraping {page}"
        print(print_message.ljust(200, " "), end="\r", flush=True)
        self.link_library[page] = 1
        self.sdb.update_one({"_id":self.source}, {"$set":{"link_library":self.link_library}})
        
        # Scrape it, if it errors out it won't be tried again
        r = requests.get(page, headers=self.headers)
        soup = BeautifulSoup(r.content, "html.parser")
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)
        
        # Recipe
        self.scrape_recipe(data, page)
        
        # Look for all links on page
        for link_string in soup.findAll('a', attrs={'href': re.compile("^https://")}):
            link = clean_url(link_string.get('href'), self.utm_pages)
            if check_link(link, self.domain):
                if link not in self.link_library.keys():
                    wait()
                    self.recursive_page_scrape(link)
    
    def scrape_recipe(self, link_data, page):
        """
        Checks if given link_data contains a recipe and scrapes if it does.
        """
        if page not in self.scraped_recipes:
            recipe = {}
            self.recursive_recipe_lookup(link_data, recipe)

            if len(recipe) == 0:
                return False
            else:
                self.add_scraped_recipe(recipe["recipe_data"])
                self.scraped_recipes.append(page)
                self.sdb.update_one({"_id":self.source}, {"$set":{"scraped_recipes":self.scraped_recipes}})
                return True
        else:
            return False
    
    def recursive_recipe_lookup(self, data, recipe):
        for key,value in data.items():
            if key == "@type":
                if value == "Recipe":
                    recipe["recipe_data"] = data
            if type(value) == type(dict()):
                self.recursive_recipe_lookup(value, recipe)
            elif type(value) == type(list()):
                for val in value:
                    if type(val) == type(str()):
                        pass
                    elif type(val) == type(list()):
                        pass
                    elif type(val) == type(tuple()):
                        pass
                    else:
                        self.recursive_recipe_lookup(val, recipe)
    
    def add_scraped_recipe(self, recipe_data):
        """
        Takes a schema.org scraped recipe, formats it and adds to database.
        """
        row = {}
        
        # Source
        row["source"] = self.source
        
        # Title
        row['title'] = recipe_data["name"] if "name" in recipe_data.keys() else None
        
        # Description
        row['description'] = recipe_data["description"] if "description" in recipe_data.keys() else None
        
        # Author
        row["author"] = recipe_data["author"]["name"] if "author" in recipe_data.keys() else None
        
        # Ingredients
        row["ingredients"] = recipe_data["recipeIngredient"] if "recipeIngredient" in recipe_data.keys() else None
        
        # url
        row["url"] = recipe_data["url"] if "url" in recipe_data.keys() else None
        
        # Times
        row["prepTime"] = recipe_data["prepTime"] if "prepTime" in recipe_data.keys() else None
        row["cookTime"] = recipe_data["cookTime"] if "cookTime" in recipe_data.keys() else None
        row["totalTime"] = recipe_data["totalTime"] if "totalTime" in recipe_data.keys() else None
        
        # Date Published (left in format)
        row["datePublished"] = recipe_data["datePublished"] if "datePublished" in recipe_data.keys() else None
        
        # Yields
        row["recipeYield"] = return_as_list(recipe_data["recipeYield"])[0] if "recipeYield" in recipe_data.keys() else None

        # Category
        row["recipeCategory"] = return_as_list(recipe_data["recipeCategory"]) if "recipeCategory" in recipe_data.keys() else None
        
        # Cooking Method
        row["cookingMethod"] = return_as_list(recipe_data["cookingMethod"]) if "cookingMethod" in recipe_data.keys() else None

        # Cuisine
        row["recipeCuisine"] = return_as_list(recipe_data["recipeCuisine"]) if "recipeCuisine" in recipe_data.keys() else None
      
        # Ratings
        if 'aggregateRating' in recipe_data.keys():
            row["rating"] =  recipe_data["aggregateRating"]["ratingValue"] if "ratingValue" in recipe_data["aggregateRating"].keys() else None
            row["review_count"] = recipe_data["aggregateRating"]["reviewCount"] if "reviewCount" in recipe_data["aggregateRating"].keys() else None
        else:
            row["rating"] =  None
            row["review_count"] = None
            
        # Reviews (unstructured)
        row["reviews"] =recipe_data["review"] if "review" in recipe_data.keys() else None
        
        # Instructions
        if "recipeInstructions" in recipe_data.keys():
            try:
                row["instructions"] = [x['text'] for x in recipe_data["recipeInstructions"]]
            except:
                row["instructions"] = recipe_data["recipeInstructions"]
        else:
            row["instructions"] = None
            
        # Keywords
        if "keywords" in recipe_data.keys():
            row["keywords"] = [x for x in recipe_data["keywords"].split(",")]
        else:
            row["keywords"] = None
            
        # Write into database
        self.rdb.insert_one(row)

# The Crawlers

In [120]:
# Sallys Baking Addiction
crawler = RecipeCollector("https://sallysbakingaddiction.com/", 
                          "sallysbakingaddiction.com", 
                          "sallys_baking_addiction", 
                          recipes, 
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://sallysbakingaddiction.com/")

Recipes Found: 1620 | Scraping https://sallysbakingaddiction.com/

In [5]:
# Real Simple
crawler = RecipeCollector("https://realsimplegood.com/",
                          "realsimplegood.com",
                          "real_simple_good",
                          recipes,
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://realsimplegood.com/")

Recipes Found: 542 | Scraping https://realsimplegood.com/wprm_print/recipe/27858                                                                                                                        

In [19]:
# Pinch of Yum
crawler = RecipeCollector("https://pinchofyum.com",
                          "pinchofyum.com",
                          "pinch_of_yum",
                          recipes,
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://pinchofyum.com")

Recipes Found: 8238 | Scraping https://pinchofyum.com                                                                                                                                                   

In [6]:
# Two Peas and their pod
crawler = RecipeCollector("https://www.twopeasandtheirpod.com",
                          "https://www.twopeasandtheirpod.com",
                          "two_peas_and_their_pod",
                          recipes,
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
pages = ["https://www.twopeasandtheirpod.com/recipes/?fwp_paged="+str(i) for i in range(2, 147)]
for page in pages:
   # Crawling will resume more or less where it left off
    crawler.recursive_page_scrape(page) 

Recipes Found: 1723 | Scraping https://www.twopeasandtheirpod.com/recipes/?fwp_paged=146                                                                                                                

In [5]:
# Fox and Briar
crawler = RecipeCollector("https://www.foxandbriar.com/",
                          "https://www.foxandbriar.com/",
                          "fox_and_briar",
                          recipes,
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://www.foxandbriar.com/")

Recipes Found: 346 | Scraping https://www.foxandbriar.com/page/46/                                                                                                                                      

In [6]:
# Salt and Baker
crawler = RecipeCollector("https://saltandbaker.com/",
                          "https://saltandbaker.com/",
                          "salt_and_baker",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://saltandbaker.com/")

Recipes Found: 301 | Scraping https://saltandbaker.com/page/29/                                                                                                                                         

In [7]:
# Clean Eating Couple
crawler = RecipeCollector("https://thecleaneatingcouple.com/",
                          "https://thecleaneatingcouple.com/",
                          "clean_eating_couple",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://thecleaneatingcouple.com/")

Recipes Found: 271 | Scraping https://thecleaneatingcouple.com/category/recipes/breakfast/page/13/                                                                                                      

In [9]:
# Dinner then Dessert
crawler = RecipeCollector("https://dinnerthendessert.com/",
                          "https://dinnerthendessert.com/",
                          "dinner_then_dessert",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://dinnerthendessert.com/recipe-index")

Recipes Found: 1468 | Scraping https://dinnerthendessert.com/recent-recipes/page/2                                                                                                                      

In [9]:
# Salty Marshmellow
crawler = RecipeCollector("https://thesaltymarshmallow.com/",
                          "https://thesaltymarshmallow.com/",
                          "the_salty_marshmellow",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://thesaltymarshmallow.com/")

Recipes Found: 499 | Scraping https://thesaltymarshmallow.com/category/dinner/page/20/                                                                                                                  

In [10]:
# A Farm Girl Dabbles
crawler = RecipeCollector("https://www.afarmgirlsdabbles.com/",
                          "https://www.afarmgirlsdabbles.com/",
                          "a_farm_girl_dabbles",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://www.afarmgirlsdabbles.com/")

Recipes Found: 145 | Scraping https://www.afarmgirlsdabbles.com/downloads/MerryChristmasTags.pdf                                                                                                        

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Recipes Found: 268 | Scraping https://www.afarmgirlsdabbles.com/downloads/FourthOfJulyTags.pdf                                                                                                          

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Recipes Found: 407 | Scraping https://www.afarmgirlsdabbles.com/tag/zucchini/                                                                                                                           

In [42]:
# Cakes Cottage
crawler = RecipeCollector("https://cakescottage.com/",
                          "https://cakescottage.com/",
                          "cakes_cottage",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://cakescottage.com/")

Recipes Found: 17 | Scraping https://cakescottage.com/                                                                                                                                                  

In [48]:
# Your Cup of Cake
crawler = RecipeCollector("https://www.yourcupofcake.com/",
                          "https://www.yourcupofcake.com/",
                          "your_cup_of_cake",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://www.yourcupofcake.com/")

Recipes Found: 874 | Scraping https://www.yourcupofcake.com/category/yeast-breads                                                                                                                       entines-dayhttps:/www.yourcupofcake.com/category/holiday/valentines-dayhttps:/www.yourcupofcake.com/category/holiday/valentines-day/page/3/

In [55]:
# How Sweet Eats
crawler = RecipeCollector("https://www.howsweeteats.com/",
                          "https://www.howsweeteats.com/",
                          "how_sweet_eats",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://www.howsweeteats.com/")

Recipes Found: 535 | Scraping https://www.howsweeteats.com/                                                                                                                                             

In [None]:
# Baker by Nature
crawler = RecipeCollector("https://bakerbynature.com/",
                          "https://bakerbynature.com/",
                          "baker_by_nature",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://bakerbynature.com/")

Recipes Found: 278 | Scraping https://bakerbynature.com/nutella-cinnamon-sugar-doughnuts/                                                                                                               

In [None]:
# Feels Like Home Blog
crawler = RecipeCollector("https://feelslikehomeblog.com/category/home-cooking",
                          "https://feelslikehomeblog.com/",
                          "feels_like_home_blog",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://feelslikehomeblog.com/category/home-cooking")

Recipes Found: 19 | Scraping https://www.foxandbriar.com/slow-cooker-beef-burrito-bowls/                                                                                                                

In [8]:
# Carlsbad Cravings
crawler = RecipeCollector("https://carlsbadcravings.com/",
                          "https://carlsbadcravings.com/",
                          "carlsbad_cravings",
                          recipes,
                          source_ref,
                          utm_pages=False)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://carlsbadcravings.com/")

Recipes Found: 222 | Scraping https://carlsbadcravings.com/slow-cooker-salsa-verde-honey-lime-chicken-recipe/                                                                                           

JSONDecodeError: Expecting ',' delimiter: line 1 column 2109 (char 2108)

In [None]:
# Kitchn
crawler = RecipeCollector("https://www.thekitchn.com/sitemap",
                          "https://www.thekitchn.com",
                          "the_kitchn",
                          recipes,
                          source_ref,
                          utm_pages=True)

# Crawling will resume more or less where it left off
crawler.recursive_page_scrape("https://www.thekitchn.com/sitemap")

Recipes Found: 19 | Scraping https://www.foxandbriar.com/slow-cooker-beef-burrito-bowls/                                                                                                                

# DB Cleaning

In [49]:
# Setup
source = "your_cup_of_cake"
unique_cols = ["title", "datePublished"]
select_cols = [x for x in unique_cols]
select_cols.append("_id")

In [50]:
# Determine Dups
df = pd.DataFrame(list(recipes.find({"source":source})))
print(f"Recipes Found: {len(df)}")

df = df[select_cols].groupby(unique_cols, as_index=False)
df = df["_id"].apply(list).reset_index(name="ids")

Recipes Found: 874


In [51]:
# Remove the duplicates
for index in tqdm(df.index):
    if len(df.loc[index, 'ids']) > 1:
        delete = [x for x in df.loc[index, 'ids'][1:]]
        for did in delete:
            recipes.delete_one({"_id":did})

HBox(children=(FloatProgress(value=0.0, max=567.0), HTML(value='')))




In [52]:
print(f"Recipes After de-dup: {len(list(recipes.find({'source':source})))}")

Recipes After de-dup: 567


In [53]:
# by Title Alone
unique_cols = ["title"]
select_cols = [x for x in unique_cols]
select_cols.append("_id")
# Determine Dups
df = pd.DataFrame(list(recipes.find({"source":source})))
print(f"Recipes Found: {len(df)}")

df = df[select_cols].groupby(unique_cols, as_index=False)
sum(df.count()["_id"])

Recipes Found: 567


567