In [96]:
import pandas as pd
import pymongo
import json
import numpy as np
import extruct
import requests
import re
from w3lib.html import get_base_url
from bs4 import BeautifulSoup
import random
import time

In [79]:
# Database setup: one local MongoDB instance holds both collections.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("food_analysis")
# Formatted recipe documents.
recipes = db.get_collection("recipes")
# Per-source crawl bookkeeping.
source_ref = db.get_collection("source_ref")

In [134]:
# Helper functions
def split_time(t):
    """
    Converts an ISO 8601 duration string (e.g. "PT1H30M") into whole minutes.

    Week, day, hour, minute and second components are all optional, so values
    such as "PT1H", "PT45M", "P1DT2H" or "PT1H30M0S" are accepted. Seconds are
    truncated to whole minutes. Matching ignores case and surrounding
    whitespace.

    Raises ValueError when t is not a duration or carries no component at all.
    """
    match = re.fullmatch(
        r"P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:\.\d+)?S)?)?",
        str(t).strip().upper(),
    )
    # A bare "P" / "PT" matches the pattern but describes no time at all.
    if match is None or not any(match.groups()):
        raise ValueError(f"Unrecognised ISO 8601 duration: {t!r}")

    weeks, days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return ((weeks * 7 + days) * 24 + hours) * 60 + minutes + seconds // 60

def return_as_list(x):
    """
    Returns x unchanged when it is exactly a list, otherwise wraps it in a
    new single-element list.
    """
    return x if type(x) is list else [x]
def wait():
    """
    Sleeps for a random whole number of seconds from 2 to 7, printing the
    chosen delay on a line that the next print overwrites.
    """
    delay = random.randrange(2, 8)
    print(f"Waiting {delay}", end='\r')
    time.sleep(delay)
    
def clean_url(x):
    """
    Returns x with everything from the first "#" onwards removed, and then
    everything from the first "?" onwards removed.
    """
    without_fragment = x.partition("#")[0]
    return without_fragment.partition("?")[0]

In [142]:
class RecipeCollector:
    """
    Crawls one website and stores every schema.org Recipe found on it.

    Crawl state (the links seen so far and the pages already scraped for a
    recipe) is held in memory and mirrored into the `sdb` collection, in the
    document whose "source" field equals `source`. Formatted recipes are
    inserted into the `rdb` collection.
    """

    # Seconds to wait for a server response before a page is given up on.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, domain, source, rdb, sdb):
        """
        url:    start URL of the site (kept as `base_url`).
        domain: text a link must contain to be followed.
        source: label written on every recipe and used to key the crawl state.
        rdb:    collection receiving recipe documents (needs `insert_one`).
        sdb:    collection holding crawl state (needs `find_one`, `update_one`).
        """
        # User Params
        self.base_url = url
        self.domain = domain
        self.source = source
        self.rdb = rdb
        self.sdb = sdb

        # Scraping Defaults
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}

        # Dictionaries for checking links: resume from the stored state when
        # this source has been crawled before, otherwise start empty.
        stored = sdb.find_one({"source": source})
        if stored is not None:
            self.link_library = stored.get("link_library") or {}
            self.scraped_recipes = stored.get("scraped_recipes") or []
        else:
            self.link_library = {}
            self.scraped_recipes = []

    # Helper Functions
    def _save_state(self, field, value):
        """
        Writes one crawl-state field to this source's document in `sdb`.
        """
        # upsert=True creates the document on the first write; a plain update
        # matches nothing for a new source and would silently persist nothing.
        # NOTE(review): link_library is keyed by URL, so its keys contain ".";
        # confirm the MongoDB server in use accepts dotted field names.
        self.sdb.update_one({"source": self.source}, {"$set": {field: value}}, upsert=True)

    def _scrape_page(self, page):
        """
        Fetches one page, records it as read, scrapes any recipe on it and
        returns the cleaned absolute links found on the page.

        Returns an empty list when the page cannot be fetched.
        """
        wait()
        try:
            r = requests.get(page, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping {page}: {err}")
            return []
        soup = BeautifulSoup(r.content, "html.parser")
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)

        # Mark as being read
        print(f"Scraping {page}")
        self.link_library[page] = 1
        self._save_state("link_library", self.link_library)

        # Recipe
        self.scrape_recipe(data, page)
        print(f"Recipes Found: {len(self.scraped_recipes)}")

        # Look for all links on page (both http and https)
        anchors = soup.find_all('a', attrs={'href': re.compile("^https?://")})
        return [clean_url(anchor.get('href')) for anchor in anchors]

    def recursive_page_scrape(self, page):
        """
        Crawls across website and scrapes recipes as discovered.

        `page` is always scraped. Links found on a page are followed depth
        first, in page order, when they contain `self.domain` and are not yet
        in `self.link_library`.
        """
        # An explicit stack of per-page link iterators gives the same visiting
        # order as recursion without hitting Python's recursion limit.
        frames = [iter(self._scrape_page(page))]
        while frames:
            link = next(frames[-1], None)
            if link is None:
                frames.pop()
                continue
            if self.domain not in link or link in self.link_library:
                continue
            # 0 = discovered, 1 = read (set by _scrape_page).
            self.link_library[link] = 0
            frames.append(iter(self._scrape_page(link)))

    def scrape_recipe(self, link_data, page):
        """
        Checks if given link_data contains a recipe and scrapes if it does.

        Returns True when a recipe was stored for `page`, False when the page
        was already scraped or holds no recipe.
        """
        if page in self.scraped_recipes:
            return False

        recipe = {}
        self.recursive_recipe_lookup(link_data, recipe)
        if len(recipe) == 0:
            return False

        self.add_scraped_recipe(recipe["recipe_data"])
        self.scraped_recipes.append(page)
        self._save_state("scraped_recipes", self.scraped_recipes)
        return True

    def recursive_recipe_lookup(self, data, recipe):
        """
        Walks nested dicts/lists in `data` and stores the dict whose "@type"
        is (or includes) "Recipe" in recipe["recipe_data"].

        When several recipe dicts exist, the last one visited wins.
        """
        if not isinstance(data, dict):
            return

        kind = data.get("@type")
        if kind == "Recipe" or (isinstance(kind, (list, tuple)) and "Recipe" in kind):
            recipe["recipe_data"] = data

        for value in data.values():
            if isinstance(value, dict):
                self.recursive_recipe_lookup(value, recipe)
            elif isinstance(value, list):
                # Only dict members can describe a recipe; skip scalars.
                for item in value:
                    if isinstance(item, dict):
                        self.recursive_recipe_lookup(item, recipe)

    @staticmethod
    def _duration_minutes(recipe_data, key):
        """
        Returns the duration stored under `key` in minutes, or None when the
        key is missing, empty or not parseable.
        """
        raw = recipe_data.get(key)
        if not raw:
            return None
        try:
            return split_time(raw)
        except (ValueError, IndexError, TypeError):
            # A malformed time should cost one field, not the whole recipe.
            return None

    @staticmethod
    def _as_list_or_none(recipe_data, key):
        """
        Returns the value under `key` as a list, or None when the key is absent.
        """
        return return_as_list(recipe_data[key]) if key in recipe_data else None

    @staticmethod
    def _author_name(author):
        """
        Returns the author's name from a dict, a list of authors (first one)
        or a plain string.
        """
        if isinstance(author, list):
            author = author[0] if author else None
        if isinstance(author, dict):
            return author.get("name")
        return author

    @staticmethod
    def _instruction_texts(instructions):
        """
        Flattens a list of instruction steps into a list of their texts.
        Anything that is not a list is returned unchanged.
        """
        if not isinstance(instructions, list):
            return instructions

        texts = []
        for step in instructions:
            if isinstance(step, str):
                texts.append(step)
            elif isinstance(step, dict):
                if "text" in step:
                    texts.append(step["text"])
                elif "itemListElement" in step:
                    # A section groups its own steps under itemListElement.
                    nested = step["itemListElement"]
                    if not isinstance(nested, list):
                        nested = [nested]
                    texts.extend(RecipeCollector._instruction_texts(nested))
        return texts

    def add_scraped_recipe(self, recipe_data):
        """
        Takes a schema.org scraped recipe, formats it and adds to database.

        Fields missing from recipe_data are stored as None.
        """
        row = {}

        # Source
        row["source"] = self.source

        # Title
        row['title'] = recipe_data.get("name")

        # Description
        row['description'] = recipe_data.get("description")

        # Author
        row["author"] = self._author_name(recipe_data.get("author"))

        # Ingredients
        row["ingredients"] = recipe_data.get("recipeIngredient")

        # url
        row["url"] = recipe_data.get("url")

        # Times (minutes)
        row["prepTime"] = self._duration_minutes(recipe_data, "prepTime")
        row["cookTime"] = self._duration_minutes(recipe_data, "cookTime")
        row["totalTime"] = self._duration_minutes(recipe_data, "totalTime")

        # Date Published (left in format)
        row["datePublished"] = recipe_data.get("datePublished")

        # Yields (first entry only)
        yields = self._as_list_or_none(recipe_data, "recipeYield")
        row["recipeYield"] = yields[0] if yields else None

        # Category
        row["recipeCategory"] = self._as_list_or_none(recipe_data, "recipeCategory")

        # Cooking Method
        row["cookingMethod"] = self._as_list_or_none(recipe_data, "cookingMethod")

        # Cuisine
        row["recipeCuisine"] = self._as_list_or_none(recipe_data, "recipeCuisine")

        # Ratings
        rating = recipe_data.get("aggregateRating")
        if isinstance(rating, dict):
            row["rating"] = rating.get("ratingValue")
            # Some sites publish ratingCount instead of reviewCount.
            row["review_count"] = rating.get("reviewCount", rating.get("ratingCount"))
        else:
            row["rating"] = None
            row["review_count"] = None

        # Reviews (unstructured)
        row["reviews"] = recipe_data.get("review")

        # Instructions
        row["instructions"] = self._instruction_texts(recipe_data.get("recipeInstructions"))

        # Keywords
        keywords = recipe_data.get("keywords")
        if keywords is None:
            row["keywords"] = None
        elif isinstance(keywords, str):
            row["keywords"] = keywords.split(",")
        elif isinstance(keywords, (list, tuple)):
            row["keywords"] = list(keywords)
        else:
            row["keywords"] = [keywords]

        # Write into database
        self.rdb.insert_one(row)

In [143]:
# Crawl formulatedflavors.com. Recipes go to the `recipes` collection and crawl
# state to `source_ref`, both created in the database setup cell. The cell
# previously passed `test`, which no visible cell defines, so it raised a
# NameError on a fresh kernel.
crawler = RecipeCollector("http://www.formulatedflavors.com/",
                          "formulatedflavors.com",
                          "formulated_flavors",
                          recipes,
                          source_ref)
# Start from the URL the collector was configured with instead of repeating it.
crawler.recursive_page_scrape(crawler.base_url)

Scraping http://www.formulatedflavors.com/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/about-2/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/recipes/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/contact-2/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/sample-page/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/wp-admin/
Recipes Found: 0
Scraping http://www.formulatedflavors.com/wp-login.php
Recipes Found: 0
Scraping http://www.formulatedflavors.com/2020/07/27/bring-me-more-banh-mi-instant-pot/
Recipes Found: 1
Scraping http://www.formulatedflavors.com/wprm_print/recipe/182
Recipes Found: 1
Scraping http://www.formulatedflavors.com/author/formulatedflavors_vmnpdl/
Recipes Found: 1
Scraping http://www.formulatedflavors.com/category/main-course/
Recipes Found: 1
Scraping http://www.formulatedflavors.com/category/main-course/vietnamese/
Recipes Found: 1
Scraping http://www.formulatedflavors.com/2020/07/22/ultimatechocolate