In [1]:
import pandas as pd
from recipe_scrapers import scrape_me
import time
import random
import json
import pymongo

In [2]:
def wait():
    
    x = random.randrange(2, 8, 1)
    print(f"Waiting {x}", end='\r')
    time.sleep(x)

In [3]:
# Db Information
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['food_analysis']
urls = db['urls']

In [4]:
recipes = db['recipes']

# Ongoing Sources
Sites with massive databases too large to scrape in one or two sessions.

### Food.com

In [None]:
# Build Topic List
try:
    topic_file = open('./scraped_urls/food_com_topics.json')
    topics = json.load(topic_file)
except:
    topics = {}
    alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    for letter in alphabet:
        path = 'https://www.food.com/topic/'+letter
        topics[letter] = {}
        wait()

        for link in scrape_me(path).links():
            if 'topic/' in link['href']:
                topics[letter][link['href']] = 1

    with open('./scraped_urls/food_com_topics.json', 'w') as json_file:
        json.dump(topics, json_file)

In [None]:
# Easy Page structure
num_pages = 12
for letter in topics:
    for topic in topics[letter]:
        
        c_name = topic.split('www.food.com/topic/')[1]
        if topics[letter][topic] == 'end':
            print(topic + " is Complete.")
            continue
        else:
            start = topics[letter][topic]
            stop = topics[letter][topic]+num_pages

            print("Scraping "+topic + " From " + str(start) + " to " + str(stop))
            for i in range(topics[letter][topic], topics[letter][topic]+num_pages):
                wait()
                page = scrape_me(topic+"?pn="+str(i)).links()
                
                if len(page) < 40:
                    topics[letter][topic] = 'end'
                    break
                else:
                    for link in page:
                        if 'recipe/' in link['href']:
                            recipe = link['href']
                            
                            # Insert checks
                            if recipe in recipes:
                                types = list(urls.find({"_id":recipe}, {"_id":0, "type":1}))[0]['type']
                                if c_name not in types:
                                    types.append(c_name)
                                    query = {"_id":recipe}
                                    newvalues = {"$set":{"type":types}}
                                    urls.update_one(query, newvalues)
                            
                            else:
                                urls.insert_one({'_id':recipe,
                                  'name':"-".join(recipe.split("/")[-1].split("-")[:-1]),
                                  'read':False,
                                  'type':[c_name],
                                  'source':'food_com',
                                  'website_id':recipe.split("/")[-1].split("-")[-1]})
                                recipes.append(recipe)
                    
                    # Save incrementally to avoid not counting progress
                    topics[letter][topic] += 1
                    with open('./scraped_urls/food_com_topics.json', 'w') as json_file:
                        json.dump(topics, json_file)
        print(f"Total Scraped at {len(recipes)}.")
        print(f"Total in Database at {len(list(urls.find({'source':'food_com'})))}.")

# One-off Sites
Sites with low amounts of recipes able to scrape in one or two sessions

### Epicurious

In [14]:
source = 'epicurious'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]

In [18]:
for i in range(2030, 2050):
    print(f"Page {i}/{2050}")
    wait()
    
    page = scrape_me('https://www.epicurious.com/search/?content=recipe&sort=mostReviewed&page='+str(i)).links()
    for link in page:
        if 'itemprop' in link.keys():
            recipe = "https://www.epicurious.com"+link['href']
            if recipe not in recipe_list:
                urls.insert_one({"_id":recipe,
                                 "name":link['title'],
                                 'read':False,
                                 'type':[],
                                 'source':source,
                                 'website_id':link['href'].split("/")[-1].split("-")[-1],
                                 'react_id':link['data-reactid']})
                recipe_list.append(recipe)
    print(f"Recipes Scraped: {len(recipe_list)}")

Page 2000/1800
Recipes Scraped: 36001
Page 2001/1800
Recipes Scraped: 36019
Page 2002/1800
Recipes Scraped: 36037
Page 2003/1800
Recipes Scraped: 36055
Page 2004/1800
Recipes Scraped: 36073
Page 2005/1800
Recipes Scraped: 36091
Page 2006/1800
Recipes Scraped: 36109
Page 2007/1800
Recipes Scraped: 36127
Page 2008/1800
Recipes Scraped: 36145
Page 2009/1800
Recipes Scraped: 36163
Page 2010/1800
Recipes Scraped: 36181
Page 2011/1800
Recipes Scraped: 36199
Page 2012/1800
Recipes Scraped: 36217
Page 2013/1800
Recipes Scraped: 36235
Page 2014/1800
Recipes Scraped: 36253
Page 2015/1800
Recipes Scraped: 36271
Page 2016/1800
Recipes Scraped: 36289
Page 2017/1800
Recipes Scraped: 36307
Page 2018/1800
Recipes Scraped: 36325
Page 2019/1800
Recipes Scraped: 36343
Page 2020/1800
Recipes Scraped: 36361
Page 2021/1800
Recipes Scraped: 36379
Page 2022/1800
Recipes Scraped: 36397
Page 2023/1800
Recipes Scraped: 36415
Page 2024/1800
Recipes Scraped: 36433
Page 2025/1800
Recipes Scraped: 36451
Page 2026/18

KeyboardInterrupt: 

### Host the Toast

In [21]:
source = 'host_the_toast'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]

In [22]:
page = scrape_me("https://hostthetoast.com/recipes/").links()
categories = []
for link in page:
    if "category" in link["href"]:
        categories.append(link['href'])

In [26]:
for c in categories:
    c_name = c.split("/category/")[1][:-1]
    for i in range(1, 40):
        page = scrape_me(c+"page/"+str(i)).links()
        
        if len(page) < 35:
            break
            
        wait()
        for link in page:
            if 'rel' in link.keys():
                if "bookmark" in link["rel"]:
                    recipe = link['href']
                    if recipe not in recipe_list:
                        urls.insert_one({"_id":recipe,
                                         "name":recipe.split("hostthetoast.com/")[1][:-1],
                                         'read':False,
                                         'type':[c_name],
                                         'source':source})
                        recipe_list.append(recipe)
        print(f"Recipes Scraped: {len(recipe_list)}")

Recipes Scraped: 16
Recipes Scraped: 32
Recipes Scraped: 48
Recipes Scraped: 64
Recipes Scraped: 80
Recipes Scraped: 96
Recipes Scraped: 112
Recipes Scraped: 128
Recipes Scraped: 138
Recipes Scraped: 149
Recipes Scraped: 158
Recipes Scraped: 161
Recipes Scraped: 173
Recipes Scraped: 179
Recipes Scraped: 189
Recipes Scraped: 192
Recipes Scraped: 201
Recipes Scraped: 210
Recipes Scraped: 226
Recipes Scraped: 241
Recipes Scraped: 254
Recipes Scraped: 263
Recipes Scraped: 264
Recipes Scraped: 280
Recipes Scraped: 283
Recipes Scraped: 290
Recipes Scraped: 296
Recipes Scraped: 298
Recipes Scraped: 305
Recipes Scraped: 306
Recipes Scraped: 309
Recipes Scraped: 317
Recipes Scraped: 325
Recipes Scraped: 334
Recipes Scraped: 341
Recipes Scraped: 348
Recipes Scraped: 357
Recipes Scraped: 367
Recipes Scraped: 368
Recipes Scraped: 373
Recipes Scraped: 377
Recipes Scraped: 385
Recipes Scraped: 391
Recipes Scraped: 399
Recipes Scraped: 407
Recipes Scraped: 412
Recipes Scraped: 420
Recipes Scraped: 43

### 101 Cookbooks

In [38]:
source = '101_cookbooks'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]
categories = ['https://www.101cookbooks.com/whole_grain_recipes',
 'https://www.101cookbooks.com/wfpb',
 'https://www.101cookbooks.com/vegetarian_recipes',
 'https://www.101cookbooks.com/vegan_recipes',
 'https://www.101cookbooks.com/soups',
 'https://www.101cookbooks.com/sides',
 'https://www.101cookbooks.com/sandwiches',
 'https://www.101cookbooks.com/salads',
 'https://www.101cookbooks.com/pasta',
 'https://www.101cookbooks.com/quick_recipes',
 'https://www.101cookbooks.com/main_courses',
 'https://www.101cookbooks.com/low_carb_recipes',
 'https://www.101cookbooks.com/instant_pot_recipes',
 'https://www.101cookbooks.com/holiday_recipes',
 'https://www.101cookbooks.com/high_protein_recipes',
 'https://www.101cookbooks.com/gluten_free_recipes',
 'https://www.101cookbooks.com/drink_recipes',
 'https://www.101cookbooks.com/dinner_ideas',
 'https://www.101cookbooks.com/desserts',
 'https://www.101cookbooks.com/cookies',
 'https://www.101cookbooks.com/chocolate_recipes',
 'https://www.101cookbooks.com/breakfast_brunch',
 'https://www.101cookbooks.com/baked_goods',
 'https://www.101cookbooks.com/appetizers']

In [39]:
for c in categories:
    c_name = c.split("/")[-1]
    for i in range(1, 40):
        page = scrape_me(c+"/page/"+str(i)).links()
        
        if len(page) > 198:
            break
            
        wait()
        for i, link in enumerate(page[1:]):
            if link['href'] == page[i]['href']:
                recipe = link['href']
                
                if recipe in recipe_list:
                    query = {"_id":recipe}
                    types = list(urls.find(query))[0]["type"]
                    if c_name not in types:
                        types.append(c_name)
                        urls.update_one(query, {"$set":{"type":types}})   
                else:
                    name = recipe
                    if "archives" in recipe:
                        name = recipe.split("/archives/")[1][:-5]
                    else:
                        name = recipe.split(".com/")[1][:-1]
                        
                    urls.insert_one({"_id":recipe,
                                    "name":name,
                                    'read':False,
                                    'type':[c_name],
                                    'source':source})
                    recipe_list.append(recipe)
        print(f"Recipes Scraped: {len(recipe_list)}")

Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 122
Recipes Scraped: 129
Recipes Scraped: 138
Recipes Scraped: 148
Recipes Scraped: 157
Recipes Scraped: 164
Recipes Scraped: 172
Recipes Scraped: 176
Recipes Scraped: 179
Recipes Scraped: 188
Recipes Scraped: 193
Recipes Scraped: 199
Recipes Scraped: 207
Recipes Scraped: 217
Recipes Scraped: 224
Recipes Scraped: 231
Recipes Scraped: 237
Recipes Scraped: 243
Recipes Scraped: 249
Recipes Scraped: 258
Recipes Scraped: 264
Recipes Scraped: 268
Recipes Scraped: 271
Recipes Scraped: 278
Recipes Scraped: 286
Recipes Scraped: 293
Recipes Scraped: 303
Recipes Scraped: 313
Recipes Scraped: 323
Recipes Scraped: 333
Recipes Scraped: 339
Recipes Scraped: 346
Recipes Scraped: 356
Recipes Scraped: 364
Recipes Scraped: 373
Recipes Scraped: 380
Recipes Scrap

### Inspiralized

In [41]:
source = 'inspiralized'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]

In [42]:
for i in range(1, 60):
    page = scrape_me("https://inspiralized.com/recipe-index/page/"+str(i)).links()
    wait()
    for link in page:
        if "data-id" in link.keys():
            recipe = link['href']
            if recipe not in recipe_list:
                urls.insert_one({"_id":recipe,
                                "name":recipe.split(".com/")[1][:-1],
                                'read':False,
                                'type':[],
                                'source':source,
                                "website_id":link["data-id"]})
                recipe_list.append(recipe)
    print(f"Recipes Scraped: {len(recipe_list)}")

Recipes Scraped: 18
Recipes Scraped: 30
Recipes Scraped: 42
Recipes Scraped: 54
Recipes Scraped: 66
Recipes Scraped: 78
Recipes Scraped: 90
Recipes Scraped: 102
Recipes Scraped: 114
Recipes Scraped: 126
Recipes Scraped: 138
Recipes Scraped: 150
Recipes Scraped: 162
Recipes Scraped: 174
Recipes Scraped: 186
Recipes Scraped: 198
Recipes Scraped: 210
Recipes Scraped: 222
Recipes Scraped: 234
Recipes Scraped: 246
Recipes Scraped: 258
Recipes Scraped: 270
Recipes Scraped: 282
Recipes Scraped: 294
Recipes Scraped: 306
Recipes Scraped: 318
Recipes Scraped: 330
Recipes Scraped: 342
Recipes Scraped: 354
Recipes Scraped: 366
Recipes Scraped: 378
Recipes Scraped: 390
Recipes Scraped: 402
Recipes Scraped: 414
Recipes Scraped: 426
Recipes Scraped: 438
Recipes Scraped: 450
Recipes Scraped: 462
Recipes Scraped: 474
Recipes Scraped: 486
Recipes Scraped: 498
Recipes Scraped: 510
Recipes Scraped: 514
Recipes Scraped: 514
Recipes Scraped: 514
Recipes Scraped: 514
Recipes Scraped: 514
Recipes Scraped: 514

### Jamie Oliver

In [63]:
source = 'jamie_oliver'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]
categories = [
 'https://www.jamieoliver.com/recipes/category/tesco-community-cookery-school/',
 'https://www.jamieoliver.com/recipes/category/dishtype/pasta-risotto',
 'https://www.jamieoliver.com/recipes/category/dishtype/salad/',
 'https://www.jamieoliver.com/recipes/category/dishtype/bread-doughs/',
 'https://www.jamieoliver.com/recipes/category/dishtype/curry/',
 'https://www.jamieoliver.com/recipes/category/dishtype/vegetable-sides/',
 'https://www.jamieoliver.com/recipes/category/dishtype/soup/',
 'https://www.jamieoliver.com/recipes/category/dishtype/antipasti/',
 'https://www.jamieoliver.com/recipes/category/dishtype/roast',
 'https://www.jamieoliver.com/recipes/category/dishtype/bbq-food/',
 'https://www.jamieoliver.com/recipes/category/dishtype/stews/',
 'https://www.jamieoliver.com/recipes/category/dishtype/pizza/',
 'https://www.jamieoliver.com/recipes/category/dishtype/sandwiches-wraps/',
 'https://www.jamieoliver.com/recipes/category/dishtype/cakes-tea-time-treats/',
 'https://www.jamieoliver.com/recipes/category/dishtype/pies-pastries/',
 'https://www.jamieoliver.com/recipes/category/dishtype/sauces-condiments/',
 'https://www.jamieoliver.com/recipes/category/dishtype/puddings-desserts/',
 'https://www.jamieoliver.com/recipes/category/dishtype/drinks/',
 'https://www.jamieoliver.com/recipes/cookie-recipes/',
 'https://www.jamieoliver.com/recipes/meatball-recipes/',
 'https://www.jamieoliver.com/recipes/muffin-recipes/',
 'https://www.jamieoliver.com/recipes/category/dishtype/pasta-bake/']

In [None]:
for c in categories:
    c_name = c[:-1].split("/")[-1]
    for i in range(1, 60):
        page = scrape_me(c+"?rec-page="+str(i)).links()
        
        if len(page) < 111:
            break
            
        wait()
        for link in page:
            if "id" in link.keys():
                if "gtm_recipe" in link["id"]:
                    recipe = "https://www.jamieoliver.com"+link['href']
                
                    if recipe in recipe_list:
                        query = {"_id":recipe}
                        types = list(urls.find(query))[0]["type"]
                        if c_name not in types:
                            types.append(c_name)
                            urls.update_one(query, {"$set":{"type":types}})   
                    else:
                        urls.insert_one({"_id":recipe,
                                        "name":recipe[:-1].split("/")[-1],
                                        'read':False,
                                        'type':[c_name],
                                        'source':source})
                        recipe_list.append(recipe)
        print(f"Recipes Scraped: {len(recipe_list)}")

### Kreme de la Krum

In [9]:
source = 'creme_de_la_crum'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]

In [10]:
for i in range(1, 53):
    page = scrape_me("https://www.lecremedelacrumb.com/recipe-index//page/"+str(i)).links()
    wait()
    for link in page:
        for link in page:
            if 'rel' in link.keys():
                if "bookmark" in link["rel"]:
                    recipe = link['href']
                    if recipe not in recipe_list:
                        urls.insert_one({"_id":recipe,
                                        "name":recipe.split(".com/")[1][:-1],
                                        'read':False,
                                        'type':[],
                                        'source':source,})
                        recipe_list.append(recipe)
    print(f"Recipes Scraped: {len(recipe_list)}")

Recipes Scraped: 58
Recipes Scraped: 81
Recipes Scraped: 105
Recipes Scraped: 129
Recipes Scraped: 153
Recipes Scraped: 177
Recipes Scraped: 201
Recipes Scraped: 218
Recipes Scraped: 240
Recipes Scraped: 261
Recipes Scraped: 283
Recipes Scraped: 304
Recipes Scraped: 325
Recipes Scraped: 347
Recipes Scraped: 370
Recipes Scraped: 391
Recipes Scraped: 412
Recipes Scraped: 436
Recipes Scraped: 460
Recipes Scraped: 484
Recipes Scraped: 508
Recipes Scraped: 530
Recipes Scraped: 554
Recipes Scraped: 578
Recipes Scraped: 602
Recipes Scraped: 626
Recipes Scraped: 650
Recipes Scraped: 674
Recipes Scraped: 698
Recipes Scraped: 722
Recipes Scraped: 746
Recipes Scraped: 770
Recipes Scraped: 792
Recipes Scraped: 816
Recipes Scraped: 840
Recipes Scraped: 864
Recipes Scraped: 888
Recipes Scraped: 912
Recipes Scraped: 936
Recipes Scraped: 960
Recipes Scraped: 984
Recipes Scraped: 1008
Recipes Scraped: 1032
Recipes Scraped: 1056
Recipes Scraped: 1080
Recipes Scraped: 1104
Recipes Scraped: 1128
Recipes S

### Minimalist Baker

In [13]:
source = 'minamalist_baker'
recipe_list = [x["_id"] for x in list(urls.find({"source":source}, {"_id":1}))]

In [14]:
for i in range(1, 63):
    page = scrape_me("https://minimalistbaker.com/recipe-index/?fwp_paged="+str(i)).links()
    wait()
    for link in page:
        for link in page:
            if 'tabindex' in link.keys():
                recipe = link['href']
                if recipe not in recipe_list:
                    urls.insert_one({"_id":recipe,
                                        "name":recipe.split(".com/")[1][:-1],
                                        'read':False,
                                        'type':[],
                                        'source':source,})
                    recipe_list.append(recipe)
    print(f"Recipes Scraped: {len(recipe_list)}")

Recipes Scraped: 20
Recipes Scraped: 40
Recipes Scraped: 60
Recipes Scraped: 79
Recipes Scraped: 99
Recipes Scraped: 118
Recipes Scraped: 138
Recipes Scraped: 158
Recipes Scraped: 178
Recipes Scraped: 198
Recipes Scraped: 218
Recipes Scraped: 238
Recipes Scraped: 258
Recipes Scraped: 278
Recipes Scraped: 298
Recipes Scraped: 318
Recipes Scraped: 338
Recipes Scraped: 358
Recipes Scraped: 378
Recipes Scraped: 398
Recipes Scraped: 418
Recipes Scraped: 438
Recipes Scraped: 458
Recipes Scraped: 478
Recipes Scraped: 498
Recipes Scraped: 518
Recipes Scraped: 538
Recipes Scraped: 558
Recipes Scraped: 578
Recipes Scraped: 598
Recipes Scraped: 618
Recipes Scraped: 638
Recipes Scraped: 658
Recipes Scraped: 678
Recipes Scraped: 698
Recipes Scraped: 718
Recipes Scraped: 738
Recipes Scraped: 758
Recipes Scraped: 778
Recipes Scraped: 798
Recipes Scraped: 818
Recipes Scraped: 838
Recipes Scraped: 858
Recipes Scraped: 878
Recipes Scraped: 898
Recipes Scraped: 918
Recipes Scraped: 938
Recipes Scraped: 9

### Next One

In [15]:
list(urls.find({"source":source}))

[{'_id': 'https://minimalistbaker.com/instant-pot-chickpeas/',
  'name': 'instant-pot-chickpeas',
  'read': False,
  'type': [],
  'source': 'minamalist_baker'},
 {'_id': 'https://minimalistbaker.com/dairy-free-coffee-creamer/',
  'name': 'dairy-free-coffee-creamer',
  'read': False,
  'type': [],
  'source': 'minamalist_baker'},
 {'_id': 'https://minimalistbaker.com/quick-easy-recipes-30-minutes-or-less/',
  'name': 'quick-easy-recipes-30-minutes-or-less',
  'read': False,
  'type': [],
  'source': 'minamalist_baker'},
 {'_id': 'https://minimalistbaker.com/easy-avocado-aioli-oil-free/',
  'name': 'easy-avocado-aioli-oil-free',
  'read': False,
  'type': [],
  'source': 'minamalist_baker'},
 {'_id': 'https://minimalistbaker.com/1-pot-vegetable-green-curry/',
  'name': '1-pot-vegetable-green-curry',
  'read': False,
  'type': [],
  'source': 'minamalist_baker'},
 {'_id': 'https://minimalistbaker.com/easy-beet-falafel/',
  'name': 'easy-beet-falafel',
  'read': False,
  'type': [],
  'so

In [12]:
page = scrape_me("https://minimalistbaker.com/recipe-index/?fwp_paged=2").links()
page

[{'href': '#genesis-nav-primary', 'class': ['screen-reader-shortcut']},
 {'href': '#main-content', 'class': ['screen-reader-shortcut']},
 {'href': 'https://www.facebook.com/MinimalistBaker',
  'target': '_blank',
  'rel': ['noopener', 'noreferrer']},
 {'href': 'https://instagram.com/minimalistbaker/',
  'target': '_blank',
  'rel': ['noopener', 'noreferrer']},
 {'href': 'https://www.pinterest.com/minimalistbaker/',
  'target': '_blank',
  'rel': ['noopener', 'noreferrer']},
 {'href': 'https://twitter.com/minimalistbaker',
  'target': '_blank',
  'rel': ['noopener', 'noreferrer']},
 {'href': 'https://www.youtube.com/c/Minimalistbaker',
  'target': '_blank',
  'rel': ['noopener', 'noreferrer']},
 {'href': 'https://minimalistbaker.com/recipe-index/'},
 {'href': 'https://minimalistbaker.com/recipe-index/?fwp_special-diet=vegan'},
 {'href': 'https://minimalistbaker.com/recipe-index/?fwp_special-diet=gluten-free'},
 {'href': 'https://minimalistbaker.com/'},
 {'href': 'https://minimalistbaker

# Self-scraped

In [63]:
import extruct
import requests
import re
from w3lib.html import get_base_url
from bs4 import BeautifulSoup

In [64]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
}

In [76]:
def get_links(url):

    r = requests.get(url, headers=HEADERS)
    base_url = get_base_url(r.text, r.url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = []

    for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        links.append(link.get('href'))

    return links

def add_links(ll, nlinks, domain):
    
    for link in nlinks:
        if domain in link:
            if link not in ll.keys():
                ll[link] = 0
    return ll

def get_all_links(url, domain):
    
    # Initialize
    link_library = {}
    link_library = add_links(link_library, get_links(url), domain)
            
    # Loop through until all links have been recorded
    more_pages = True
    while more_pages:
        remaining = {k:v for (k,v) in link_library.items() if v == 0}
        
        if len(remaining) < 1:
            more_pages = False
        else:
            for link in remaining:
                wait()
                new_links = get_links(link)
                link_library = add_links(link_library, new_links, domain)
                link_library[link] = 1
                print(len(link_library))
        print(link_libary)
    return link_library

In [None]:
class RecipeCollector:
    
    def __init__(url, domain):
        
        # User Params
        self.base_url = url
        self.domain = domain
        
        # Dictionaries for checking links
        self.link_library = {}
        self.recipe_library = {}
        
    
    # Helper Functions
    def get_links(self, page):
        """
        Returns all the links on a page
        """
        r = requests.get(page, headers=HEADERS)
        soup = BeautifulSoup(r.content, "html.parser")
        
        # Look for all links on page
        for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
            if link not in links:
                links.append(link.get('href'))

        return links
    
    def is_recipe(self, link_data):
        """
        Checks if a given link is a recipe
        """
        for item in link_data['json-ld']:
            if "@type" in item.keys():
                if item["@type"] == 'Recipe':
                    return True
        return False
    
    # Main Orchestrtation Function
    def Run(self):
        
        # Start with homepage
        add_links_to_library()
    
    
    def add_links_to_libary(self):

        # Get exhaustive list
        links = self.get_links(self.url)
        
        # Loop through and filter/mark as seen
        for link in links:
            
            # Check if valid          
            if self.domain in link:
                
                # Check if recipe
                if is_recipe(link):
                    link_library[link] = "recipe"
                    
                # Otherwise either add it or set it to
                else:
                    if link in self.link_library.keys():
                        link_library[link] = 1
                    else:
                        link_library[link] = 0

    def get_all_links(url, domain):

        # Initialize
        link_library = {}
        link_library = add_links(link_library, get_links(url), domain)

        # Loop through until all links have been recorded
        more_pages = True
        while more_pages:
            remaining = {k:v for (k,v) in link_library.items() if v == 0}

            if len(remaining) < 1:
                more_pages = False
            else:
                for link in remaining:
                    wait()
                    new_links = get_links(link)
                    link_library = add_links(link_library, new_links, domain)
                    link_library[link] = 1
                    print(len(link_library))
            print(link_libary)
        return link_library

get_all_links(url='https://sallysbakingaddiction.com/', domain='sallysbakingaddiction.com')

link_library

{k:v for (k,v) in test.items() if v > 1}

test = get_links('https://sallysbakingaddiction.com/')
test

links = []
for y in x:
    if "https://sallysbakingaddiction.com/" in y:
        links.append(y)

links

r = requests.get('https://sallysbakingaddiction.com/cookies-n-cream-cookies/', headers=HEADERS)
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)
{k:v for (k,v) in data['json-ld'].items() if v == 'https://schema.org/'}

for item in data['json-ld']:
    if "@type" in item.keys():
        if item["@type"] == 'Recipe':
            print(item)

r = requests.get('https://noblepig.com/2019/06/roasted-garlic-bruschetta/', headers=HEADERS)
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)
data['json-ld']
#BeautifulSoup(r.content, "html.parser")

data['json-ld'][0]['@type']