In [126]:
import requests
from bs4 import BeautifulSoup
import json
import sqlite3
import pandas as pd


CREATE SQLite DATABASE

In [127]:
# Connect to the SQLite database
recipe_conn = sqlite3.connect('recipes.db')
c = recipe_conn.cursor()


In [128]:
# Create recipes table if it doesn't exist
c.execute('''CREATE TABLE IF NOT EXISTS recipes
             (id INTEGER PRIMARY KEY AUTOINCREMENT,
             name TEXT,
             author TEXT,
             description TEXT,
             serving_capacity TEXT,
             prep_time TEXT,
             total_ingredients TEXT,
             nutrition TEXT,
             instructions TEXT)''')

<sqlite3.Cursor at 0x7b7c5dc54ac0>

EXTRACT RECIPE INFORMATION

In [129]:
def extract_recipe_info(url):
    try:
        url=url[0:3]
        res = requests.get(url).text
        soup = BeautifulSoup(res, "html.parser")

        script_tag = soup.find("script", {"type": "application/ld+json"})
        if not script_tag:
            return None

        script_content = script_tag.string.strip()
        recipe_data = json.loads(script_content)

        name = recipe_data.get("name", "")
        author = recipe_data.get("author", "")
        description = recipe_data.get("description", "")
        recipe_yield = recipe_data.get("recipeYield", "")
        prep_time_element = soup.find("dt", class_="facts__label", string="Ready In:")
        prep_time = prep_time_element.find_next_sibling("dd", class_="facts__value").get_text(strip=True) if prep_time_element else None
        total_ingredients_element = soup.find("dt", class_="facts__label", string="Ingredients:")
        total_ingredients=total_ingredients_element.find_next_sibling("dd", class_="facts__value").get_text(strip=True) if total_ingredients_element else None
        nutrition = recipe_data.get("nutrition", {}).get("calories", "")
        ingredients_list = []
        ingredient_items = soup.find_all("li", class_="style__ingredientsListItem__1s9da")
        for item in ingredient_items:
            quantity = item.find("span", class_="ingredient-quantity").text.strip()
            ingredient_text = item.find("span", class_="ingredient-text").text.strip()
            ingredients_list.append(f"{quantity} {ingredient_text}")

        instructions = []
        if "recipeInstructions" in recipe_data:
            instructions = [step.get("text", "") for step in recipe_data["recipeInstructions"] if step.get("@type") == "HowToStep"]
        return {
            "name": name,
            "author": author,
            "description": description,
            "serving_capacity": recipe_yield,
            "prep_time": prep_time,
            "total_ingredients": total_ingredients,
            "ingredients": ingredients_list,
            "nutrition": nutrition,
            "instructions": instructions
        }
    except Exception as e:
        print(f"Error extracting information from {url}: {e}")
        return None

In [130]:
def write_to_csv(recipe_info_list):
    df = pd.DataFrame(recipe_info_list)
    df.to_csv("recipes.csv", index=False, encoding='utf-8')

INSERT RECIPE INFORMATION TO THE TABLE

In [131]:
recipe_info_list = []
def insert_recipe_info(recipe_info):
    if recipe_info is not None:
        try:
            c.execute('''INSERT INTO recipes
                         (name, author, description, serving_capacity, prep_time, total_ingredients, nutrition, instructions)
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                      (recipe_info['name'], recipe_info['author'], recipe_info['description'],
                       recipe_info['serving_capacity'], recipe_info['prep_time'], recipe_info['total_ingredients'],
                       recipe_info['nutrition'], '\n'.join(recipe_info['instructions'])))
            recipe_conn.commit()
            recipe_info_list.append({
                "Name": recipe_info['name'],
                "Author": recipe_info['author'],
                "Description": recipe_info['description'],
                "Serving Capacity": recipe_info['serving_capacity'],
                "Prep Time": recipe_info['prep_time'],
                "Total Ingredients": recipe_info['total_ingredients'],
                "Nutrition": recipe_info['nutrition'],
                "Instructions": '\n'.join(recipe_info['instructions'])
            })
        except Exception as e:
            print(f"Error inserting {recipe_info['name']} into database: {e}")


MAIN FUNCTION TO SCRAP RECIPES

In [132]:
def scrape_recipes(url):
    all_sitemap_links=[]
    all_type_recipe_links=[]
    try:
        sitemap_url = "https://www.food.com/html-sitemap"
        res = requests.get(sitemap_url)
        if res.status_code == 200:
          soup = BeautifulSoup(res.content, 'html.parser')
          sitemap_cols = soup.find_all("div", class_="sitemap-col")
          for cols in sitemap_cols:
            all_links = cols.find_all('a')
            all_sitemap_links.extend(all_links)
          for link in all_sitemap_links:
             href = link.get('href')
             all_type_recipe_links.append(href)
          for category_link in all_type_recipe_links:
                if not category_link.startswith("https://www.food.com/how-to")and not category_link.startswith ('https://www.food.com/about'):
                  res = requests.get(category_link).text
                  soup = BeautifulSoup(res, "html.parser")
                  button_link = soup.find("a", class_="button--primary")
                  if button_link:
                      button_url = button_link["href"]
                      res = requests.get(button_url).text
                      soup = BeautifulSoup(res, "html.parser")
                      smart_cards = soup.find_all('div', class_='smart-card')
                      for smart_card in smart_cards:
                          hyperlink_element = smart_card.find('a', href=True)
                          if hyperlink_element:
                              recipe_url = hyperlink_element['href']
                              recipe_info = extract_recipe_info(recipe_url)
                              insert_recipe_info(recipe_info)
                  else:
                          # If button link is not present, handle the smart-collection-inner section
                          smart_collections = soup.find_all('div', class_='smart-collection-inner container-md')
                          for smart_collection in smart_collections:
                              smart_cards = smart_collection.find_all('div', class_='smart-card')
                              for smart_card in smart_cards:
                                  hyperlink_element = smart_card.find('a', href=True)
                                  if hyperlink_element:
                                      recipe_url = hyperlink_element['href']
                                      recipe_info = extract_recipe_info(recipe_url)
                                      insert_recipe_info(recipe_info)

    except Exception as e:
        print(f"Error scraping recipes: {e}")


RUN THE SCRAPPING PROCESS

In [133]:
if __name__ == "__main__":
    base_url = "https://www.food.com"
    scrape_recipes(base_url)
    write_to_csv(recipe_info_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'name': "Jo Mama's Beef Stew", 'author': 'Sharlene~W', 'description': "Hearty and delicious. My family's favorite stew for over 30 years.  If you prefer, you can substitute basil for the rosemary--I fix it both ways.  Update:  When I originally made this recipe, I would coat the beef pieces with a flour/salt/pepper mix by shaking them in a bag and then brown them in a couple tablespoons of oil. I eliminated this step to simplify and lower the calories and fat, but if you do it that way it makes for a thicker sauce. Good either way!", 'serving_capacity': '6 serving(s)', 'prep_time': '1hr 5mins', 'total_ingredients': '10', 'ingredients': [], 'nutrition': '314.4', 'instructions': ['Shake stew meat up in a bag with flour to cover.', 'Brown in vegetable oil in stew pot.', 'Cover with water and bring to a boil.', 'Simmer for 30 minutes or more.  (I like to simmer until the beef is nice and tender).', 'Add remaining ingredients

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
c.execute('SELECT * FROM recipes')
rows=c.fetchall()
for row in rows:
  print(row)

In [None]:
c.execute("SELECT COUNT(*) FROM recipes")
num_entries = c.fetchone()[0]


Number of entries: 0
