In [14]:
#import libraries
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import csv
from urllib.request import Request
from urllib.error import URLError
from urllib.parse import urlparse
import os
import requests

## Getting Article Data

In [15]:
# getting article data
def get_recipe_name(soup):
    """
    Return the recipe title from the page's <h1 class="article-heading">.

    The heading text is stripped of surrounding whitespace. When the heading
    lookup yields nothing, the placeholder "No recipe name found" is returned.
    """
    heading = soup.find('h1', class_='article-heading')
    if not heading:
        return "No recipe name found"
    return heading.text.strip()

def get_ingredients(soup):
    """
    Collect the structured ingredient list from a parsed recipe page.

    Returns one dict per <li class="mm-recipes-structured-ingredients__list-item">
    with the keys 'quantity', 'unit' and 'name'. Each value is the stripped text
    of the <span> carrying the matching data-ingredient-* attribute set to
    "true", or None when that span is not found.
    """
    def text_or_none(tag):
        # A missing span is reported as None rather than an empty string.
        return tag.text.strip() if tag else None

    parsed = []
    for entry in soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item'):
        quantity_span = entry.find('span', {'data-ingredient-quantity': 'true'})
        unit_span = entry.find('span', {'data-ingredient-unit': 'true'})
        name_span = entry.find('span', {'data-ingredient-name': 'true'})
        parsed.append(dict(
            quantity=text_or_none(quantity_span),
            unit=text_or_none(unit_span),
            name=text_or_none(name_span),
        ))
    return parsed

def get_steps(soup):
    """
    Return the preparation steps of a parsed recipe page, in page order.

    The steps are read from the <ol> whose id is 'mntl-sc-block_1-0'. For every
    <li> inside it, the stripped text of its first <p> is collected; list items
    without a <p> are skipped. An empty list is returned when the <ol> is not
    found.
    """
    ordered_list = soup.find('ol', {'id': 'mntl-sc-block_1-0'})
    if not ordered_list:
        return []

    # One paragraph lookup per list item; items without a <p> yield nothing.
    paragraphs = (entry.find('p') for entry in ordered_list.find_all('li'))
    return [paragraph.text.strip() for paragraph in paragraphs if paragraph]

def get_image_src(soup):
    """
    Return a preview image URL for a parsed recipe page, or None.

    Lookup order:
      1. the 'src' of the <img> inside <div class="primary-image__media">;
      2. the 'poster' attribute of the first <video> tag;
      3. the 'data-poster' attribute of that same <video> tag.
    """
    # Preferred source: the primary image block.
    container = soup.find('div', class_='primary-image__media')
    picture = container.find('img') if container else None
    source = picture.get('src') if picture else None
    if source:
        return source

    # Fallback: the still frame advertised by the page's <video> tag.
    video = soup.find('video')
    if not video:
        return None

    poster = video.get('poster') or video.get('data-poster')
    return poster if poster else None

In [16]:
def _nutrition_label_value(soup, row_class):
    """Return the stripped text of the <span> that follows the label span in the given nutrition-label row, or None."""
    row = soup.find('tr', class_=row_class)
    if not row:
        return None
    label = row.find('span', class_='mm-recipes-nutrition-facts-label__table-head-pretext')
    if not label:
        return None
    # find_next() returns the next matching element in document order
    # (not necessarily a sibling of the label).
    value = label.find_next('span')
    return value.text.strip() if value else None


def get_nutrition_data(soup):
    """
    Extract nutrition information from a parsed recipe page.

    Two page sections are read:

    * the summary table ('mm-recipes-nutrition-facts-summary__table'), from which
      the rows named calories, fat, carbs and protein are kept (keys lower-cased);
    * the nutrition label, from which 'servings' and 'calories' are read. The
      label's calories value overwrites the summary one when both are present.

    Summary rows with fewer than two <td> cells are skipped instead of raising
    IndexError, so one malformed row cannot abort the scrape.

    Returns a dict of strings; it is empty when neither section is found.
    """
    nutrition_data = {}

    # Summary table: first <td> is the value, second <td> is the nutrient name.
    nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
    if nutrition_table:
        for row in nutrition_table.find_all('tr', class_='mm-recipes-nutrition-facts-summary__table-row'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue  # malformed row: no value/name pair to read
            value = cells[0].text.strip()
            name = cells[1].text.strip().lower()
            if name in ('calories', 'fat', 'carbs', 'protein'):
                nutrition_data[name] = value

    # Nutrition label rows; processed after the summary so they take precedence.
    label_rows = (
        ('servings', 'mm-recipes-nutrition-facts-label__servings'),
        ('calories', 'mm-recipes-nutrition-facts-label__calories'),
    )
    for key, row_class in label_rows:
        label_value = _nutrition_label_value(soup, row_class)
        if label_value is not None:
            nutrition_data[key] = label_value

    return nutrition_data


In [17]:
def fetch_recipe_data(url, timeout=30):
    """
    Download one recipe page and return its scraped data.

    Parameters:
        url: address of the recipe page.
        timeout: seconds to wait on the network before giving up, so a stalled
            connection cannot hang a long crawl indefinitely.

    Returns:
        dict with the keys 'name', 'ingredients', 'steps', 'nutrition', 'link'
        (the url that was passed in) and 'image_url', filled by the get_*
        helper functions.

    Raises:
        Whatever urlopen raises for network/HTTP failures (e.g. URLError).
    """
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    req = Request(url, headers=headers)

    # The context manager closes the connection even if reading fails.
    with urlopen(req, timeout=timeout) as response:
        # Undecodable bytes are replaced rather than aborting the whole page.
        html = response.read().decode('utf-8', errors='replace')

    soup = BeautifulSoup(html, 'html.parser')

    return {
        'name': get_recipe_name(soup),
        'ingredients': get_ingredients(soup),
        'steps': get_steps(soup),
        'nutrition': get_nutrition_data(soup),
        'link': url,
        'image_url': get_image_src(soup)
    }

#### Example Usage

In [18]:
# Smoke test for fetch_recipe_data: point `url` at any single recipe page.
# Exactly one assignment should be active; the commented-out lines are
# alternative pages to try.

# url = 'https://www.allrecipes.com/air-fryer-baked-yams-recipe-8737640'
url = 'https://www.allrecipes.com/recipe/220619/real-homemade-bagels/'
# url = 'https://www.allrecipes.com/recipe/15683/dutch-apple-pie-with-oatmeal-streusel/'

# Fetch the page (performs a live HTTP request) and show the raw result dict.

recipe_data = fetch_recipe_data(url)

print(recipe_data)

{'name': 'Real Homemade Bagels', 'ingredients': [{'quantity': '1 ¼', 'unit': 'cups', 'name': 'water'}, {'quantity': '4 ½', 'unit': 'cups', 'name': 'bread flour'}, {'quantity': '3', 'unit': 'tablespoons', 'name': 'white sugar'}, {'quantity': '1', 'unit': 'teaspoon', 'name': 'salt'}, {'quantity': '2', 'unit': 'tablespoons', 'name': 'vegetable oil'}, {'quantity': '1', 'unit': 'tablespoon', 'name': 'instant yeast'}, {'quantity': '4', 'unit': 'quarts', 'name': 'water'}, {'quantity': '1', 'unit': 'cup', 'name': 'honey (Optional)'}, {'quantity': '2', 'unit': 'tablespoons', 'name': 'poppy seeds (Optional)'}, {'quantity': '2', 'unit': 'tablespoons', 'name': 'sesame seeds (Optional)'}, {'quantity': '2', 'unit': 'tablespoons', 'name': 'dried onion flakes (Optional)'}, {'quantity': '1', 'unit': 'tablespoon', 'name': 'coarse salt (Optional)'}], 'steps': ['Make bagels: Combine water, flour, sugar, salt, vegetable oil, and yeast in the bowl of a stand mixer fitted with the dough hook. Mix on low spee

In [19]:
def print_recipe_formatted(recipe_data):
    """
    Print a recipe dict as a human-readable report on stdout.

    Expects the keys 'name', 'ingredients', 'steps', 'nutrition' and
    'image_url'. Sections are printed in that order: ingredients one per line,
    steps numbered from 1, nutrition as "Key: value" with the key capitalized,
    followed by the image URL and a dashed separator line.
    """
    print("\nRecipe Name: " + str(recipe_data['name']))

    print("\nIngredients:")
    for entry in recipe_data['ingredients']:
        print(entry)

    print("\nSteps:")
    for position, instruction in enumerate(recipe_data['steps'], start=1):
        print("{}. {}".format(position, instruction))

    print("\nNutrition Data:")
    for label, amount in recipe_data['nutrition'].items():
        print("{}: {}".format(label.capitalize(), amount))

    print("\nImage Preview: {}".format(recipe_data['image_url']))
    print("-------------------------------------------")

In [20]:
# Re-fetch the example page and show it in the readable layout.
# NOTE: `url` is the global set in the example-usage cell above, so that cell
# must have been run first; this line performs another live HTTP request.
recipe_data = fetch_recipe_data(url)

print_recipe_formatted(recipe_data)


Recipe Name: Real Homemade Bagels

Ingredients:
{'quantity': '1 ¼', 'unit': 'cups', 'name': 'water'}
{'quantity': '4 ½', 'unit': 'cups', 'name': 'bread flour'}
{'quantity': '3', 'unit': 'tablespoons', 'name': 'white sugar'}
{'quantity': '1', 'unit': 'teaspoon', 'name': 'salt'}
{'quantity': '2', 'unit': 'tablespoons', 'name': 'vegetable oil'}
{'quantity': '1', 'unit': 'tablespoon', 'name': 'instant yeast'}
{'quantity': '4', 'unit': 'quarts', 'name': 'water'}
{'quantity': '1', 'unit': 'cup', 'name': 'honey (Optional)'}
{'quantity': '2', 'unit': 'tablespoons', 'name': 'poppy seeds (Optional)'}
{'quantity': '2', 'unit': 'tablespoons', 'name': 'sesame seeds (Optional)'}
{'quantity': '2', 'unit': 'tablespoons', 'name': 'dried onion flakes (Optional)'}
{'quantity': '1', 'unit': 'tablespoon', 'name': 'coarse salt (Optional)'}

Steps:
1. Make bagels: Combine water, flour, sugar, salt, vegetable oil, and yeast in the bowl of a stand mixer fitted with the dough hook. Mix on low speed until dough

## Crawling

#### Scraping 1 collection

In [21]:
def get_recipe_urls(page_url, timeout=30):
    """
    Return the recipe links found on a collection (listing) page.

    Parameters:
        page_url: address of the page that lists recipe cards.
        timeout: seconds to wait on the network before giving up.

    Returns:
        list of href strings taken from <a class="mntl-card-list-items">
        elements, in page order, with empty hrefs dropped and duplicates removed
        (first occurrence wins).

    Raises:
        Whatever urlopen raises for network/HTTP failures (e.g. URLError).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    req = Request(page_url, headers=headers)

    # The context manager closes the connection even if reading fails.
    with urlopen(req, timeout=timeout) as response:
        # Undecodable bytes are replaced rather than aborting the whole page.
        html = response.read().decode('utf-8', errors='replace')

    soup = BeautifulSoup(html, 'html.parser')

    # The card class name is tied to the site's current markup.
    recipe_cards = soup.find_all('a', class_='mntl-card-list-items')
    hrefs = (card.get('href') for card in recipe_cards)

    # dict.fromkeys keeps insertion order, giving an order-preserving de-dup
    # without the quadratic "x not in list" scan.
    return list(dict.fromkeys(href for href in hrefs if href))

In [22]:
import time
def scrape_recipes_from_page(page_url):
    """
    Scrape every recipe linked from a collection page.

    The recipe links come from get_recipe_urls(page_url); each one is fetched
    with fetch_recipe_data, with a 0.1 second pause between requests. Progress
    is printed on a single, continuously overwritten console line.

    A recipe whose download fails with an OSError (urllib's URLError and
    HTTPError are subclasses, as are socket timeouts and connection resets) is
    reported and skipped, so one bad link does not discard the recipes already
    collected for this page.

    Returns:
        list of recipe dicts, in link order, for the recipes fetched successfully.
    """
    recipe_data_list = []

    # Step 1: collect the recipe links from the collection page.
    recipe_urls = get_recipe_urls(page_url)
    total = len(recipe_urls)

    # The last path segment of the collection URL is used as a label.
    last_part = urlparse(page_url).path.strip('/').split('/')[-1]
    print(f"\r{last_part}: {total} recipe links", flush=True)

    # Step 2: fetch each recipe in turn.
    for counter, recipe_url in enumerate(recipe_urls, 1):
        print(f"\r({counter}/ {total}) Fetching data for: {recipe_url}", end='', flush=True)

        try:
            recipe_data_list.append(fetch_recipe_data(recipe_url))
        except OSError as error:
            # Leave the progress line, report the failure, and keep going.
            print(f"\nSkipping {recipe_url}: {error}", flush=True)

        time.sleep(0.1)  # small pause between requests

    print()

    return recipe_data_list

## Saving to CSV

In [23]:
def save_recipes_to_csv(page_url, directory, recipes=None):
    """
    Write scraped recipes to "<directory>/<last URL segment>-recipes.csv".

    Parameters:
        page_url: collection URL the recipes came from; its last path segment
            names the CSV file.
        directory: output directory, created if it does not exist.
        recipes: iterable of recipe dicts (keys 'name', 'link', 'ingredients',
            'steps', 'nutrition', 'image_url'). When omitted, the module-level
            variable `recipes` is used, which keeps existing two-argument calls
            working; passing the list explicitly is preferred because it does
            not depend on notebook state.

    Raises:
        NameError: `recipes` was not passed and no global `recipes` exists.

    Each recipe becomes one row: ingredients are joined with ', ', steps with
    '. ', and nutrition is rendered as 'key: value' pairs joined with ', '.
    """
    if recipes is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            recipes = globals()['recipes']
        except KeyError:
            raise NameError(
                "name 'recipes' is not defined; pass the scraped recipes as the third argument"
            ) from None

    # The last path segment of the collection URL names the output file.
    last_part = urlparse(page_url).path.strip('/').split('/')[-1]

    os.makedirs(directory, exist_ok=True)
    csv_filename = os.path.join(directory, f"{last_part}-recipes.csv")

    fieldnames = ['name', 'link', 'ingredients', 'steps', 'nutrition', 'image_url']

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        for recipe_data in recipes:
            # Flatten the nested structures into plain strings for the CSV cell.
            ingredients = ', '.join(str(ingredient) for ingredient in recipe_data['ingredients'])
            steps = '. '.join(str(step) for step in recipe_data['steps'])
            nutrition = ', '.join(f"{key}: {value}" for key, value in recipe_data['nutrition'].items())

            writer.writerow({
                'name': recipe_data['name'],
                'link': recipe_data['link'],
                'ingredients': ingredients,
                'steps': steps,
                'nutrition': nutrition,
                'image_url': recipe_data['image_url']
            })
        print(f"Done Made a {csv_filename}")
    


In [24]:
from collections import defaultdict

# Index page that links to every recipe collection, A to Z.
alphabets_url = 'https://www.allrecipes.com/recipes-a-z-6735880'

# Parse inside the `with` block so the HTTP response is closed afterwards.
with urlopen(alphabets_url) as response:
    soup_obj = BeautifulSoup(response, 'html.parser')

# Every collection link on the index page.
links = soup_obj.find_all('a', class_='mntl-link-list__link')

# Collection URLs grouped by the upper-cased first character of the link text.
grouped_links = defaultdict(list)

for link in links:
    href = link.get('href')
    text = link.get_text(strip=True)
    # Skip anchors without a target or without visible text; an empty text
    # would otherwise raise IndexError on text[0].
    if not href or not text:
        continue
    first_letter = text[0].upper()
    grouped_links[first_letter].append(href)

# Print the grouped links
for letter, grouped in grouped_links.items():
    print(f"{letter}: {len(grouped)} links")
    for href in grouped:
        print(f"  {href}")

A: 8 links
  https://www.allrecipes.com/recipes/23070/everyday-cooking/cookware-and-equipment/air-fryer/
  https://www.allrecipes.com/recipes/16492/everyday-cooking/special-collections/allrecipes-allstars/
  https://www.allrecipes.com/recipes/385/desserts/cakes/angel-food-cake/
  https://www.allrecipes.com/recipes/102/appetizers-and-snacks/antipasto/
  https://www.allrecipes.com/recipes/76/appetizers-and-snacks/
  https://www.allrecipes.com/recipes/788/desserts/pies/apple-pie/
  https://www.allrecipes.com/recipes/1333/side-dish/applesauce/
  https://www.allrecipes.com/recipes/14913/appetizers-and-snacks/dips-and-spreads/artichoke-dip/
B: 31 links
  https://www.allrecipes.com/recipes/1537/bread/yeast-bread/bagels/
  https://www.allrecipes.com/recipes/1673/side-dish/beans-and-peas/baked-beans/
  https://www.allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/
  https://www.allrecipes.com/recipes/836/desserts/cookies/bar-cookies/
  https://www.allrecipes.com/recipes/200/

In [25]:
# Small subset of the grouped links (letters Y and Z) for quick trial runs.
# .get() is used so a missing letter maps to None without adding a key to
# grouped_links.
test_group = {initial: grouped_links.get(initial) for initial in ('Y', 'Z')}

def is_url_logged(log_file_path, url):
    """
    Report whether `url` already appears as a line in the progress log.

    Both the url and each log line are compared after stripping surrounding
    whitespace. A log file that does not exist yet counts as "nothing logged"
    and yields False instead of raising FileNotFoundError.
    """
    wanted = url.strip()
    try:
        with open(log_file_path, 'r', encoding='utf-8') as file:
            # Iterate lazily; stops at the first match without loading the
            # whole log into memory.
            return any(line.strip() == wanted for line in file)
    except FileNotFoundError:
        return False

def count_lines_in_file(file_path):
    """
    Return the number of lines in a text file.

    A file that does not exist yet is treated as empty (0 lines) instead of
    raising FileNotFoundError.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return sum(1 for _ in file)
    except FileNotFoundError:
        return 0

In [26]:
# Crawl driver: choose which half of the alphabet this run scrapes.
# The selectors were written against one collection page, so other pages may
# be structured differently.

scraper = "gavin"

# Each scraper owns a letter range and its own progress log.
if scraper == "gavin":
    log_file, letter_lo, letter_hi = 'logAtoN.txt', 'A', 'N'
elif scraper == "jason":
    log_file, letter_lo, letter_hi = 'logOtoZ.txt', 'O', 'Z'
else:
    # Fail fast instead of running with an undefined (or stale) log_file.
    raise ValueError(f"Unknown scraper {scraper!r}; expected 'gavin' or 'jason'")

for letter, grouped in grouped_links.items():

    # Skip letters outside this scraper's range (continue, not break, so the
    # result does not depend on the order of grouped_links).
    if not letter_lo <= letter <= letter_hi:
        continue

    print(f"\rGavin: {count_lines_in_file('logAtoN.txt')} | Jason: {count_lines_in_file('logOtoZ.txt')}", end='', flush=True)

    for counter, href in enumerate(grouped, 1):

        print(f"\r\n{letter}: ({counter}/ {len(grouped)})", flush=True)

        # The log makes the crawl resumable: finished collections are skipped.
        if is_url_logged(log_file, href):
            print(f"\rAlready scraped {href}")
            continue

        # `recipes` must stay a module-level name: the two-argument call to
        # save_recipes_to_csv below reads it from there.
        recipes = scrape_recipes_from_page(href)
        save_recipes_to_csv(href, f"{letter}-recipes")

        # Record the collection only after its CSV has been written.
        with open(log_file, 'a', encoding='utf-8') as file:
            file.write(href + '\n')
    print(f"\nCompleted all of {letter}-recipes!")
    print("-----------------------------------------")
    time.sleep(1)


Gavin: 0 | Jason: 0
A: (1/ 8)
air-fryer: 71 recipe links
(26/ 71) Fetching data for: https://www.allrecipes.com/air-fryer-ham-and-cheese-wraps-recipe-836511885849659

KeyboardInterrupt: 