In [21]:
#import libraries
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import csv
from urllib.request import Request
from urllib.error import URLError
from urllib.parse import urlparse
import os
import requests

## Getting Article Data

In [22]:
def get_recipe_name(soup):
    name = soup.find('h1', class_='heading-1') 
    return name.text.strip() if name else "No recipe name found"

def get_ingredients(soup):
        ingredients_list = []

        # Regex pattern to match quantity and unit (e.g., "120g", "1tsp", "400g can")
        quantity_unit_pattern = re.compile(r"(\d+(?:\.\d+)?)([a-zA-Z]+)?")

        # Find all the list items in the ingredients list
        ingredient_items = soup.find_all('li', class_='ingredients-list__item')

        for item in ingredient_items:
            text = item.get_text(strip=True)
            quantity = None
            unit = None
            ingredient = None
            item_note = None

            # Try to extract quantity and unit using regex
            match = quantity_unit_pattern.match(text)
            if match:
                quantity = match.group(1)  # Capture the numeric value
                unit = match.group(2)     # Capture the unit (if present)
                # Remove the quantity and unit from the text
                text = text[len(match.group(0)):].strip()

            # Extract ingredient note (if present)
            note_div = item.find('div', class_='ingredients-list__item-note')
            if note_div:
                item_note = note_div.get_text(strip=True)

            # Clean up the ingredient text
            ingredient = text
            if item_note:
                ingredient = ingredient.replace(item_note, '').strip()

            # Append the parsed ingredient
            ingredients_list.append({
                'ingredient': ingredient,
                'quantity': quantity,
                'unit': unit,
                'item_note': item_note
            })

        return ingredients_list
    
def get_steps(soup):
    # Find the method steps container
    steps_list = soup.find('ul', class_='method-steps__list')
    if not steps_list:
        print("Steps list not found!")
        return

    # Parse each step
    steps = []
    step_counter = 1
    for item in steps_list.find_all('li', class_='method-steps__list-item'):
        step_heading = f"Step {step_counter}: "
        step_text = item.find('div', class_='editor-content').get_text(strip=True)
        steps.append(step_heading + step_text)
        step_counter += 1
    
    return steps


def get_nutrition_data(soup):
    """
    Extract nutrition data from a BeautifulSoup object.
    Returns a dictionary of nutrition information.
    """
    nutrition_dict = {}
    
    # Locate the nutrition section
    nutrition_list = soup.find('ul', class_='nutrition-list')
    if not nutrition_list:
        return nutrition_dict  # Return empty if no nutrition data is found
    
    # Iterate through each nutrition item
    for item in nutrition_list.find_all('li', class_='nutrition-list__item'):
        key_element = item.find('span', class_='fw-600 mr-1')  # Bold text for the nutrient name
        if not key_element:
            continue
        key = key_element.get_text(strip=True)
        
        # Extract the value following the nutrient name
        value = item.get_text(strip=True).replace(key, '')  # Remove the key part
        nutrition_dict[key] = value
    
    return nutrition_dict

In [23]:
def fetch_recipe_data(url):
    """
    Parent function to fetch and organize recipe data.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    req = Request(url, headers=headers)
    
    # Fetch the page
    response = urlopen(req)
    html = response.read().decode('utf-8')
    
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Fetch data from child functions
    recipe_data = {
        'name': get_recipe_name(soup),
        'ingredients': get_ingredients(soup),
        'steps': get_steps(soup),
        'nutrition': get_nutrition_data(soup),
        'link': url
    }
    
    return recipe_data

In [None]:
def main_scrap():
    main_url="https://www.bbcgoodfood.com/recipes/collection/500-calorie-meals-recipes"
    return

In [24]:
test = fetch_recipe_data('https://www.bbcgoodfood.com/recipes/chicken-satay-salad')

print(f'Name: {test['name']}')

print(f'\nLink: {test['link']}')

print("\nIngredients:")
for ing in test['ingredients']:
    print(ing)
    
print("\nNutrition Data:")
for key, value in test['nutrition'].items():
    print(f"{key}: {value}")
    
print("\nSteps:")
for steps in test['steps']:
    print(steps)

Name: Chicken satay salad

Link: https://www.bbcgoodfood.com/recipes/chicken-satay-salad

Ingredients:
{'ingredient': 'tbsptamari', 'quantity': '1', 'unit': None, 'item_note': None}
{'ingredient': 'tspmedium curry powder', 'quantity': '1', 'unit': None, 'item_note': None}
{'ingredient': '¼ tspground cumin', 'quantity': None, 'unit': None, 'item_note': None}
{'ingredient': 'clove', 'quantity': '1', 'unit': 'garlic', 'item_note': 'finely grated'}
{'ingredient': 'tspclear honey', 'quantity': '1', 'unit': None, 'item_note': None}
{'ingredient': 'chicken breast fillets', 'quantity': '2', 'unit': 'skinless', 'item_note': '(or use turkey breast)'}
{'ingredient': 'tbspcrunchy peanut butter', 'quantity': '1', 'unit': None, 'item_note': '(choose a sugar-free version with no palm oil, if possible)'}
{'ingredient': 'tbspsweet chilli sauce', 'quantity': '1', 'unit': None, 'item_note': None}
{'ingredient': 'tbsplime juice', 'quantity': '1', 'unit': None, 'item_note': None}
{'ingredient': 'sunflower 