In [1]:
#import libraries
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import csv
from urllib.request import Request
from urllib.error import URLError
from urllib.parse import urlparse
from urllib.parse import urljoin
import os
import requests

## Getting Article Data (Scraping)

In [2]:
def get_recipe_name(soup):
    name = soup.find('h1', class_='heading-1') 
    return name.text.strip() if name else "No recipe name found"


def get_ingredients(soup):
    ingredients_list = []

    # Regex pattern for capturing quantity and unit
    quantity_unit_pattern = re.compile(r"(\d+(?:\.\d+)?|½|¼|⅓|⅔)?\s*(\w+)?\s*(.*)")

    # Find all the list items in the ingredients list
    ingredient_items = soup.find_all('li', class_='ingredients-list__item')

    for item in ingredient_items:
        text = item.get_text(separator=" ", strip=True)
        quantity = None
        unit = None
        ingredient = None
        item_note = None

        # Match the quantity and unit
        match = quantity_unit_pattern.match(text)
        if match:
            quantity = match.group(1)  # Capture the numeric value or fraction
            unit = match.group(2)     # Capture the unit
            text = match.group(3).strip()  # Remaining text after quantity and unit

        # Extract item notes (if present)
        note_div = item.find('div', class_='ingredients-list__item-note')
        if note_div:
            item_note = note_div.get_text(strip=True)

        # Remove item note from the ingredient name if it exists
        if item_note and text.endswith(item_note):
            ingredient = text.replace(item_note, '').strip()
        else:
            ingredient = text

        # Append the parsed ingredient with the appropriate structure
        ingredients_list.append({
            'name': ingredient,  # Main ingredient name
            'quantity': quantity,      # Extracted quantity
            'unit': unit,              # Extracted unit
            'item_note': item_note     # Any additional notes
        })

    return ingredients_list




    
def get_steps(soup):
    # Find the method steps container
    steps_list = soup.find('ul', class_='method-steps__list')
    if not steps_list:
        print("Steps list not found!")
        return

    # Parse each step
    steps = []
    step_counter = 1
    for item in steps_list.find_all('li', class_='method-steps__list-item'):
        step_heading = f"Step {step_counter}: "
        step_text = item.find('div', class_='editor-content').get_text(strip=True)
        steps.append(step_heading + step_text)
        step_counter += 1
    
    return steps


def get_nutrition_data(soup):
    """
    Extract nutrition data from a BeautifulSoup object.
    Returns a dictionary of nutrition information.
    """
    nutrition_dict = {}
    
    # Locate the nutrition section
    nutrition_list = soup.find('ul', class_='nutrition-list')
    if not nutrition_list:
        return nutrition_dict  # Return empty if no nutrition data is found
    
    # Iterate through each nutrition item
    for item in nutrition_list.find_all('li', class_='nutrition-list__item'):
        key_element = item.find('span', class_='fw-600 mr-1')  # Bold text for the nutrient name
        if not key_element:
            continue
        key = key_element.get_text(strip=True)
        
        # Extract the value following the nutrient name
        value = item.get_text(strip=True).replace(key, '')  # Remove the key part
        nutrition_dict[key] = value
    
    return nutrition_dict

def get_serving_data(soup):
    serving = None
    # Find all <div> elements with the class "icon-with-text__children"
    dd_elements = soup.find_all('div', class_='icon-with-text__children')
    if len(dd_elements) >= 3:
        serving_text = dd_elements[2].text.strip()
        # Match the serving range (e.g., "4 – 6") or a single number
        serving_range_match = re.search(r'(\d+)\s*–\s*(\d+)', serving_text)
        if serving_range_match:
            serving = int(serving_range_match.group(1))
        else:
            serving = int(''.join(filter(str.isdigit, serving_text)))

    return serving

def get_image_url(soup):
    image_url = None
    media_div = soup.find('div', class_='post-header__image-container')
    if media_div:
        picture_tag = media_div.find('picture')
        if picture_tag:
            img_tag = picture_tag.find('img')
            if img_tag and 'src' in img_tag.attrs:
                image_url = img_tag['src']
    return image_url

In [3]:
#head of getting recipe
def fetch_recipe_data(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    req = Request(url, headers=headers)
    response = urlopen(req)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    
   
    name= get_recipe_name(soup),
    ingredient= get_ingredients(soup),
    steps= get_steps(soup),
    nutrition=get_nutrition_data(soup),
    link=url
    serve=get_serving_data(soup)
    image_url=get_image_url(soup)
    
    
    return name,ingredient,steps,nutrition,link,serve,image_url

## Going Trough Recomendation (Crawling)

In [4]:
# Extract recommended article URLs from a given article (crawl)
def get_recommended_recipes(url):
    response = urlopen(url)
    soup_obj = BeautifulSoup(response, 'html.parser')

    # Initialize the list to store the recommended recipe URLs
    recommended_recipes = []

    # Update the selector to find links with the class for recipes (adjust as per your HTML structure)
    for a_tag in soup_obj.find_all('a', class_='link d-block', href=True):
        recommended_url = a_tag['href']
        
        # Convert relative URLs to absolute URLs
        full_url = urljoin(url, recommended_url)
        recommended_recipes.append(full_url)
    
    return recommended_recipes

In [5]:
#get the new recipes from the recomended section
def process_recommended_recipes(recipe_url, processed_urls, recipe_data, max_depth, step, base_url):
    if step < max_depth:
        try:
            recommended_recipes = get_recommended_recipes(recipe_url)
            print(f"From {recipe_url}, getting {len(recommended_recipes)} recommended links at depth {step}.")

            for rec_url in recommended_recipes:
                full_rec_url = urljoin(base_url, rec_url)
                if full_rec_url in processed_urls:
                    continue

                processed_urls.add(full_rec_url)
                print(f"Processing recommended article: {full_rec_url}")
                rectitle, recingredients, recstep, recnutrition, reclink,reserve,reimage_url = fetch_recipe_data(full_rec_url)
                recipe_data.append((rectitle, recingredients, recstep, recnutrition, reclink,reserve,reimage_url))

                process_recommended_recipes(full_rec_url, processed_urls, recipe_data, max_depth, step + 1, base_url)
        except Exception as e:
            print(f"Error processing {recipe_url}: {e}")
    else:
        return

## Implementation

In [6]:
# #main pipeline
# # def scrape_main_page(max_depth=10, max_recipes=5):
# def scrape_main_page(max_depth=0, max_recipes=1):
#     start_url = 'https://www.bbcgoodfood.com/recipes/collection/all-low-calorie'
#     base_url = 'https://www.bbcgoodfood.com'
#     response = urlopen(start_url)
#     soup = BeautifulSoup(response, 'html.parser')

#     urls = []  # to hold each collection
#     dynamic_list_div = soup.find('div', class_='dynamic-list dynamic-list--multi-column')
#     if dynamic_list_div:
#         list_items = dynamic_list_div.find_all('li', class_='dynamic-list__list-item')
#         for item in list_items:
#             link_tag = item.find('a', href=True)
#             if link_tag:
#                 urls.append(urljoin(base_url, link_tag['href']))

#     recipes_data = []
#     processed_urls = set()
#     recipe_count = 0  # Counter to limit to 5 recipes

#     for url in urls:  # loop through collections
#         response = urlopen(url)
#         soup = BeautifulSoup(response, 'html.parser')

#         recipes_section = soup.find('div', class_='post__content')
#         if recipes_section:
#             recipes = recipes_section.find_all('article', class_='card text-align-left card--horizontal card--inline')
#             print(f"Scraping the main page: Found {len(recipes)} recipes.")
            
#             for recipe in recipes:
#                 if recipe_count >= max_recipes:  # Stop once we have 5 recipes
#                     break

#                 title_tag = recipe.find('div', class_='card__section card__content')
#                 recipe_url = title_tag.find('a')['href']
#                 full_recipe_url = urljoin(base_url, recipe_url)

#                 if full_recipe_url in processed_urls:
#                     continue
                
#                 processed_urls.add(full_recipe_url)
#                 print(f"Processing article: {full_recipe_url}")
#                 title, ingredients, step, nutrition, link, serve, image_url = fetch_recipe_data(full_recipe_url)
#                 recipes_data.append((title, ingredients, step, nutrition, link, serve, image_url))
                
#                 process_recommended_recipes(full_recipe_url, processed_urls, recipes_data, max_depth, 0, base_url)

#                 recipe_count += 1
#             if recipe_count >= max_recipes:
#                 break

#     return recipes_data


#main pipeline
def scrape_main_page(max_depth=10):
    start_url = 'https://www.bbcgoodfood.com/recipes/collection/all-low-calorie'
    base_url = 'https://www.bbcgoodfood.com'
    response = urlopen(start_url)
    soup = BeautifulSoup(response, 'html.parser')

    urls = []#to hold each collection
    dynamic_list_div = soup.find('div', class_='dynamic-list dynamic-list--multi-column')
    if dynamic_list_div:
        list_items = dynamic_list_div.find_all('li', class_='dynamic-list__list-item')
        for item in list_items:
            link_tag = item.find('a', href=True)
            if link_tag:
                urls.append(urljoin(base_url, link_tag['href']))

    recipes_data = []
    processed_urls = set()

    for url in urls:#loop trough collections
        response = urlopen(url)
        soup = BeautifulSoup(response, 'html.parser')

        recipes_section = soup.find('div', class_='post__content')
        if recipes_section:
            recipes = recipes_section.find_all('article', class_='card text-align-left card--horizontal card--inline')
            print(f"Scraping the main page: Found {len(recipes)} recipes.")
            
            for recipe in recipes:
                title_tag = recipe.find('div', class_='card__section card__content')
                recipe_url = title_tag.find('a')['href']
                full_recipe_url = urljoin(base_url, recipe_url)

                if full_recipe_url in processed_urls:
                    continue
                
                processed_urls.add(full_recipe_url)
                print(f"Processing article: {full_recipe_url}")
                title, ingredients, step, nutrition, link, serve, image_url = fetch_recipe_data(full_recipe_url)
                recipes_data.append((title, ingredients, step, nutrition, link, serve, image_url))
                
                process_recommended_recipes(full_recipe_url, processed_urls, recipes_data, max_depth, 0, base_url)

    return recipes_data

In [7]:
# Write the data to a CSV file
def write_to_csv(data):
    # Clear the file by opening it in write mode and immediately closing it
    with open('BBCGoodFoodScrap.csv', mode='w', newline='', encoding='utf-8') as file:
        file.truncate()  # Clear content, if any

    # Open again to write data
    with open('BBCGoodFoodScrap.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Include the 'News Link' column in the header
        writer.writerow(['title', 'ingredients', 'step', 'nutrition', 'link', 'serve','image_url'])
        for row in data:
            # Check if any attribute in the row is None or empty
            if all(row):
                writer.writerow(row)


In [8]:
# Main function
def main():
    article_data = scrape_main_page()
    write_to_csv(article_data)

main() #call the main

Scraping the main page: Found 24 recipes.
Processing article: https://www.bbcgoodfood.com/recipes/salsa-verde-baked-eggs
From https://www.bbcgoodfood.com/recipes/salsa-verde-baked-eggs, getting 4 recommended links at depth 0.
Processing recommended article: https://www.bbcgoodfood.com/recipes/smoky-tomato-gazpacho
From https://www.bbcgoodfood.com/recipes/smoky-tomato-gazpacho, getting 4 recommended links at depth 1.
Processing recommended article: https://www.buysubscriptions.com/print/good-food-magazine-subscription?promo=GFBS125&utm_medium=brandsite&utm_source=goodfood.com&utm_campaign=trial_GFBS125&utm_content=footer-widget&style=brand
Steps list not found!
From https://www.buysubscriptions.com/print/good-food-magazine-subscription?promo=GFBS125&utm_medium=brandsite&utm_source=goodfood.com&utm_campaign=trial_GFBS125&utm_content=footer-widget&style=brand, getting 0 recommended links at depth 2.
Processing recommended article: https://immediate.onelink.me/OQap?af_web_dp=https%3A%2F%2F