# <center>**Ejercicio en clase: Web Scraping**</center>
# **Autor:** David Calahorrano

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL raíz del sitio web
root_url = "https://www.allrecipes.com/"

# Función para obtener categorías
def get_categories(root_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(root_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        category_links = soup.find_all("a", href=True)
        categories = {
            link.text.strip(): link["href"]
            for link in category_links
            if "/recipes/" in link["href"] and link.text.strip()
        }
        return categories
    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {root_url}: {e}")
        return {}

# Función para obtener recetas de una categoría
def get_recipes_from_category(category_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(category_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        recipe_links = soup.find_all("a", href=True)
        recipes = list(set(
            link["href"]
            for link in recipe_links
            if "/recipe/" in link["href"]
        ))
        return recipes
    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {category_url}: {e}")
        return []

# Función para obtener detalles de una receta
def get_recipe_details(recipe_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(recipe_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        recipe_id = recipe_url.split("/")[-2]
        title = soup.find("meta", {"property": "og:title"})
        title = title["content"] if title else "Sin título"

        ingredients_section = soup.find_all("li", class_="ingredients-item")
        ingredients = [item.get_text(strip=True) for item in ingredients_section] if ingredients_section else []

        instructions_section = soup.find_all("li", class_="instructions-section-item")
        instructions = [step.get_text(strip=True) for step in instructions_section] if instructions_section else []

        prep_time = soup.find("div", class_="recipe-meta-item-body")
        prep_time = prep_time.get_text(strip=True) if prep_time else "No disponible"

        return {
            "id": recipe_id,
            "title": title,
            "ingredients": ingredients,
            "instructions": instructions,
            "prep_time": prep_time,
            "url": recipe_url
        }
    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {recipe_url}: {e}")
        return None

# Obtener todas las categorías
categories = get_categories(root_url)

# Lista para almacenar los datos
all_recipes = []
max_recipes = 500

# Iterar sobre las categorías
for category_name, category_url in categories.items():
    print(f"Procesando categoría: {category_name}")
    recipes = get_recipes_from_category(category_url)

    for recipe_url in recipes:
        recipe_details = get_recipe_details(recipe_url)
        if recipe_details:
            all_recipes.append(recipe_details)

        # Detener si alcanzamos el límite
        if len(all_recipes) >= max_recipes:
            break
    if len(all_recipes) >= max_recipes:
        break

# Crear un DataFrame y guardarlo en CSV
df = pd.DataFrame(all_recipes)
df.to_csv("recipes_corpus.csv", index=False, encoding="utf-8")
print("Corpus guardado en 'recipes_corpus.csv'.")

Procesando categoría: Dinners
Procesando categoría: 5-Ingredient Dinners
Procesando categoría: One-Pot Meals
Procesando categoría: Quick & Easy
Procesando categoría: 30-Minute Meals
Procesando categoría: Family Dinners
Procesando categoría: Soups, Stews & Chili
Procesando categoría: Comfort Food
Procesando categoría: Main Dishes
Procesando categoría: Sheet Pan Dinners
Procesando categoría: View All
Procesando categoría: Breakfast & Brunch
Procesando categoría: Lunch
Procesando categoría: Healthy
Procesando categoría: Appetizers & Snacks
Procesando categoría: Salads
Corpus guardado en 'recipes_corpus.csv'.


In [2]:
import requests
from bs4 import BeautifulSoup

# URL raíz del sitio web
root_url = "https://www.allrecipes.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Función para obtener categorías y sus URLs
def get_categories(root_url):
    try:
        response = requests.get(root_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Buscar enlaces relacionados con categorías
        category_links = soup.find_all("a", href=True)
        categories = {}

        for link in category_links:
            category_name = link.text.strip()
            category_url = link["href"]

            if "/recipes/" in category_url and category_name:
                categories[category_name] = category_url

        if not categories:
            print("No se encontraron categorías en la página raíz.")
        return categories

    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {root_url}: {e}")
        return {}

# Función para obtener recetas de una categoría
def get_recipes_from_category(category_url):
    try:
        response = requests.get(category_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Buscar enlaces de recetas
        recipe_links = soup.find_all("a", href=True)
        recipes = set()

        for link in recipe_links:
            recipe_url = link["href"]
            if "/recipe/" in recipe_url:
                recipes.add(recipe_url)

        if not recipes:
            print(f"No se encontraron recetas en la categoría: {category_url}")
        return recipes

    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {category_url}: {e}")
        return set()

# Construir el corpus con recetas y un límite de 500
def build_corpus(categories, max_recipes=500):
    all_recipes = set()
    recipe_corpus = []
    recipe_id = 1

    for category_name, category_url in categories.items():
        print(f"Extrayendo recetas de la categoría: {category_name}")
        recipes = get_recipes_from_category(category_url)

        for recipe_url in recipes:
            if len(all_recipes) >= max_recipes:
                break
            if recipe_url not in all_recipes:
                all_recipes.add(recipe_url)

                # Obtener nombre de la receta (manejando posibles errores)
                recipe_name = recipe_url.split("/")[-2].replace("-", " ").capitalize()
                if not recipe_name:
                    recipe_name = "Receta sin nombre"

                # Agregar al corpus
                recipe_corpus.append({
                    "id": recipe_id,
                    "name": recipe_name,
                    "url": recipe_url
                })
                recipe_id += 1

        if len(all_recipes) >= max_recipes:
            break

    return recipe_corpus

# Ejecución del código
if __name__ == "__main__":
    print("Obteniendo categorías...")
    categories = get_categories(root_url)

    if categories:
        print(f"Categorías encontradas: {len(categories)}")
        for category, url in categories.items():
            print(f"{category}: {url}")

        print("\nConstruyendo el corpus de recetas...")
        corpus = build_corpus(categories)
        print(f"\nTotal de recetas en el corpus: {len(corpus)}")

        # Mostrar las primeras 10 recetas como ejemplo
        for recipe in corpus[:10]:
            print(f"ID: {recipe['id']}, Nombre: {recipe['name']}, URL: {recipe['url']}")
    else:
        print("No se encontraron categorías.")

Obteniendo categorías...
Categorías encontradas: 56
Dinners: https://www.allrecipes.com/recipes/17562/dinner/
5-Ingredient Dinners: https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/
One-Pot Meals: https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/
Quick & Easy: https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/
30-Minute Meals: https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/
Family Dinners: https://www.allrecipes.com/recipes/17889/everyday-cooking/family-friendly/family-dinners/
Soups, Stews & Chili: https://www.allrecipes.com/recipes/94/soups-stews-and-chili/
Comfort Food: https://www.allrecipes.com/recipes/16099/everyday-cooking/comfort-food/
Main Dishes: https://www.allrecipes.com/recipes/80/main-dish/
Sheet Pan Dinners: https://www.allrecipes.com/recipes/22992/everyday-cooking/sheet-pan-dinners/
View All: https://www.allrecipes.com/recipes/85/holid

In [3]:
import csv

# Función para guardar las recetas en un archivo CSV
def save_recipes_to_csv(corpus, filename="recipes.csv"):
    try:
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=["id", "name", "url"])
            writer.writeheader()  # Escribir los encabezados del CSV
            writer.writerows(corpus)  # Escribir las recetas en el archivo
        print(f"Recetas guardadas exitosamente en {filename}.")
    except Exception as e:
        print(f"Error al guardar el archivo CSV: {e}")

In [4]:
# Guardar las recetas en el archivo CSV
save_recipes_to_csv(corpus)

Recetas guardadas exitosamente en recipes.csv.


In [5]:
# Función para cargar recetas desde un archivo CSV
def load_recipes_from_csv(filename="recipes.csv"):
    try:
        recipes = []
        with open(filename, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                recipes.append(row)
        print(f"Recetas cargadas desde {filename}. Total: {len(recipes)}")
        return recipes
    except Exception as e:
        print(f"Error al cargar el archivo CSV: {e}")
        return []

In [6]:
# Cargar recetas desde el archivo CSV
loaded_recipes = load_recipes_from_csv()
for recipe in loaded_recipes[:10]:  # Mostrar las primeras 10 recetas
    print(f"ID: {recipe['id']}, Nombre: {recipe['name']}, URL: {recipe['url']}")

Recetas cargadas desde recipes.csv. Total: 500
ID: 1, Nombre: Juicy roasted chicken, URL: https://www.allrecipes.com/recipe/83557/juicy-roasted-chicken/
ID: 2, Nombre: To die for fettuccine alfredo, URL: https://www.allrecipes.com/recipe/23431/to-die-for-fettuccine-alfredo/
ID: 3, Nombre: Juicy thanksgiving turkey, URL: https://www.allrecipes.com/recipe/166160/juicy-thanksgiving-turkey/
ID: 4, Nombre: Empanadas beef turnovers, URL: https://www.allrecipes.com/recipe/215231/empanadas-beef-turnovers/
ID: 5, Nombre: Cast iron pan seared steak oven finished, URL: https://www.allrecipes.com/recipe/262181/cast-iron-pan-seared-steak-oven-finished/
ID: 6, Nombre: Turkey in a smoker, URL: https://www.allrecipes.com/recipe/16984/turkey-in-a-smoker/
ID: 7, Nombre: Roasted pork loin, URL: https://www.allrecipes.com/recipe/21766/roasted-pork-loin/
ID: 8, Nombre: Chicken parmesan, URL: https://www.allrecipes.com/recipe/223042/chicken-parmesan/
ID: 9, Nombre: Salisbury steak, URL: https://www.allrecip

In [14]:
def get_recipe_details(recipe_url):
    try:
        response = requests.get(recipe_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Obtener el nombre de la receta (suponiendo que está en el título de la página)
        recipe_name = soup.find("h1", class_="headline").text.strip() if soup.find("h1", class_="headline") else "Receta sin nombre"

        # Obtener la descripción de la receta
        description = soup.find("div", class_="intro").text.strip() if soup.find("div", class_="intro") else "Descripción no disponible"

        # Obtener los ingredientes
        ingredients = []
        ingredient_list = soup.find_all("span", class_="ingredients-item-name")
        for ingredient in ingredient_list:
            ingredients.append(ingredient.text.strip())

        # Obtener las instrucciones de la receta
        instructions = []
        instruction_steps = soup.find_all("li", class_="subcontainer instructions-section-item")
        for step in instruction_steps:
            instructions.append(step.text.strip())

        # Obtener el tiempo de preparación (si está disponible)
        prep_time = soup.find("span", class_="prepTime__item--time").text.strip() if soup.find("span", class_="prepTime__item--time") else "Tiempo de preparación no disponible"

        # Retornar un diccionario con los detalles completos de la receta
        return {
            "name": recipe_name,
            "url": recipe_url,
            "description": description,
            "ingredients": ingredients,
            "instructions": instructions,
            "prep_time": prep_time
        }

    except requests.exceptions.RequestException as e:
        print(f"Error al conectar con {recipe_url}: {e}")
        return {}

In [15]:
def build_corpus(categories, max_recipes=500):
    all_recipes = set()
    recipe_corpus = []
    recipe_id = 1

    for category_name, category_url in categories.items():
        print(f"Extrayendo recetas de la categoría: {category_name}")
        recipes = get_recipes_from_category(category_url)

        for recipe_url in recipes:
            if len(all_recipes) >= max_recipes:
                break
            if recipe_url not in all_recipes:
                all_recipes.add(recipe_url)

                # Obtener los detalles de la receta
                recipe_details = get_recipe_details(recipe_url)
                if recipe_details:  # Asegúrate de que la receta tenga detalles
                    recipe_corpus.append({
                        "id": recipe_id,
                        "name": recipe_details["name"],
                        "url": recipe_details["url"],
                        "description": recipe_details["description"],
                        "ingredients": recipe_details["ingredients"],
                        "instructions": recipe_details["instructions"],
                        "prep_time": recipe_details["prep_time"],
                        "category": category_name
                    })
                    recipe_id += 1

        if len(all_recipes) >= max_recipes:
            break

    return recipe_corpus

In [16]:
import csv

def save_recipes_to_csv(corpus, filename="complete_recipes.csv"):
    try:
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            fieldnames = ["id", "name", "category", "url", "description", "ingredients", "instructions", "prep_time"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()  # Escribir los encabezados del CSV

            for recipe in corpus:
                # Convertir listas de ingredientes e instrucciones a un formato de texto legible
                recipe["ingredients"] = ", ".join(recipe["ingredients"])
                recipe["instructions"] = " | ".join(recipe["instructions"])
                writer.writerow(recipe)  # Escribir cada receta al archivo CSV
        print(f"Recetas completas guardadas exitosamente en {filename}.")
    except Exception as e:
        print(f"Error al guardar el archivo CSV: {e}")

In [17]:
from google.colab import files

# Descargar el archivo CSV
files.download("complete_recipes.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Función para cargar recetas completas desde un archivo CSV
def load_recipes_from_csv(filename="complete_recipes.csv"):
    try:
        recipes = []
        with open(filename, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                recipes.append(row)
        print(f"Recetas completas cargadas desde {filename}. Total: {len(recipes)}")
        return recipes
    except Exception as e:
        print(f"Error al cargar el archivo CSV: {e}")
        return []

In [11]:
from google.colab import files

# Después de guardar el archivo CSV
save_recipes_to_csv(corpus)

# Descargar el archivo CSV
files.download("complete_recipes.csv")

Recetas completas guardadas exitosamente en complete_recipes.csv.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
print(f"Total de recetas recopiladas: {len(corpus)}")
if len(corpus) > 0:
    save_recipes_to_csv(corpus)
else:
    print("No se han recopilado recetas.")

Total de recetas recopiladas: 500
Recetas completas guardadas exitosamente en complete_recipes.csv.


In [13]:
# Imprimir las primeras 5 recetas para verificar el contenido
print(corpus[:5])

[{'id': 1, 'name': 'Juicy roasted chicken', 'url': 'https://www.allrecipes.com/recipe/83557/juicy-roasted-chicken/'}, {'id': 2, 'name': 'To die for fettuccine alfredo', 'url': 'https://www.allrecipes.com/recipe/23431/to-die-for-fettuccine-alfredo/'}, {'id': 3, 'name': 'Juicy thanksgiving turkey', 'url': 'https://www.allrecipes.com/recipe/166160/juicy-thanksgiving-turkey/'}, {'id': 4, 'name': 'Empanadas beef turnovers', 'url': 'https://www.allrecipes.com/recipe/215231/empanadas-beef-turnovers/'}, {'id': 5, 'name': 'Cast iron pan seared steak oven finished', 'url': 'https://www.allrecipes.com/recipe/262181/cast-iron-pan-seared-steak-oven-finished/'}]
