In [None]:
import os
import random
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class AllRecipesCrawler:
    """Download allrecipes.com recipe pages and store their raw HTML on disk.

    The crawler starts from a list of seed recipe URLs, saves them, follows
    the recipe links found on saved pages and, when too few candidates are
    found, also tries ID-only recipe URLs picked at random. It stops once
    ``max_recipes`` pages have been saved or the candidate queue is empty.

    Attributes are plain instance fields:
        output_dir: directory that receives the ``.html`` files.
        session: shared ``requests.Session`` carrying browser-like headers.
        downloaded_recipes: set of URLs that were requested and saved.
        recipe_count: number of pages saved so far.
        max_recipes: target number of pages to save.
    """

    # Accepted recipe URL shape: numeric ID, then a slug, then a trailing slash.
    _RECIPE_URL_PATTERN = re.compile(
        r'https://www\.allrecipes\.com/recipe/\d+/.+/$'
    )
    # Responses at or below this many characters are not saved.
    _MIN_CONTENT_LENGTH = 5000
    # Link expansion stops once this fraction of max_recipes has been saved.
    _EXPANSION_CUTOFF = 0.8
    # Upper bound on new links queued from a single saved page.
    _MAX_LINKS_PER_PAGE = 5

    def __init__(self, output_dir="data"):
        """Create the HTTP session and make sure ``output_dir`` exists.

        Args:
            output_dir: directory where downloaded HTML files are written;
                created if missing.
        """
        self.output_dir = output_dir
        self.session = requests.Session()

        # Headers that make the requests look like they come from a browser.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        os.makedirs(output_dir, exist_ok=True)

        self.downloaded_recipes = set()
        self.recipe_count = 0
        self.max_recipes = 100

    def get_page_content(self, url, retries=1):
        """Fetch ``url`` and return the response body, or ``None`` on failure.

        A random 1-3 second pause precedes every request, and a further
        random 2-5 second pause separates failed attempts.

        Args:
            url: address to request.
            retries: total number of attempts; values below 1 perform no
                request at all.

        Returns:
            The response text, or ``None`` if every attempt failed (or no
            attempt was made).
        """
        for attempt in range(retries):
            try:
                time.sleep(random.uniform(1, 3))
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.text
            except Exception as e:
                # Deliberately broad: fetching is best-effort, and a single
                # bad page must not abort the whole crawl.
                print(f"Error en intento {attempt + 1} para {url}: {e}")
                if attempt < retries - 1:
                    time.sleep(random.uniform(2, 5))

        # Every attempt failed, or retries < 1 so none was made.
        return None

    def extract_recipe_links(self, html_content, base_url):
        """Collect not-yet-downloaded recipe URLs linked from a page.

        Args:
            html_content: HTML text of the page to scan.
            base_url: URL of that page, used to resolve relative links.

        Returns:
            A list of unique absolute recipe URLs, in the order they first
            appear in the document, excluding any URL already present in
            ``self.downloaded_recipes``.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # A dict is used as an ordered set so the result is deterministic.
        recipe_links = {}

        for link in soup.find_all('a', href=True):
            href = link['href']
            if '/recipe/' not in href:
                continue

            # Strip the query string and fragment before resolving.
            clean_url = href.split('?')[0].split('#')[0]
            full_url = urljoin(base_url, clean_url)

            if (self._RECIPE_URL_PATTERN.match(full_url)
                    and full_url not in self.downloaded_recipes):
                recipe_links[full_url] = None

        return list(recipe_links)

    def save_recipe_html(self, url, html_content):
        """Write ``html_content`` to a file inside ``self.output_dir``.

        The file is named ``<id>_<slug>.html`` when both parts can be read
        from ``url``; otherwise ``recipe_<recipe_count + 1>.html`` is used.

        Args:
            url: URL the content was downloaded from.
            html_content: text to write (UTF-8 encoded).

        Returns:
            The path of the written file.
        """
        recipe_id = re.search(r'/recipe/(\d+)/', url)
        recipe_name = re.search(r'/recipe/\d+/([^/]+)/', url)

        if recipe_id and recipe_name:
            filename = f"{recipe_id.group(1)}_{recipe_name.group(1)}.html"
        else:
            filename = f"recipe_{self.recipe_count + 1}.html"

        filepath = os.path.join(self.output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html_content)

        return filepath

    def generate_recipe_urls(self, start_id=200000, end_id=300000, sample_size=200):
        """Build ID-only recipe URLs from random distinct IDs.

        Args:
            start_id: first ID of the range (inclusive).
            end_id: end of the range (exclusive).
            sample_size: number of URLs wanted; clamped to the number of IDs
                available in the range so an oversized request yields every
                ID once instead of raising ``ValueError``.

        Returns:
            A list of ``https://www.allrecipes.com/recipe/<id>/`` URLs.
        """
        print(" Generando URLs candidatas...")

        id_range = range(start_id, end_id)
        count = max(0, min(sample_size, len(id_range)))
        recipe_ids = random.sample(id_range, count)

        return [
            f"https://www.allrecipes.com/recipe/{recipe_id}/"
            for recipe_id in recipe_ids
        ]

    def crawl_from_working_recipes(self, seed_urls, cached_pages=None):
        """Gather recipe links found on the given seed pages.

        Args:
            seed_urls: URLs of pages to scan for links.
            cached_pages: optional mapping of URL to already-downloaded HTML.
                Pages present here are not requested again; any other URL is
                fetched with ``get_page_content``.

        Returns:
            A list with the links found on each page, concatenated in seed
            order. Links shared by several pages appear more than once.
        """
        cached_pages = cached_pages or {}
        discovered_urls = []

        for seed_url in seed_urls:
            print(f"🔍 Buscando enlaces desde: {seed_url}")
            content = cached_pages.get(seed_url) or self.get_page_content(seed_url)

            if content:
                links = self.extract_recipe_links(content, seed_url)
                discovered_urls.extend(links)
                print(f"   Encontrados {len(links)} enlaces nuevos")

        return discovered_urls

    def test_url_batch(self, urls, batch_size=10):
        """Request URLs in batches and return those that look like recipes.

        A URL counts as working when it returns content containing the word
        "recipe" (case-insensitive). Nothing is saved. Processing stops early
        if ``self.recipe_count`` has already reached ``self.max_recipes``.

        Args:
            urls: candidate URLs to probe.
            batch_size: number of URLs reported per progress message.

        Returns:
            The list of URLs that passed the check.
        """
        working_urls = []

        for i in range(0, len(urls), batch_size):
            batch = urls[i:i+batch_size]
            print(f"🧪 Probando lote {i//batch_size + 1} ({len(batch)} URLs)...")

            for url in batch:
                if self.recipe_count >= self.max_recipes:
                    break

                content = self.get_page_content(url)
                if content and "recipe" in content.lower():
                    working_urls.append(url)
                    print(f"   ✅ URL válida encontrada: {url}")

            if self.recipe_count >= self.max_recipes:
                break

        return working_urls

    def crawl_recipes(self, seed_urls):
        """Run the whole crawl and return the number of saved recipes.

        Steps: save the seed pages, collect the recipe links they contain,
        top up the candidate list with random ID-only URLs when fewer than
        ``2 * max_recipes`` candidates were found, then download candidates
        in random order until ``max_recipes`` pages are saved. While fewer
        than 80% of the target are saved, up to 5 new links from each saved
        page are appended to the queue.

        Args:
            seed_urls: recipe URLs to download first and to expand from.

        Returns:
            ``self.recipe_count`` after the crawl.
        """
        print(f"🚀 Iniciando crawling para obtener {self.max_recipes} recetas...")

        # 1. Download the seeds, keeping their HTML so link discovery does
        #    not have to request (and wait for) every seed a second time.
        print("📥 Descargando recetas semilla...")
        seed_pages = {}
        for seed_url in seed_urls:
            if seed_url in self.downloaded_recipes:
                # A repeated seed would otherwise be saved and counted twice.
                continue
            content = self.get_page_content(seed_url)
            if content:
                self.save_recipe_html(seed_url, content)
                self.downloaded_recipes.add(seed_url)
                self.recipe_count += 1
                seed_pages[seed_url] = content
                print(f"✅ Guardado: {seed_url.split('/')[-2]} ({self.recipe_count}/{self.max_recipes})")

        # 2. Collect links from the seed pages; de-duplicate while keeping
        #    order, since several seeds may link to the same recipe.
        discovered_urls = self.crawl_from_working_recipes(
            seed_urls, cached_pages=seed_pages
        )
        candidate_urls = list(dict.fromkeys(discovered_urls))

        # 3. Top up with ID-only URLs when discovery found too few.
        if len(candidate_urls) < self.max_recipes * 2:
            candidate_urls.extend(self.generate_recipe_urls(sample_size=300))

        # 4. Try candidates in random order and save the ones that load.
        print(f"📋 Total de URLs candidatas: {len(candidate_urls)}")
        random.shuffle(candidate_urls)

        queue = deque(candidate_urls)
        queued = set(candidate_urls)  # every URL ever queued, for O(1) checks

        # NOTE(review): de-duplication is by requested URL; if the server
        # redirects ID-only URLs to a slug URL the same recipe could be saved
        # twice - confirm against real responses.
        while queue and self.recipe_count < self.max_recipes:
            url = queue.popleft()
            if url in self.downloaded_recipes:
                continue

            content = self.get_page_content(url)
            if not content or len(content) <= self._MIN_CONTENT_LENGTH:
                continue

            self.save_recipe_html(url, content)
            self.downloaded_recipes.add(url)
            self.recipe_count += 1
            print(f"✅ Guardado: {url.split('/')[-2]} ({self.recipe_count}/{self.max_recipes})")

            # Keep expanding the queue only during the early part of the run.
            if self.recipe_count < self.max_recipes * self._EXPANSION_CUTOFF:
                new_links = self.extract_recipe_links(content, url)
                for new_link in new_links[:self._MAX_LINKS_PER_PAGE]:
                    if new_link not in queued:
                        queued.add(new_link)
                        queue.append(new_link)

        print(f"🎉 Crawling completado! {self.recipe_count} recetas descargadas en '{self.output_dir}'")
        return self.recipe_count

# Run the crawler when this file is executed as a script
if __name__ == "__main__":
    # Seed recipe pages used as the starting point of the crawl
    seed_urls = [
        "https://www.allrecipes.com/recipe/212965/crab-omelet/",
        "https://www.allrecipes.com/recipe/262439/sinigang-na-bangus-filipino-milkfish-in-tamarind-broth/",
        "https://www.allrecipes.com/recipe/212994/pinakbet/",
        "https://www.allrecipes.com/recipe/216032/apple-cider-stew/",
        "https://www.allrecipes.com/recipe/283947/michigan-hot-dogs/",
        "https://www.allrecipes.com/recipe/246237/easy-no-measure-smores-bars/",
        "https://www.allrecipes.com/recipe/216693/reuben-pierogie-casserole/",
        "https://www.allrecipes.com/recipe/18325/sauerkraut-casserole/",
    ]

    crawler = AllRecipesCrawler(output_dir="data")
    total_downloaded = crawler.crawl_recipes(seed_urls)

    # Final summary, emitted as three lines after a leading blank line
    print(
        "\n📊 Resumen final:",
        f"   Total descargado: {total_downloaded} recetas",
        "   Directorio: data/",
        sep="\n",
    )

🚀 Iniciando crawling para obtener 100 recetas...
📥 Descargando recetas semilla...
✅ Guardado: crab-omelet (1/100)
✅ Guardado: sinigang-na-bangus-filipino-milkfish-in-tamarind-broth (2/100)
✅ Guardado: pinakbet (3/100)
✅ Guardado: apple-cider-stew (4/100)
✅ Guardado: michigan-hot-dogs (5/100)
✅ Guardado: easy-no-measure-smores-bars (6/100)
✅ Guardado: reuben-pierogie-casserole (7/100)
✅ Guardado: sauerkraut-casserole (8/100)
🔍 Buscando enlaces desde: https://www.allrecipes.com/recipe/212965/crab-omelet/
   Encontrados 14 enlaces nuevos
🔍 Buscando enlaces desde: https://www.allrecipes.com/recipe/262439/sinigang-na-bangus-filipino-milkfish-in-tamarind-broth/
   Encontrados 14 enlaces nuevos
🔍 Buscando enlaces desde: https://www.allrecipes.com/recipe/212994/pinakbet/
   Encontrados 14 enlaces nuevos
🔍 Buscando enlaces desde: https://www.allrecipes.com/recipe/216032/apple-cider-stew/
   Encontrados 16 enlaces nuevos
🔍 Buscando enlaces desde: https://www.allrecipes.com/recipe/283947/michigan