In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from webdriver_manager.firefox import GeckoDriverManager
import pandas as pd
import time
import random
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
class RecipeScraper:
    """Selenium scraper that walks the allrecipes.com ingredient index.

    Workflow (see ``main``): ``scrape_categories`` fills ``data_recetas`` with
    one row per ingredient, ``scrape_recipes`` replaces those rows with one row
    per recipe and feeds ``corpus``, and ``save_data`` writes both to disk.

    Attributes:
        max_retries: Number of attempts made by the ``find_*_safely`` helpers.
        timeout: Seconds given to each explicit wait.
        data_recetas: List of dict rows (ingredients first, recipes later).
        corpus: Preprocessed "title description" strings, one per recipe.
        recipe_count: Number of recipes collected so far.
    """

    def __init__(self, max_retries=3, timeout=10):
        """Store the settings, reset the collected data and start the browser."""
        self.max_retries = max_retries
        self.timeout = timeout
        self.data_recetas = []
        self.corpus = []  # Corpus of preprocessed titles and descriptions
        # State is initialised before the browser starts so the instance is
        # consistent even if the driver takes long to come up.
        self.recipe_count = 0
        self._stop_words = None  # lazily loaded by preprocess_text
        self.setup_driver()

    def setup_driver(self):
        """Create the Firefox driver and the explicit-wait helper bound to it."""
        options = webdriver.FirefoxOptions()
        # NOTE(review): these switches are the ones usually passed to Chromium;
        # confirm Firefox/geckodriver actually honours them.
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        self.driver = webdriver.Firefox(
            service=Service(GeckoDriverManager().install()),
            options=options
        )
        self.wait = WebDriverWait(self.driver, self.timeout)

    def random_delay(self, min_delay=1, max_delay=3):
        """Sleep for a random duration between ``min_delay`` and ``max_delay`` seconds."""
        time.sleep(random.uniform(min_delay, max_delay))

    def _retry_lookup(self, lookup):
        """Run ``lookup()`` up to ``max_retries`` times (at least once).

        Timeout / missing / stale element errors trigger a random pause and a
        new attempt; the error of the final attempt is re-raised unchanged.
        """
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                return lookup()
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                if attempt == attempts - 1:
                    raise  # bare raise keeps the original traceback
                self.random_delay()

    def find_element_safely(self, by, value, parent=None):
        """Locate a single element, retrying on transient Selenium errors.

        Args:
            by: Locator strategy (a ``By`` constant).
            value: Locator expression.
            parent: Optional element to search within. When omitted, the whole
                document is searched through the explicit wait.

        Returns:
            The located element.

        Raises:
            TimeoutException, NoSuchElementException or
            StaleElementReferenceException once every attempt has failed.
        """
        # Explicit None check: the parent's truthiness must not decide the path.
        if parent is not None:
            return self._retry_lookup(lambda: parent.find_element(by, value))
        return self._retry_lookup(
            lambda: self.wait.until(EC.presence_of_element_located((by, value)))
        )

    def find_elements_safely(self, by, value, parent=None):
        """Locate every matching element; same retry contract as ``find_element_safely``."""
        if parent is not None:
            return self._retry_lookup(lambda: parent.find_elements(by, value))
        return self._retry_lookup(
            lambda: self.wait.until(EC.presence_of_all_elements_located((by, value)))
        )

    def scrape_categories(self):
        """Collect the category, name and link of every listed ingredient.

        Visits the ingredient index and, for each of the 16 group positions it
        probes, appends one dict per ingredient to ``self.data_recetas``.
        Groups or list items that cannot be read are reported and skipped.
        """
        self.driver.get("https://www.allrecipes.com/ingredients-a-z-6740416")

        for i in range(1, 17):
            try:
                div_element = self.find_element_safely(
                    By.XPATH, f'/html/body/main/div[2]/div[{i}]'
                )
                categoria = self.find_element_safely(
                    By.TAG_NAME, 'h3', div_element
                ).text.strip()
                # Resolve the list relative to the group already located
                # instead of waiting on a second absolute XPath.
                ul_element = self.find_element_safely(By.XPATH, './ul', div_element)
                li_elements = self.find_elements_safely(By.TAG_NAME, 'li', ul_element)
            except Exception as e:
                print(f"No se pudo leer el grupo {i} de ingredientes: {type(e).__name__}")
                continue

            for li in li_elements:
                try:
                    a_element = self.find_element_safely(By.TAG_NAME, 'a', li)
                    ingrediente = a_element.text.strip()
                    link_ingrediente = a_element.get_attribute('href')
                except Exception as e:
                    print(f"Ingrediente omitido en '{categoria}': {type(e).__name__}")
                    continue

                if not link_ingrediente:
                    # Without a link there is nothing to visit later.
                    continue

                # Only the basic ingredient information is stored here.
                self.data_recetas.append({
                    'categoria': categoria,
                    'ingrediente': ingrediente,
                    'link_ingrediente': link_ingrediente
                })

    def preprocess_text(self, text):
        """Lower-case ``text``, strip non-letter characters and drop stop words.

        Returns the remaining tokens joined by single spaces; empty or ``None``
        input yields an empty string.
        """
        if not text:
            return ''

        # NOTE(review): tokenizer and stop words are Spanish while the scraped
        # site is allrecipes.com -- confirm this is the intended language.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            # Loaded once per instance instead of on every call.
            stop_words = set(stopwords.words('spanish'))
            self._stop_words = stop_words

        text = re.sub(r'[^a-záéíóúüñ\s]', '', text.lower())
        tokens = word_tokenize(text, language='spanish')
        return ' '.join(word for word in tokens if word not in stop_words)

    def extract_recipe_info(self, link_element, base_info):
        """Build the row for one recipe card and add its text to the corpus.

        Reads link and title from ``link_element``, then navigates the browser
        to the recipe page to read its description. Because of that
        navigation, elements located before the call become stale.

        Args:
            link_element: Anchor element of a recipe card.
            base_info: Ingredient row with ``categoria``, ``ingrediente`` and
                ``link_ingrediente``.

        Returns:
            The recipe dict, or ``None`` when the anchor is not a recipe card
            or the recipe page could not be processed.
        """
        try:
            link_receta = link_element.get_attribute('href')
            titulo_receta = self.find_element_safely(
                By.XPATH, './/div[2]/span/span', link_element
            ).text.strip()
        except Exception:
            # Anchors without the title markup are not recipe cards; callers
            # treat None as "skip", so this is expected rather than an error.
            return None

        try:
            preprocessed_title = self.preprocess_text(titulo_receta)

            self.driver.get(link_receta)
            self.random_delay()

            try:
                descripcion_receta = self.find_element_safely(
                    By.XPATH, "//*[@id='article-header--recipe_1-0']//p"
                ).text.strip()
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                # The lookup raises instead of returning None, so the
                # placeholder below has to be selected from here.
                descripcion_receta = ''

            preprocessed_description = self.preprocess_text(descripcion_receta)

            # The placeholder is kept out of the corpus on purpose.
            self.corpus.append(' '.join(
                part for part in (preprocessed_title, preprocessed_description) if part
            ))

            return {
                'categoria': base_info['categoria'],
                'ingrediente': base_info['ingrediente'],
                'link_ingrediente': base_info['link_ingrediente'],
                'link_receta': link_receta,
                'titulo_receta': titulo_receta,
                'descripcion_receta': descripcion_receta or "Descripción no disponible"
            }
        except Exception as e:
            print(f"No se pudo extraer la receta {link_receta}: {type(e).__name__}")
            return None

    def scrape_recipes(self, max_recipes=500):
        """Collect recipes for every stored ingredient, up to ``max_recipes``.

        ``extract_recipe_info`` navigates away from the listing, which
        invalidates every element located on it. After each visit the listing
        is therefore reopened and the links are located again, continuing at
        the next index. On return ``self.data_recetas`` holds the recipe rows
        instead of the ingredient rows.
        """
        container_xpaths = (
            '//*[@id="mntl-document-spotlight_1-0"]',
            '//*[@id="mntl-taxonomysc-article-list-group_1-0"]',
        )
        recipes_data = []

        for ingrediente_info in self.data_recetas:
            if self.recipe_count >= max_recipes:
                break

            try:
                self.driver.get(ingrediente_info['link_ingrediente'])
                self.random_delay()
                # Address after redirects; used to detect that a recipe visit
                # moved the browser away from the listing.
                listing_url = self.driver.current_url
            except Exception as e:
                print(f"No se pudo abrir {ingrediente_info.get('link_ingrediente')}: {type(e).__name__}")
                continue

            for xpath in container_xpaths:
                if self.recipe_count >= max_recipes:
                    break

                recipe_links = None  # None means "must be (re)located"
                total_links = None   # size of the first lookup, bounds the loop
                index = 0

                while self.recipe_count < max_recipes:
                    if total_links is not None and index >= total_links:
                        break

                    if recipe_links is None:
                        try:
                            if self.driver.current_url != listing_url:
                                self.driver.get(listing_url)
                                self.random_delay()
                            container = self.find_element_safely(By.XPATH, xpath)
                            recipe_links = self.find_elements_safely(By.TAG_NAME, 'a', container)
                        except Exception as e:
                            print(f"No se pudo leer el contenedor {xpath}: {type(e).__name__}")
                            break
                        if total_links is None:
                            total_links = len(recipe_links)

                    if index >= len(recipe_links):
                        break  # the reloaded page lists fewer links than before

                    link = recipe_links[index]
                    index += 1

                    recipe_info = self.extract_recipe_info(link, ingrediente_info)
                    if recipe_info:
                        recipes_data.append(recipe_info)
                        self.recipe_count += 1

                    try:
                        navigated_away = self.driver.current_url != listing_url
                    except Exception:
                        navigated_away = True
                    if navigated_away:
                        recipe_links = None  # previously located anchors are stale

        self.data_recetas = recipes_data  # ingredient rows are replaced by recipe rows

    def save_data(self, filename='ingredientes_recetas_completo.csv', corpus_filename='corpus.txt'):
        """Write the collected rows to a CSV file and the corpus to a text file.

        Any error is reported on stdout instead of being raised.
        """
        try:
            df = pd.DataFrame(self.data_recetas)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Se extrajeron {self.recipe_count} recetas en total")

            # One corpus entry per line.
            with open(corpus_filename, 'w', encoding='utf-8') as f:
                for item in self.corpus:
                    f.write("%s\n" % item)
            print(f"Corpus guardado en {corpus_filename}")
        except Exception as e:
            print(f"Error guardando datos: {str(e)}")

    def cleanup(self):
        """Close the browser; a failure to do so is reported, not raised."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return  # the browser was never started
        try:
            driver.quit()
        except Exception as e:
            print(f"Error cerrando el navegador: {str(e)}")


In [3]:
    def find_element_safely(self, by, value, parent=None):
        """Locate a single element, retrying on transient Selenium errors.

        Args:
            by: Locator strategy (a ``By`` constant).
            value: Locator expression.
            parent: Optional element to search within. When omitted, the whole
                document is searched through the explicit wait.

        Returns:
            The located element.

        Raises:
            TimeoutException, NoSuchElementException or
            StaleElementReferenceException once every attempt has failed.
        """
        # Always try at least once, even when max_retries is 0 or negative;
        # otherwise the loop is skipped and None is returned silently.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                # Explicit None check: the parent's truthiness must not decide
                # which lookup path is taken.
                if parent is not None:
                    return parent.find_element(by, value)
                return self.wait.until(EC.presence_of_element_located((by, value)))
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                if attempt == attempts - 1:
                    raise  # bare raise keeps the original traceback
                self.random_delay()

    def find_elements_safely(self, by, value, parent=None):
        """Locate every matching element, retrying on transient Selenium errors.

        Args:
            by: Locator strategy (a ``By`` constant).
            value: Locator expression.
            parent: Optional element to search within. When omitted, the whole
                document is searched through the explicit wait.

        Returns:
            The list of located elements.

        Raises:
            TimeoutException, NoSuchElementException or
            StaleElementReferenceException once every attempt has failed.
        """
        # Always try at least once, even when max_retries is 0 or negative;
        # otherwise the loop is skipped and None is returned silently.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                # Explicit None check: the parent's truthiness must not decide
                # which lookup path is taken.
                if parent is not None:
                    return parent.find_elements(by, value)
                return self.wait.until(EC.presence_of_all_elements_located((by, value)))
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                if attempt == attempts - 1:
                    raise  # bare raise keeps the original traceback
                self.random_delay()


In [4]:
    def scrape_categories(self):
        """Collect the category, name and link of every listed ingredient.

        Visits the ingredient index and, for each of the 16 group positions it
        probes, appends one dict per ingredient to ``self.data_recetas``.
        Groups or list items that cannot be read are reported and skipped.
        """
        self.driver.get("https://www.allrecipes.com/ingredients-a-z-6740416")

        for i in range(1, 17):
            try:
                div_element = self.find_element_safely(
                    By.XPATH, f'/html/body/main/div[2]/div[{i}]'
                )
                categoria = self.find_element_safely(
                    By.TAG_NAME, 'h3', div_element
                ).text.strip()
                # Resolve the list relative to the group already located
                # instead of waiting on a second absolute XPath.
                ul_element = self.find_element_safely(By.XPATH, './ul', div_element)
                li_elements = self.find_elements_safely(By.TAG_NAME, 'li', ul_element)
            except Exception as e:
                # Still best-effort, but a skipped group is now visible.
                print(f"No se pudo leer el grupo {i} de ingredientes: {type(e).__name__}")
                continue

            for li in li_elements:
                try:
                    a_element = self.find_element_safely(By.TAG_NAME, 'a', li)
                    ingrediente = a_element.text.strip()
                    link_ingrediente = a_element.get_attribute('href')
                except Exception as e:
                    print(f"Ingrediente omitido en '{categoria}': {type(e).__name__}")
                    continue

                if not link_ingrediente:
                    # Without a link there is nothing to visit later.
                    continue

                # Only the basic ingredient information is stored here.
                self.data_recetas.append({
                    'categoria': categoria,
                    'ingrediente': ingrediente,
                    'link_ingrediente': link_ingrediente
                })


In [5]:
    def preprocess_text(self, text):
        """Lower-case ``text``, strip non-letter characters and drop stop words.

        Args:
            text: Raw string; ``None`` or an empty string is accepted.

        Returns:
            The remaining tokens joined by single spaces ('' for empty input).
        """
        if not text:
            return ''

        # NOTE(review): tokenizer and stop words are Spanish while the scraped
        # site is allrecipes.com -- confirm this is the intended language.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            # Loaded once per instance instead of on every call.
            stop_words = set(stopwords.words('spanish'))
            self._stop_words = stop_words

        text = text.lower()
        # Keep only lower-case letters (including Spanish accents) and whitespace.
        text = re.sub(r'[^a-záéíóúüñ\s]', '', text)
        tokens = word_tokenize(text, language='spanish')
        return ' '.join(word for word in tokens if word not in stop_words)


In [6]:
    def extract_recipe_info(self, link_element, base_info):
        """Build the row for one recipe card and add its text to the corpus.

        Reads link and title from ``link_element``, then navigates the browser
        to the recipe page to read its description. Because of that
        navigation, elements located before the call become stale.

        Args:
            link_element: Anchor element of a recipe card.
            base_info: Ingredient row with ``categoria``, ``ingrediente`` and
                ``link_ingrediente``.

        Returns:
            The recipe dict, or ``None`` when the anchor is not a recipe card
            or the recipe page could not be processed.
        """
        try:
            link_receta = link_element.get_attribute('href')
            titulo_receta = self.find_element_safely(
                By.XPATH, './/div[2]/span/span', link_element
            ).text.strip()
        except Exception:
            # Anchors without the title markup are not recipe cards; callers
            # treat None as "skip", so this is expected rather than an error.
            return None

        try:
            # Preprocess the title
            preprocessed_title = self.preprocess_text(titulo_receta)

            # Open the recipe page to read its description
            self.driver.get(link_receta)
            self.random_delay()

            try:
                descripcion_receta = self.find_element_safely(
                    By.XPATH, "//*[@id='article-header--recipe_1-0']//p"
                ).text.strip()
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                # The lookup raises instead of returning None, so the
                # placeholder below has to be selected from here.
                descripcion_receta = ''

            # Preprocess the description
            preprocessed_description = self.preprocess_text(descripcion_receta)

            # Add to the corpus; the placeholder text is kept out on purpose.
            self.corpus.append(' '.join(
                part for part in (preprocessed_title, preprocessed_description) if part
            ))

            return {
                'categoria': base_info['categoria'],
                'ingrediente': base_info['ingrediente'],
                'link_ingrediente': base_info['link_ingrediente'],
                'link_receta': link_receta,
                'titulo_receta': titulo_receta,
                'descripcion_receta': descripcion_receta or "Descripción no disponible"
            }
        except Exception as e:
            print(f"No se pudo extraer la receta {link_receta}: {type(e).__name__}")
            return None


In [7]:
    def scrape_recipes(self, max_recipes=500):
        """Collect recipes for every stored ingredient, up to ``max_recipes``.

        ``extract_recipe_info`` navigates away from the listing, which
        invalidates every element located on it. After each visit the listing
        is therefore reopened and the links are located again, continuing at
        the next index. On return ``self.data_recetas`` holds the recipe rows
        instead of the ingredient rows.

        Args:
            max_recipes: Upper bound for ``self.recipe_count``.
        """
        # Both recipe sections of an ingredient page are tried in this order.
        container_xpaths = (
            '//*[@id="mntl-document-spotlight_1-0"]',
            '//*[@id="mntl-taxonomysc-article-list-group_1-0"]',
        )
        recipes_data = []

        for ingrediente_info in self.data_recetas:
            if self.recipe_count >= max_recipes:
                break

            try:
                self.driver.get(ingrediente_info['link_ingrediente'])
                self.random_delay()
                # Address after redirects; used to detect that a recipe visit
                # moved the browser away from the listing.
                listing_url = self.driver.current_url
            except Exception as e:
                print(f"No se pudo abrir {ingrediente_info.get('link_ingrediente')}: {type(e).__name__}")
                continue

            for xpath in container_xpaths:
                if self.recipe_count >= max_recipes:
                    break

                recipe_links = None  # None means "must be (re)located"
                total_links = None   # size of the first lookup, bounds the loop
                index = 0

                while self.recipe_count < max_recipes:
                    if total_links is not None and index >= total_links:
                        break

                    if recipe_links is None:
                        try:
                            if self.driver.current_url != listing_url:
                                self.driver.get(listing_url)
                                self.random_delay()
                            container = self.find_element_safely(By.XPATH, xpath)
                            recipe_links = self.find_elements_safely(By.TAG_NAME, 'a', container)
                        except Exception as e:
                            print(f"No se pudo leer el contenedor {xpath}: {type(e).__name__}")
                            break
                        if total_links is None:
                            total_links = len(recipe_links)

                    if index >= len(recipe_links):
                        break  # the reloaded page lists fewer links than before

                    link = recipe_links[index]
                    index += 1

                    recipe_info = self.extract_recipe_info(link, ingrediente_info)
                    if recipe_info:
                        recipes_data.append(recipe_info)
                        self.recipe_count += 1

                    try:
                        navigated_away = self.driver.current_url != listing_url
                    except Exception:
                        navigated_away = True
                    if navigated_away:
                        recipe_links = None  # previously located anchors are stale

        self.data_recetas = recipes_data  # ingredient rows are replaced by recipe rows


In [8]:
    def save_data(self, filename='ingredientes_recetas_completo.csv', corpus_filename='corpus.txt'):
        """Write the collected rows to a CSV file and the corpus to a text file.

        Args:
            filename: Destination of the CSV built from ``self.data_recetas``.
            corpus_filename: Destination of the corpus, one entry per line.

        Any error is reported on stdout instead of being raised.
        """
        try:
            # CSV export (the BOM-prefixed encoding is what the original used).
            pd.DataFrame(self.data_recetas).to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Se extrajeron {self.recipe_count} recetas en total")

            # Corpus export, newline-terminated entries.
            with open(corpus_filename, 'w', encoding='utf-8') as corpus_file:
                corpus_file.writelines("%s\n" % entry for entry in self.corpus)
            print(f"Corpus guardado en {corpus_filename}")
        except Exception as error:
            print(f"Error guardando datos: {str(error)}")


In [9]:
    def cleanup(self):
        """Quit the browser; a failure to do so is printed, never raised."""
        try:
            self.driver.quit()
        except Exception as error:
            message = str(error)
            print(f"Error cerrando el navegador: {message}")


In [10]:
def main():
    """Run the whole pipeline: categories, recipes, then save to disk.

    Errors are printed rather than raised. Once the scraper has been created,
    its browser is closed no matter how the run ends.
    """
    try:
        scraper = RecipeScraper()
    except Exception as error:
        # Nothing was started, so there is nothing to clean up.
        print(f"Error en la ejecución principal: {str(error)}")
        return

    try:
        scraper.scrape_categories()
        scraper.scrape_recipes(max_recipes=3)  # capped at 3 recipes for this run
        scraper.save_data()
    except Exception as error:
        print(f"Error en la ejecución principal: {str(error)}")
    finally:
        scraper.cleanup()


if __name__ == "__main__":
    main()


Se extrajeron 3 recetas en total
Corpus guardado en corpus.txt
