In [3]:
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By


### WEB SCRAPING


In [4]:
def extract_element_text(driver, xpath, default=""):
    """Return the stripped text of the first element matching ``xpath``.

    Args:
        driver: Any object exposing ``find_element(by, value)``.
        xpath: XPath expression used for the lookup.
        default: Value handed back when the lookup or text access fails.

    Returns:
        The element's ``text`` with surrounding whitespace removed, or
        ``default`` if any exception is raised along the way (the error is
        printed, not re-raised).
    """
    try:
        element = driver.find_element(By.XPATH, xpath)
        text = element.text.strip()
    except Exception as error:
        # Best-effort: report the problem and fall back to the default.
        print(f"Error extracting {xpath}: {error}")
        text = default
    return text

In [5]:
def extract_elements(driver, xpath):
    """Return every element matching ``xpath``.

    Args:
        driver: Any object exposing ``find_elements(by, value)``.
        xpath: XPath expression used for the lookup.

    Returns:
        Whatever ``find_elements`` returns, or an empty list if the call
        raises (the error is printed, not re-raised).
    """
    found = []
    try:
        found = driver.find_elements(By.XPATH, xpath)
    except Exception as error:
        # Best-effort: report the problem and keep the empty fallback.
        print(f"Error finding elements for {xpath}: {error}")
    return found

In [6]:
def scrape_recipe(driver):
    """Scrape the recipe page currently loaded in ``driver`` into a DataFrame.

    The function does not navigate; it reads whatever page ``driver`` is on.

    Args:
        driver: Object passed to ``extract_element_text`` / ``extract_elements``
            for page-level lookups (a Selenium WebDriver in this notebook).

    Returns:
        pandas.DataFrame: a single row with the columns ``titulo``,
        ``tiempo_preparacion``, ``tiempo_cocina``, ``tiempo_total``,
        ``porciones``, ``ingredientes`` and ``pasos``. Ingredients and steps
        are stored as newline-joined strings. Text fields that cannot be
        extracted are empty strings (the helpers' fallback value).
    """
    # Extract title
    titulo_xpath = '//h1[@class="article-heading text-headline-400"]'
    titulo = extract_element_text(driver, titulo_xpath)

    # Extract times and servings: each value is read from the div that
    # follows the div containing the label text.
    prep_time_xpath = '//div[contains(text(),"Prep Time:")]/following-sibling::div'
    cook_time_xpath = '//div[contains(text(),"Cook Time:")]/following-sibling::div'
    total_time_xpath = '//div[contains(text(),"Total Time:")]/following-sibling::div'
    servings_xpath = '//div[contains(text(),"Servings:")]/following-sibling::div'

    prep_time = extract_element_text(driver, prep_time_xpath)
    cook_time = extract_element_text(driver, cook_time_xpath)
    total_time = extract_element_text(driver, total_time_xpath)
    servings = extract_element_text(driver, servings_xpath)

    # Extract ingredients
    ingredients_xpath = '//ul[@class="mm-recipes-structured-ingredients__list"]/li'
    ingredient_elements = extract_elements(driver, ingredients_xpath)

    ingredientes = []
    for item in ingredient_elements:
        partes = (
            extract_element_text(item, './/span[@data-ingredient-quantity="true"]'),
            extract_element_text(item, './/span[@data-ingredient-unit="true"]'),
            extract_element_text(item, './/span[@data-ingredient-name="true"]'),
        )
        # Join only the non-empty parts so a missing quantity or unit does
        # not leave a double space inside the ingredient line.
        ingredientes.append(' '.join(parte for parte in partes if parte))

    # Extract preparation steps
    steps_xpath = '//ol[@id="mntl-sc-block_1-0"]/li/p[not(contains(@class, "figure-article-caption"))]'
    step_elements = extract_elements(driver, steps_xpath)

    # Drop empty paragraphs BEFORE numbering so the "Paso N" labels stay
    # consecutive even when blank paragraphs are skipped.
    step_texts = [step.text.strip() for step in step_elements]
    pasos = [
        f"Paso {numero}: {texto}"
        for numero, texto in enumerate((t for t in step_texts if t), start=1)
    ]

    # Create DataFrame
    df = pd.DataFrame([{
        'titulo': titulo,
        'tiempo_preparacion': prep_time,
        'tiempo_cocina': cook_time,
        'tiempo_total': total_time,
        'porciones': servings,
        'ingredientes': '\n'.join(ingredientes),
        'pasos': '\n'.join(pasos)
    }])

    return df

### LLAMAR A LAS FUNCIONES Y REALIZAR LA EXTRACCIÓN DE DATOS DE LA PÁGINA

In [9]:
# Destination of the exported files. The folder is created on demand so the
# cell also works on a machine where it does not exist yet.
output_dir = Path('WebScrapping_Data')
csv_path = output_dir / 'receta_data.csv'
xlsx_path = output_dir / 'receta_data.xlsx'

driver = webdriver.Chrome()

try:
    # Visit the target recipe page
    driver.get("https://www.allrecipes.com/recipe/256100/nutella-pastry-christmas-tree/")

    # Scrape the recipe and display results
    recipe_df = scrape_recipe(driver)
    display(recipe_df)  # Display DataFrame in Jupyter Notebook

    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file; the message reports the real path
    recipe_df.to_csv(csv_path, index=False)
    print(f"Receta data saved to '{csv_path}'")

    # Save the DataFrame to an Excel file; the message reports the real path
    recipe_df.to_excel(xlsx_path, index=False)
    print(f"Receta data saved to '{xlsx_path}'")
finally:
    # Close the browser even if scraping or saving raised
    driver.quit()

Unnamed: 0,titulo,tiempo_preparacion,tiempo_cocina,tiempo_total,porciones,ingredientes,pasos
0,Nutella Puff Pastry Christmas Tree,35 mins,15 mins,55 mins,8,"1 (17.5 ounce) package frozen puff pastry, tha...",Paso 1: Preheat the oven to 375 degrees F (190...


Receta data saved to WebScrapping/'receta_data.csv'
Receta data saved to WebScrapping/'receta_data.xlsx'


### EXTRAER LA RECETA DE LA PÁGINA

In [33]:
 # Step-by-step extraction of the recipe, written inline in a single cell
# This cell opens its own browser session and loads the page itself, so it
# does not depend on a `driver` left over from (and possibly closed by)
# another cell.
driver = webdriver.Chrome()

try:
    driver.get("https://www.allrecipes.com/recipe/256100/nutella-pastry-christmas-tree/")

    # Extract the title
    titulo_xpath = '//h1[@class="article-heading text-headline-400"]'
    titulo = driver.find_element(By.XPATH, titulo_xpath).text.strip()

    # Extract times and servings
    prep_time_xpath = '//div[contains(text(),"Prep Time:")]/following-sibling::div'
    cook_time_xpath = '//div[contains(text(),"Cook Time:")]/following-sibling::div'
    total_time_xpath = '//div[contains(text(),"Total Time:")]/following-sibling::div'
    servings_xpath = '//div[contains(text(),"Servings:")]/following-sibling::div'

    prep_time = driver.find_element(By.XPATH, prep_time_xpath).text.strip()
    cook_time = driver.find_element(By.XPATH, cook_time_xpath).text.strip()
    total_time = driver.find_element(By.XPATH, total_time_xpath).text.strip()
    servings = driver.find_element(By.XPATH, servings_xpath).text.strip()

    # Extract ingredients
    ingredients_xpath = '//ul[@class="mm-recipes-structured-ingredients__list"]/li'
    ingredient_elements = driver.find_elements(By.XPATH, ingredients_xpath)

    # Quantity, unit and name spans, in the order they appear in the output
    ingredient_part_xpaths = (
        './/span[@data-ingredient-quantity="true"]',
        './/span[@data-ingredient-unit="true"]',
        './/span[@data-ingredient-name="true"]',
    )

    ingredientes = []
    for item in ingredient_elements:
        partes = []
        for part_xpath in ingredient_part_xpaths:
            # find_elements yields an empty list when the span is absent, so
            # a missing part is skipped without a catch-all `except`.
            coincidencias = item.find_elements(By.XPATH, part_xpath)
            texto = coincidencias[0].text.strip() if coincidencias else ''
            if texto:
                partes.append(texto)
        # Only non-empty parts are joined, so no double spaces appear
        ingredientes.append(' '.join(partes))

    # Extract preparation steps
    steps_xpath = '//ol[@id="mntl-sc-block_1-0"]/li/p[not(contains(@class, "figure-article-caption"))]'
    step_elements = driver.find_elements(By.XPATH, steps_xpath)

    # Build the list of steps: discard empty paragraphs first so the
    # "Paso N" numbering stays consecutive.
    textos_pasos = [step.text.strip() for step in step_elements]
    pasos = [
        f"Paso {i}: {paso_texto}"
        for i, paso_texto in enumerate((t for t in textos_pasos if t), 1)
    ]

    # Build the DataFrame
    df = pd.DataFrame([{
        'titulo': titulo,
        'tiempo_preparacion': prep_time,
        'tiempo_cocina': cook_time,
        'tiempo_total': total_time,
        'porciones': servings,
        'ingredientes': '\n'.join(ingredientes),
        'pasos': '\n'.join(pasos)  # Steps stored as one newline-joined column
    }])

    print(df)
finally:
    # Close the browser even if one of the lookups above raised
    driver.quit()

                               titulo tiempo_preparacion tiempo_cocina  \
0  Nutella Puff Pastry Christmas Tree            35 mins       15 mins   

  tiempo_total porciones                                       ingredientes  \
0      55 mins         8  1 (17.5 ounce) package frozen puff pastry, tha...   

                                               pasos  
0  Paso 1: Preheat the oven to 375 degrees F (190...  


### GUARDAR EN UN DATASET

In [34]:
# Write the scraped recipe to disk; the 'utf-8-sig' codec prefixes the file
# with a UTF-8 byte-order mark.
df.to_csv('Recetas.csv', encoding='utf-8-sig', index=False)

# Confirm the export and echo the DataFrame, one item per line.
print(
    "El archivo CSV ha sido creado exitosamente.",
    "\nContenido del DataFrame:",
    df,
    sep="\n",
)

El archivo CSV ha sido creado exitosamente.

Contenido del DataFrame:
                               titulo tiempo_preparacion tiempo_cocina  \
0  Nutella Puff Pastry Christmas Tree            35 mins       15 mins   

  tiempo_total porciones                                       ingredientes  \
0      55 mins         8  1 (17.5 ounce) package frozen puff pastry, tha...   

                                               pasos  
0  Paso 1: Preheat the oven to 375 degrees F (190...  
