In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/html2-1/Unstuffed Cabbage Roll Recipe.html
/kaggle/input/receta-2/Chicken Cobbler Recipe.html


In [2]:
from bs4 import BeautifulSoup
import os

In [3]:
ruta_archivo = "/kaggle/input/html2-1/Unstuffed Cabbage Roll Recipe.html"

In [4]:
# Usamos 'utf-8' para evitar problemas con tildes o caracteres especiales
with open(ruta_archivo, 'r', encoding='utf-8') as archivo:
    contenido_html = archivo.read()

In [7]:
soup = BeautifulSoup(contenido_html, 'html.parser')

### Titulo de la receta

In [8]:
# Extracting the recipe title
title = soup.find("title")
title.string

'Unstuffed Cabbage Roll Recipe'

In [9]:
#score
#description
#prep time
#cooking time
#total time
#servings
#ingredients
#directtions
#nutritions facts
#Reviews

### puntaje de la receta

In [10]:
score=soup.find("div", {"id":"mm-recipes-review-bar__rating_1-0"})
score.string

'4.6'

### Descripcion de la receta

In [11]:
description=soup.find("p", {"class":"article-subheading text-utility-300"})
description.string

"This is an easy casserole made with ground beef, cabbage, garlic, and tomatoes. My kids don't even like cabbage, but they love this dish! Serve with rice for a comforting weeknight dinner. Also, the longer it stands the better it tastes!"

### tiempo de preparacion

In [12]:
prep_time=soup.find("div", {"class":"mm-recipes-details__value"})
prep_time.string

'15 mins'

# Funcion para obtener datos especificos de la receta

In [13]:
def parsearReceta(html_doc_path):
    with open(html_doc_path, 'r', encoding='utf-8') as archivo:
        html_doc = archivo.read()

    soup = BeautifulSoup(html_doc, 'html.parser')
    
    score=soup.find("div", {"id":"mm-recipes-review-bar__rating_1-0"}).string
    prep_time=soup.find("div", {"class":"mm-recipes-details__value"}).string
    description=soup.find("p", {"class":"article-subheading text-utility-300"}).string
    title = soup.find("title").string
    return {"score":score,"prep_time":prep_time, "description":description, "title":title}

In [14]:
htmlparseado = parsearReceta(ruta_archivo)

htmlparseado

{'score': '4.6',
 'prep_time': '15 mins',
 'description': "This is an easy casserole made with ground beef, cabbage, garlic, and tomatoes. My kids don't even like cabbage, but they love this dish! Serve with rice for a comforting weeknight dinner. Also, the longer it stands the better it tastes!",
 'title': 'Unstuffed Cabbage Roll Recipe'}

In [15]:
ruta_receta2 = "/kaggle/input/receta-2/Chicken Cobbler Recipe.html"

In [16]:
receta2 = parsearReceta(ruta_receta2)
receta2

{'score': '4.3',
 'prep_time': '5 mins',
 'description': "This viral TikTok chicken cobbler made with Red Lobster biscuit mix is incredibly easy to put together. We love this recipe from Matthew Bounds, aka @yourbarefootneighbor. Try it yourself to find out what everyone's raving about!",
 'title': 'Chicken Cobbler Recipe'}

### Obtener los links

In [17]:
# Buscamos todos los tags 'a' que tengan un enlace (href)
recipe_links = soup.find_all("a", href=True)

recipe_urls = []

# Recorremos cada enlace encontrado
for link in recipe_links:
    href = link['href']
    # Filtramos: solo guardamos el link si contiene la palabra "recipe"
    if "recipe" in href:
        recipe_urls.append(href)

# Imprimimos los resultados
print(f"Se encontraron {len(recipe_urls)} recetas vinculadas:")

for url in recipe_urls:
    print(url)

Se encontraron 204 recetas vinculadas:
https://www.allrecipes.com/authentication/login?regSource=3675&relativeRedirectUrl=%2Frecipe%2F235997%2Funstuffed-cabbage-roll%2F
/account/add-recipe
https://www.myrecipes.com/favorites
https://support.people.inc/hc/en-us/categories/360003648613-Allrecipes
https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2Frecipe%2F235997%2Funstuffed-cabbage-roll%2F
https://www.allrecipes.com/recipes/17562/dinner/
https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/
https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/
https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/
https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/
https://www.allrecipes.com/recipes/17889/everyday-cooking/family-friendly/family-dinners/
https://www.allrecipes.com/recipes/94/soups-stews-and-chili/
https://www.allrecipes.com/recipes/16099/everyday-co

# RAG

In [22]:
!pip install -q sentence-transformers faiss-cpu google-genai

## Configuracion API

In [29]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from google import genai
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("Gemini_API_KEY")
client = genai.Client(api_key=api_key)

## Preparacion de datos

In [30]:
# texto limpio de la receta
texto_receta = soup.get_text(separator="\n", strip=True)

# función de chunking 
def chunk_text(text: str, max_chars: int = 500, overlap: int = 50):
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if len(chunk) > 0:
            chunks.append(chunk)
        if end == n:
            break
        start = max(0, end - overlap)
    return chunks

# DataFrame
chunks_lista = chunk_text(texto_receta)
df = pd.DataFrame(chunks_lista, columns=['text'])
print(f"Total de chunks generados: {len(df)}")

Total de chunks generados: 26


## Generacion de embeddings

In [31]:
model_embed = SentenceTransformer("intfloat/e5-base-v2")

passages = ["passage: " + t for t in df["text"].tolist()]

embeddings = model_embed.encode(
    passages,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True 
).astype("float32")

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print("Índice FAISS creado.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Índice FAISS creado.


## Funcion Busqueda

In [40]:
def consultar_rag(pregunta):
    
    vec_pregunta = model_embed.encode(
        ["query: " + pregunta], 
        convert_to_numpy=True, 
        normalize_embeddings=True
    ).astype("float32")
    
    # los 3 chunks más parecidos
    D, I = index.search(vec_pregunta, k=3)
    
    # Recuperar texto de los índices encontrados
    indices_encontrados = I[0]
    contextos = df.iloc[indices_encontrados]['text'].tolist()
    contexto_fusionado = "\n---\n".join(contextos)
    
    # Enviar a Gemini
    prompt = f"""
    Eres un asistente de cocina experto. Usa SOLO la siguiente información recuperada para responder.
    
    Información recuperada:
    {contexto_fusionado}
    
    Pregunta del usuario: {pregunta}
    """
    
    
    response = client.models.generate_content(
        model="gemini-2.5-flash", 
        contents=prompt
    )
    
    return response.text

In [39]:
print("--- Modelos disponibles ---")
for m in client.models.list():
    # Simplemente imprimimos el nombre (o display_name si existe)
    print(m.name)

--- Modelos disponibles ---
models/embedding-gecko-001
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-flash-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/gemini-2.5-

In [41]:
pregunta = "¿Cuáles son los ingredientes principales?"
respuesta = consultar_rag(pregunta)

print(f"\nPREGUNTA: {pregunta}")
print(f"RESPUESTA: {respuesta}")


PREGUNTA: ¿Cuáles son los ingredientes principales?
RESPUESTA: Basándome en la información proporcionada, los ingredientes principales son:

*   Pollo (Chicken)
*   Ternera (Beef)
*   Cerdo (Pork)
*   Mariscos (Seafood)
*   Pasta
*   Frutas
*   Verduras
