In [None]:
# vectorstore.py

import pandas as pd
from pathlib import Path
import shutil
from langchain.schema import Document
from langchain_chroma.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


"""
This script builds a fresh vector database of dish documents
using sentence embeddings for semantic search.

Each dish is represented as a `Document` object containing:
- A `page_content` string for embedding
- A `metadata` dictionary for filtering and additional inspection

We use `Chroma` as the local vector store; embeddings are computed with `sentence-transformers/all-MiniLM-L6-v2`.
"""

# -------------------------
# 1. Load dataset
# -------------------------
# Read the dish table, then replace missing ingredient/allergen text with an
# empty string so the content strings built below never contain "nan".
df_path = Path("data") / "df_final_official.csv"
df = pd.read_csv(df_path)
df = df.fillna({"ingredientes": "", "alergenos": ""})

# -------------------------
# 2. Build documents with full context
# -------------------------

"""
We construct the content string by concatenating relevant fields
such as ingredients, allergens, and nutrition facts.

Rationale:
- Nutritional fields like kcal, proteinas, etc. are included in `page_content`
  because they might carry semantic meaning for questions like
  "¿cuáles son los platos altos en proteínas?" or "platos bajos en calorías".

- At the same time, we also include these fields as structured `metadata`,
  enabling future use of filtered retrieval or hybrid querying (e.g., via SelfQueryRetriever).
"""

# Columns copied into the metadata unchanged (text and numeric values).
_PLAIN_FIELDS = (
    "nombre_plato", "precio", "kcal", "proteinas", "hidratos", "grasas", "peso", "alergenos",
)

# Dietary/category flags. They are listed explicitly instead of being detected
# by name prefix, because a prefix rule ("is_", "sin_", ...) silently skips
# flags such as `para_diabeticos` and `congelar`.
_FLAG_FIELDS = (
    "is_vegetariano", "is_vegano", "is_keto", "bajo_en_calorias", "es_postre",
    "de_cuchara", "alto_proteina", "sin_lactosa", "is_gourmet", "para_diabeticos",
    "sin_gluten", "congelar",
)


def _as_flag(value):
    """Return a numeric/boolean flag value as a real ``bool``.

    A missing value (NaN) becomes ``False``: ``bool(float("nan"))`` is ``True``,
    which would wrongly tag a dish (e.g. as ``sin_gluten``) when the flag is
    simply unknown. Non-numeric values are returned unchanged.
    """
    if isinstance(value, float) and pd.isna(value):
        return False
    if isinstance(value, (int, float)):  # bool is a subclass of int
        return bool(value)
    return value


docs = []
for _, row in df.iterrows():
    # Text that gets embedded: name, ingredients, allergens, price and
    # nutrition facts in one sentence-like string.
    content = (
        f"Nombre del plato: {row['nombre_plato']}. "
        f"Ingredientes: {row['ingredientes']}. "
        f"Alergenos: {row['alergenos']}. "
        f"Precio: {row['precio']} euros. "
        f"Kcal: {row['kcal']}, Proteínas: {row['proteinas']}, "
        f"Hidratos: {row['hidratos']}, Grasas: {row['grasas']}, Peso: {row['peso']}g."
    )

    # Structured copy of the same row, kept alongside the embedded text.
    metadata = {field: row[field] for field in _PLAIN_FIELDS}
    metadata.update({field: _as_flag(row[field]) for field in _FLAG_FIELDS})

    docs.append(Document(page_content=content, metadata=metadata))

# -------------------------
# 3. Load embedding model
# -------------------------
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# -------------------------
# 4. Delete and recreate vectorstore
# -------------------------

# The persisted Chroma directory is removed before indexing so the collection
# is always rebuilt from the current `docs`. Starting from an empty directory
# keeps stale entries from lingering when documents or metadata have changed,
# which makes the build reproducible.
persist_dir = Path("chroma_db")

if persist_dir.exists():
    print("🧹 Removing existing Chroma vector DB...")
    shutil.rmtree(persist_dir)

print("🆕 Creating fresh Chroma vector DB...")
store_options = {
    "collection_name": "menu_platos",
    "persist_directory": str(persist_dir),
}
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    **store_options,
)
print(f"✅ New vector DB created with {len(docs)} documents and full metadata")




  from .autonotebook import tqdm as notebook_tqdm


🆕 Creating fresh Chroma vector DB...
✅ New vector DB created with 215 documents and full metadata


## Verification of the Vectorstore Content Before Usage
Before building any chains or using the vectorstore in production, it is essential to verify that the vector database has been correctly populated. This notebook performs two types of validation:

1. Semantic retrieval check – to confirm that document embeddings are working as expected.
2. Raw inspection of vectorstore contents – to ensure that all expected fields (especially metadata) were properly stored and indexed.

These checks ensure both the integrity of the document structure and the accuracy of the retrieval logic before any LLM-based interaction begins.

In [None]:
# Sanity check: reopen the persisted Chroma collection and run a similarity
# search to confirm the documents are indexed and retrievable.
#
# The embedding model is the same one named at ingestion time. The query
# "plato" with k=40 returns a sample of stored dishes; for each hit we print a
# preview of the embedded text (first 500 characters of page_content) and its
# metadata dictionary.

from langchain_chroma.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectordb = Chroma(
    collection_name="menu_platos",
    embedding_function=embedding,
    persist_directory="chroma_db",
)

docs = vectordb.similarity_search("plato", k=40)

for position, hit in enumerate(docs, start=1):
    preview = hit.page_content[:500]  # truncated content preview
    print(f"\n📄 Document {position}")
    print(f"Content: {preview}...")
    print(f"Metadata: {hit.metadata}")


📄 Document 1
Content: Nombre del plato: Cocido Madrileño. Ingredientes: Agua, Garbanzos 30.62%, Patatas, Cerdo, Pollo, Col, Zanahoria, Cebolla, Chorizo, Morcilla, Aceite de Oliva, Caldo de Carne, Sal y Hueso de Jamón. Alergenos: . Precio: 5.95 euros. Kcal: 101.0, Proteínas: 6.2, Hidratos: 6.3, Grasas: 5.3, Peso: 380.0g....
Metadata: {'peso': 380.0, 'es_postre': False, 'grasas': 5.3, 'bajo_en_calorias': False, 'nombre_plato': 'Cocido Madrileño', 'de_cuchara': True, 'congelar': False, 'proteinas': 6.2, 'precio': 5.95, 'sin_gluten': True, 'para_diabeticos': False, 'alto_proteina': False, 'sin_lactosa': True, 'kcal': 101.0, 'is_keto': False, 'hidratos': 6.3, 'alergenos': '', 'is_vegano': False, 'is_gourmet': False, 'is_vegetariano': False}

📄 Document 2
Content: Nombre del plato: Arroz con Pollo. Ingredientes: Pollo 29.33%, Agua, Arroz 19.69%, Cebolla, Vino, Zanahoria, Pimiento, Aceite de Oliva, Ajo, Caldo de Pollo, Sal, Colorante Alimentario y Pimienta Blanca. Alergenos: . Precio: 6.45 e

In [3]:
# Display the raw Document objects (id, metadata, page_content) held in `docs`.
docs

[Document(id='3b4f311b-9b4c-4e53-9935-6b58804418bb', metadata={'peso': 380.0, 'es_postre': False, 'grasas': 5.3, 'bajo_en_calorias': False, 'nombre_plato': 'Cocido Madrileño', 'de_cuchara': True, 'congelar': False, 'proteinas': 6.2, 'precio': 5.95, 'sin_gluten': True, 'para_diabeticos': False, 'alto_proteina': False, 'sin_lactosa': True, 'kcal': 101.0, 'is_keto': False, 'hidratos': 6.3, 'alergenos': '', 'is_vegano': False, 'is_gourmet': False, 'is_vegetariano': False}, page_content='Nombre del plato: Cocido Madrileño. Ingredientes: Agua, Garbanzos 30.62%, Patatas, Cerdo, Pollo, Col, Zanahoria, Cebolla, Chorizo, Morcilla, Aceite de Oliva, Caldo de Carne, Sal y Hueso de Jamón. Alergenos: . Precio: 5.95 euros. Kcal: 101.0, Proteínas: 6.2, Hidratos: 6.3, Grasas: 5.3, Peso: 380.0g.'),
 Document(id='5b0de8ce-2c0e-45ca-ac0e-a330acd356f3', metadata={'sin_lactosa': True, 'is_keto': False, 'peso': 430.0, 'de_cuchara': False, 'es_postre': False, 'alto_proteina': False, 'kcal': 154.0, 'is_gourmet'

In [None]:
# Raw inspection of the persisted Chroma collection.
#
# Pulls every stored record and extracts:
# - the document texts (the strings that were embedded)
# - the metadata dictionary linked to each document
#
# The records are then filtered with a plain, case-insensitive substring match
# on "pollo". This is a lexical quality-control check on what was stored; it
# does not exercise the embeddings.

from langchain_chroma.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma(
    persist_directory="chroma_db",
    embedding_function=embedding,
    collection_name="menu_platos"
)

# Use the public `Chroma.get` API instead of the private `_collection`
# attribute, which is an implementation detail and may change between releases.
stored = vectordb.get(include=["documents", "metadatas"])

docs = stored["documents"]
metas = stored["metadatas"]

# Keep (dish name, text) pairs whose text mentions "pollo". Records stored
# without text or without metadata are tolerated rather than raising.
pollo_docs = [
    ((meta or {}).get("nombre_plato", "Unknown"), doc)
    for doc, meta in zip(docs, metas)
    if doc and "pollo" in doc.lower()
]

print(f"✅ Found {len(pollo_docs)} documents with 'pollo'")

# Print a few sample results to validate what was stored
for name, doc in pollo_docs[:10]:
    print(f"\n🍗 {name}\n{doc[:150]}...")


✅ Found 40 documents with 'pollo'

🍗 Ají de Pollo
Nombre del plato: Ají de Pollo. Ingredientes: Pollo, 26.38%, Agua, Cebolla, Huevo (HUEVO), leche (LÁCTEOS), Arroz, Pan (GLUTEN), Almendra (FRUTOS DE C...

🍗 Arroz Campero
Nombre del plato: Arroz Campero. Ingredientes: Agua, Pollo, Pimientos, Arroz 13.23%, Chorizo, Morcilla, Tomate, Aceite de Oliva, Pimentón Dulce, Ajo, ...

🍗 Arroz con Pollo
Nombre del plato: Arroz con Pollo. Ingredientes: Pollo 29.33%, Agua, Arroz 19.69%, Cebolla, Vino, Zanahoria, Pimiento, Aceite de Oliva, Ajo, Caldo de ...

🍗 Albóndigas de Pollo con Pasta
Nombre del plato: Albóndigas de Pollo con Pasta. Ingredientes: Pollo 28.05%, Tomate, Cebolla, Agua, Espaguetis(GLUTEN) 7.61%, Pan, Aceite de Oliva, Za...

🍗 Albóndigas de Pollo con Tomate
Nombre del plato: Albóndigas de Pollo con Tomate. Ingredientes: Pollo 31.85%, Cebolla, Zanahoria, Ajo, Huevo (HUEVO), Sal, leche Sin Lactosa(LÁCTEOS),...

🍗 Albóndigas en Salsa de la Abuela
Nombre del plato: Albóndigas en Salsa de 