In [8]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [None]:
from pathlib import Path
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import wikipediaapi
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import chromadb
from chromadb.config import Settings
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
DATA_DIR = Path("../data")
OUT_DIR = Path("../outputs")
PERSIST_DIR = DATA_DIR / "chroma"
DATA_DIR.mkdir(exist_ok=True, parents=True)
OUT_DIR.mkdir(exist_ok=True, parents=True)

CSV_RAW = DATA_DIR / "wiki_corpus.csv"
CSV_CHUNKS = DATA_DIR / "wiki_chunks.csv"

TOPIC = "Federated_learning"  # puedes cambiarlo (en/es)

In [None]:
wiki = wikipediaapi.Wikipedia(language='en', user_agent='rag-wikipedia-lab/1.0')
page = wiki.page(TOPIC)

if not page.exists():
    raise ValueError(f"La página '{TOPIC}' no existe.")

title = page.title
text = page.text  # texto plano, ideal para NLP

# Guardamos un solo registro: id, title, text
df = pd.DataFrame([{"id": f"wiki:{TOPIC}", "title": title, "text": text}])
df.to_csv(CSV_RAW, index=False, encoding="utf-8")

print(f"[OK] {CSV_RAW} escrito. Título: {title}. Longitud texto: {len(text)}")
print("Muestra:", text[:400])

[OK] ../data/wiki_corpus.csv escrito. Título: Federated learning. Longitud texto: 31699
Muestra: Federated learning (also known as collaborative learning) is a machine learning technique in a setting where multiple entities (often called clients) collaboratively train a model while keeping their data decentralized, rather than centrally stored. A defining characteristic of federated learning is data heterogeneity. Because client data is decentralized, data samples held by each client may not 


In [None]:
raw = pd.read_csv(CSV_RAW)
doc_id, title, raw_text = raw.loc[0, "id"], raw.loc[0, "title"], raw.loc[0, "text"]

# Limpieza mínima: colapsa espacios/saltos
clean = " ".join(str(raw_text).split())

# Splitter recursivo respeta separadores; solape preserva contexto entre trozos
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # ~300–350 palabras típico
    chunk_overlap=150,
    separators=["\n\n", "\n", ". ", " ", ""],
)

chunks = splitter.split_text(clean)

rows = [{
    "chunk_id": f"{doc_id}::chunk_{i:04d}",
    "title": title,
    "text": ch
} for i, ch in enumerate(chunks)]

pd.DataFrame(rows).to_csv(CSV_CHUNKS, index=False, encoding="utf-8")
print(f"[OK] {len(rows)} chunks → {CSV_CHUNKS}")
print(rows[0]["text"][:300])

[OK] 37 chunks → ../data/wiki_chunks.csv
Federated learning (also known as collaborative learning) is a machine learning technique in a setting where multiple entities (often called clients) collaboratively train a model while keeping their data decentralized, rather than centrally stored. A defining characteristic of federated learning is


In [None]:
COLLECTION_NAME = "wiki_ai"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # rápido y bueno

# Carga chunks
dfc = pd.read_csv(CSV_CHUNKS)

# Cliente Chroma persistente
client = chromadb.Client(Settings(persist_directory=str(PERSIST_DIR)))

# Crea/recupera colección
existing = [c.name for c in client.list_collections()]
collection = client.get_collection(COLLECTION_NAME) if COLLECTION_NAME in existing \
    else client.create_collection(COLLECTION_NAME, metadata={"topic": TOPIC})

# Embeddings con SentenceTransformer
emb_model = SentenceTransformer(EMB_MODEL_NAME)

ids = dfc["chunk_id"].tolist()
texts = dfc["text"].tolist()
metas = [{"title": t} for t in dfc["title"].tolist()]

vectors = emb_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
assert vectors.shape[0] == len(texts)

collection.upsert(
    ids=ids,
    documents=texts,
    metadatas=metas,
    embeddings=vectors
)

print("[OK] Indexado en Chroma.")
print("Colección:", COLLECTION_NAME, "| num items:", collection.count())

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s]

[OK] Indexado en Chroma.
Colección: wiki_ai | num items: 37





In [None]:
# Configuración
PERSIST_DIR = "data/chroma"
model_embed = SentenceTransformer("all-MiniLM-L6-v2")

# Cargar base vectorial persistente
client = chromadb.Client(Settings(persist_directory=PERSIST_DIR))
collection = client.get_collection(COLLECTION_NAME)

# Formulamos la pregunta
query = "Explain federated learning challenges in healthcare."

# Embedding de la pregunta
query_vec = model_embed.encode([query])

# Recuperamos los 5 chunks más relevantes
res = collection.query(
    query_embeddings=query_vec,
    n_results=5,
    include=["documents", "metadatas"]
)

# Combinamos los fragmentos en un solo contexto
context = "\n\n".join(res["documents"][0])
print("Ejemplo de contexto recuperado:\n", context[:400])

Ejemplo de contexto recuperado:
 . Moreover, the clients involved in federated learning may be unreliable as they are subject to more failures or drop out since they commonly rely on less powerful communication media (i.e. Wi-Fi) and battery-powered systems (i.e. smartphones and IoT devices) compared to distributed learning where nodes are typically datacenters that have powerful computational capabilities and are connected to on


In [48]:
import json, os
queries = [
    "What is federated learning and how does it work?",
    "Key privacy challenges in federated learning",
    "Comparison between centralized and federated training",
    "Explain federated learning challenges in healthcare."
]

examples = []
for q in queries:
    q_vec = model_embed.encode([q])
    res = collection.query(query_embeddings=q_vec, n_results=5, include=["documents","metadatas"])
    examples.append({
        "query": q,
        "top_snippets": res["documents"][0],
        "metadata": res["metadatas"][0]
    })

os.makedirs("outputs", exist_ok=True)
with open("../outputs/retrieval_examples.json", "w", encoding="utf-8") as f:
    json.dump(examples, f, indent=2, ensure_ascii=False)

print("[OK] Guardado retrieval_examples.json")

[OK] Guardado retrieval_examples.json


In [46]:
# Modelo de lenguaje local
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

def summarize_with_context(context, topic="Federated learning"):
    prompt = f"""
    You are a technical writer. Summarize the following information about {topic}
    in 400-500 words. Be factual, coherent, and clear.

    CONTEXT:
    {context}

    SUMMARY:
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=500, do_sample=False)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Generar resumen
summary = summarize_with_context(context)

# Guardar en archivo Markdown
with open("../outputs/rag_summary.md", "w", encoding="utf-8") as f:
    f.write(f"# Summary: Federated Learning\n\n{summary.strip()}\n")

print("[OK] Guardado en outputs/rag_summary.md")
print(summary[:400])

[OK] Guardado en outputs/rag_summary.md

    You are a technical writer. Summarize the following information about Federated learning
    in 400-500 words. Be factual, coherent, and clear.

    CONTEXT:
    . Moreover, the clients involved in federated learning may be unreliable as they are subject to more failures or drop out since they commonly rely on less powerful communication media (i.e. Wi-Fi) and battery-powered systems (i.e. sm


### Reflexión: Multi-Agente vs. RAG

El enfoque **multi-agente** maneja la ambigüedad haciendo que distintos agentes colaboren y se corrijan entre sí, lo que ayuda a resolver contradicciones, aunque puede generar respuestas menos precisas.  
El enfoque **RAG** maneja mejor la **factualidad**, porque solo usa información real recuperada de la base de datos, aunque depende de qué tan buena sea la búsqueda y la cobertura del texto.  
En general, el método multi-agente es mejor para **preguntas abiertas o interpretativas**, mientras que **RAG** es más fácil y confiable para **preguntas factuales y concretas** y mucho más facil.