In [None]:
# Generate a Markdown summary grounded in the retrieved Wikipedia chunks.
#
# FLAN-T5 is an encoder-decoder (seq2seq) checkpoint, so it has to be loaded with the
# "text2text-generation" task. The "text-generation" task only accepts decoder-only
# (causal LM) checkpoints and cannot load a T5 model.
gen = pipeline("text2text-generation", model="google/flan-t5-base")

# Retrieve the chunks most relevant to the summary request.
docs = retriever.get_relevant_documents(
    "Summarize federated learning healthcare challenges and opportunities."
)
# Cap every chunk at 1200 characters to keep the prompt size bounded.
# NOTE(review): several 1200-char chunks may still exceed the input length FLAN-T5
# was trained on — confirm output quality or reduce the number/size of chunks.
context = "\n\n".join(d.page_content[:1200] for d in docs)

prompt = (
    "Use only the provided context from Wikipedia to write a factual Markdown summary (400–500 words) "
    "with the exact headers: Introduction, Key Findings, Ethical & Technical Challenges, Conclusion.\n\n"
    f"Context:\n{context}\n\nWrite the summary now:"
)

# Greedy decoding (do_sample=False) keeps the output deterministic across runs.
out = gen(prompt, max_length=950, do_sample=False)
# The pipeline returns a list with one dict per input; the text is under "generated_text".
summary_md = out[0]["generated_text"]

# Persist the summary next to the other notebook outputs.
with open(OUTPUT_DIR / "rag_summary.md", "w", encoding="utf-8") as f:
    f.write(summary_md)

print("Resumen guardado → outputs/rag_summary.md")


In [None]:
# Open the persisted "wiki_ai" Chroma collection through LangChain and expose it
# as a top-5 retriever.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectorstore = Chroma(
    persist_directory=str(DATA_DIR / "chroma_db"),
    collection_name="wiki_ai",
    embedding_function=hf_embeddings,
)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

# Sample questions used to sanity-check what the retriever returns.
query_examples = [
    "Explain federated learning challenges in healthcare.",
    "List key privacy risks in federated learning.",
    "How does federated learning differ from centralized training?",
]

# Map each query to its hits: the chunk metadata plus the first 500 characters.
retrieval_results = {}
for q in query_examples:
    docs = retriever.get_relevant_documents(q)
    retrieval_results[q] = [
        dict(source=d.metadata, snippet=d.page_content[:500]) for d in docs
    ]

# Save the examples as pretty-printed JSON, keeping non-ASCII characters readable.
with (OUTPUT_DIR / "retrieval_examples.json").open("w", encoding="utf-8") as f:
    f.write(json.dumps(retrieval_results, indent=2, ensure_ascii=False))

print("Ejemplos de recuperación guardados → outputs/retrieval_examples.json")


In [None]:
# Index the corpus chunks in a persistent ChromaDB collection.

# Sentence-embedding model, pinned to the CPU.
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

client = chromadb.Client(Settings(
    persist_directory=str(DATA_DIR / "chroma_db"),
    chroma_db_impl="duckdb+parquet"
))
# get_or_create keeps this cell re-runnable: create_collection raises once the
# "wiki_ai" collection already exists in the persisted store.
collection = client.get_or_create_collection("wiki_ai", metadata={"hnsw:space": "cosine"})

ids = df["id"].tolist()
metadatas = [{"title": t} for t in df["title"].tolist()]
documents = df["text"].tolist()

# Embed explicitly with the model loaded above instead of relying on Chroma's
# implicit default embedding function, so the stored vectors come from the model
# named in this cell.
embeddings = embed_model.encode(documents).tolist()

# upsert (rather than add) overwrites rows whose ids already exist, so repeated runs
# do not fail on duplicate ids.
collection.upsert(
    ids=ids,
    embeddings=embeddings,
    documents=documents,
    metadatas=metadatas,
)
client.persist()
print(f"{len(documents)} documentos insertados en ChromaDB → data/chroma_db")


In [None]:
# Fetch the Wikipedia article, split it into overlapping chunks and store the
# chunks as a CSV corpus.
wiki = wikipediaapi.Wikipedia('en')
page = wiki.page(PAGE_TITLE)
if not page.exists():
    # Stop early: nothing downstream can work without the article text.
    raise ValueError(f"Wikipedia page '{PAGE_TITLE}' not found.")

text = page.text.strip()

# Split on paragraph breaks first, then line breaks, then sentence ends.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". "],
    chunk_size=1500,
    chunk_overlap=150,
)
chunks = splitter.split_text(text)

# One record per chunk; ids are "<PAGE_TITLE>_<n>" with n starting at 1.
records = [
    dict(id=f"{PAGE_TITLE}_{n}", title=PAGE_TITLE, text=piece)
    for n, piece in enumerate(chunks, start=1)
]
df = pd.DataFrame(records)

df.to_csv(DATA_DIR / "wiki_corpus.csv", encoding="utf-8", index=False)
print(f"Corpus guardado con {len(df)} chunks → data/wiki_corpus.csv")


In [None]:
import os
import json
import pandas as pd
from pathlib import Path

import wikipediaapi
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import pipeline

# Force CPU-only execution (avoids DLL errors)
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Paths: input data and generated outputs live one level above the notebook.
DATA_DIR, OUTPUT_DIR = Path("../data"), Path("../outputs")
for _dir in (DATA_DIR, OUTPUT_DIR):
    # Create the folder (and any missing parents); no error if it already exists.
    _dir.mkdir(parents=True, exist_ok=True)

# Human-readable topic and the Wikipedia page title used to fetch the article.
TOPIC, PAGE_TITLE = "Federated learning", "Federated_learning"
