In [1]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

In [2]:
# Load the processed articles table.
# Forward slashes are used on purpose: the previous "..\data\processed\..."
# literal relied on invalid escape sequences (\d, \p, \A) inside a non-raw
# string and only resolved on Windows. "/" works on Windows and POSIX alike.
df = pd.read_csv("../data/processed/Articulos_LLM.csv")

In [3]:
# One whole-article Document per row, with the row's descriptive columns
# carried along as metadata.
# Rows whose "contenido" is null are skipped, matching the null check applied
# when the articles are chunked; otherwise a NaN would be handed to Document
# as page_content.
# NOTE(review): row.get's "" default only applies when the column is absent;
# a missing value in an existing column still comes through as NaN — confirm
# whether that is acceptable in the metadata.
docs = [
    Document(
        page_content=row["contenido"],
        metadata={
            "titulo": row["titulo"],
            "url": row["url"],
            "precio": row.get("Precios", ""),
            "fecha": row.get("fechas", ""),
            "contexto_fecha": row.get("fechas_contexto", ""),
            "edad": row.get("edad", ""),
        },
    )
    for _, row in df.iterrows()
    if pd.notnull(row["contenido"])
]


In [4]:
# Text splitter: chunk_size=800 with chunk_overlap=100, trying the separators
# in order (blank line, newline, period, space).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " "],
)

# Build the chunked Documents
chunked_docs = []

for _, row in df.iterrows():
    texto = row["contenido"]
    if pd.isnull(texto):
        # Nothing to split for rows without content.
        continue

    # The metadata is the same for every chunk of a row, so it is built once
    # per row rather than once per chunk.
    # NOTE(review): row.get's "" default only applies when the column is
    # absent; a missing value in an existing column comes through as NaN.
    row_metadata = {
        "titulo": row["titulo"],
        "url": row["url"],
        "precio": row.get("Precios", ""),
        "fecha": row.get("fechas", ""),
        "contexto_fecha": row.get("fechas_contexto", ""),
        "edad": row.get("edad", ""),
    }

    chunks = splitter.split_text(texto)
    for chunk in chunks:
        # Each Document receives its own dict so that editing one chunk's
        # metadata later cannot leak into its siblings.
        chunked_docs.append(
            Document(page_content=chunk, metadata=dict(row_metadata))
        )

In [5]:
# Embedding model used to vectorise the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Vector store: a FAISS index built from the chunked Documents and the
# embedding model above.
vectorstore = FAISS.from_documents(
    documents=chunked_docs,
    embedding=embedding_model,
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hugging Face model used for answer generation.
# NOTE(review): the earlier comment described this as a "local" model, but
# HuggingFaceEndpoint is, per its documentation, a client for a hosted
# inference endpoint — confirm which deployment is intended.
model_name = "microsoft/Phi-3-mini-4k-instruct"

llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id=model_name,
    # Generation settings: sampling disabled, at most 512 new tokens, and a
    # repetition penalty of 1.03.
    do_sample=False,
    max_new_tokens=512,
    repetition_penalty=1.03,
)

# Chat-style wrapper around the endpoint; this is what the graph nodes invoke.
chat_model = ChatHuggingFace(llm=llm)

In [8]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain.schema import Document

# Agent state
class AgentState(TypedDict):
    """State schema handed to StateGraph and passed between the graph's nodes."""
    # Read by both the retrieve and generate steps.
    query: str
    # Written by retrieve (similarity-search results), read by generate.
    documents: List[Document]
    # Written by generate.
    generation: str

# Retrieval step
def retrieve(state):
    """Fetch the documents most similar to the query.

    Args:
        state: Agent state; only "query" is read.

    Returns:
        dict: {"documents": <result of the similarity search with k=10>}.
    """
    matches = vectorstore.similarity_search(state["query"], k=10)
    return {"documents": matches}

# Generation step using the retrieved context
def generate(state):
    """Answer the query using the retrieved documents as context.

    Args:
        state: Agent state; reads "documents" and "query".

    Returns:
        dict: {"generation": <answer>}. When the model's reply exposes a
        ``content`` attribute (as chat message objects do), that text is
        stored; otherwise the reply is stored unchanged.
    """
    docs = state["documents"]
    query = state["query"]
    # Concatenate the chunk texts, separated by blank lines.
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""Contexto:\n{context}\n\nPregunta: {query}\nRespuesta breve:"""

    respuesta = chat_model.invoke(prompt)
    # AgentState declares "generation" as str, so unwrap the message text
    # instead of storing the whole message object. The getattr fallback keeps
    # this working if the model returns a plain string.
    return {"generation": getattr(respuesta, "content", respuesta)}

# Agent graph: a linear two-step pipeline over AgentState,
# retrieve -> generate -> END.
graph = StateGraph(AgentState)
# Register the two steps under the names the edges below refer to.
graph.add_node("retrieve", retrieve)
graph.add_node("generate", generate)
# Execution starts at the retrieval step.
graph.set_entry_point("retrieve")
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)

# Compile the graph; rag_agent is the object the notebook invokes.
rag_agent = graph.compile()

In [9]:
# respuesta = rag_agent.invoke({"query": "¿Qué actividades gratuitas hay este fin de semana?"})
# print(respuesta["generation"])