In [2]:
import os
from pathlib import Path
from typing import Optional

from langchain_core import documents

# Project root. Defaults to the original checkout location; set the RAG_CARS_BASE
# environment variable to run the notebook from any other machine or directory.
BASE = Path(os.environ.get("RAG_CARS_BASE", "/Users/angellollerena/Documents/GitHub/RAG_Cars")).expanduser()
# PDFs are searched for recursively, starting at the project root itself.
PDF_DIR = BASE
# Markdown/JSON produced from the PDFs.
PROCESSED_DIR = BASE / "data" / "processed"
# Persist directory handed to the Chroma vector store.
INDEX_DIR = BASE / "index" / "chroma_autos"

# Create the output directories up front; already-existing ones are left alone.
for p in [PROCESSED_DIR, INDEX_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Directory names that are never searched for source PDFs.
SKIP_DIR_NAMES = {"data", "index", ".git", ".ipynb_checkpoints", ".idea", ".venv", "env", "venv"}

def should_skip(path: Path, base: Optional[Path] = None) -> bool:
    """Tell whether ``path`` lies in a hidden or excluded location.

    A path is skipped when any of its components starts with "." or is listed
    in ``SKIP_DIR_NAMES``.

    Args:
        path: Path to test.
        base: Optional root. When given and ``path`` is located under it, only
            the components below ``base`` are tested, so the location of the
            project itself (for example a checkout under ``~/data/`` or
            ``~/.projects/``) cannot cause every file to be skipped. When
            omitted, every component of ``path`` is tested.

    Returns:
        True if the path should be ignored, False otherwise.
    """
    parts = path.parts
    if base is not None:
        try:
            parts = path.relative_to(base).parts
        except ValueError:
            # path is not under base: keep testing all of its components.
            pass
    return any(part.startswith(".") or part in SKIP_DIR_NAMES for part in parts)

# Discover PDFs
candidates = []
# sorted(): rglob order depends on the filesystem; sorting keeps the listing stable.
for p in sorted(PDF_DIR.rglob("*.pdf")):
    # Test only the components below PDF_DIR. Testing the absolute path would
    # skip every PDF whenever the project sits under a directory that is hidden
    # or happens to be named like an entry of SKIP_DIR_NAMES.
    if should_skip(p.relative_to(PDF_DIR)):
        continue
    # Never re-ingest anything that lives under the processed-output tree.
    if PROCESSED_DIR in p.parents:
        continue
    candidates.append(p.resolve())

print("PDFs detectados:", len(candidates))
for i, p in enumerate(candidates, 1):
    print(f"{i:>2}. {p} | exists={p.exists()}")


PDFs detectados: 2
 1. /Users/angellollerena/Documents/GitHub/RAG_Cars/Volkswagen/Ficha-Tecnica-Amarok-2025.pdf | exists=True
 2. /Users/angellollerena/Documents/GitHub/RAG_Cars/Toyota/CATALOGO_COROLLA_PERU.pdf | exists=True


In [3]:
import json,os
from pathlib import Path
from typing import List
from pydantic import BaseModel
from docling.document_converter import DocumentConverter

# Same set as the one defined in the PDF-discovery cell. Nothing in this cell reads it.
# NOTE(review): looks like a leftover duplicate — confirm before removing.
SKIP_DIR_NAMES = {"data", "index", ".git", ".ipynb_checkpoints", ".idea", ".venv", "env", "venv"}

class ParseResult(BaseModel):
    """Record describing one converted PDF and the files written for it."""

    pdf_path: str        # source PDF, stored as a string path
    md_path: str         # Markdown file written for the PDF
    json_meta_path: str  # JSON metadata file written for the PDF
    chars: int           # length of the exported Markdown text
    used: str            # converter label; "docling" is the only value set in this notebook

def rel_to_base(path: Path, base: Path) -> Path:
    """Express ``path`` relative to ``base``.

    ``Path.relative_to`` is tried first. If it raises (``path`` is not located
    under ``base``), the result of ``os.path.relpath`` is used instead, which
    may contain ``..`` components.
    """
    try:
        relative = path.relative_to(base)
    except Exception:
        # Not a direct sub-path of base: compute a relative path textually.
        relative = Path(os.path.relpath(path, base))
    return relative

def convert_pdf_docling(pdf_path : Path):
    """Convert one PDF with docling.

    Args:
        pdf_path: Location of the PDF to convert.

    Returns:
        A ``(md_text, meta)`` tuple: the document exported as Markdown and a
        dict representation of the converted document. If the document offers
        no dict export, ``meta`` is the placeholder ``{"note": "no-as_dict"}``.
    """
    conv = DocumentConverter()
    res = conv.convert(str(pdf_path))
    document = res.document
    md_text = document.export_to_markdown()
    # The docling document exposes its dict form as export_to_dict(); probing
    # only for as_dict() meant the placeholder was always written instead of the
    # real metadata. as_dict() is still tried second for objects that offer it.
    for exporter_name in ("export_to_dict", "as_dict"):
        exporter = getattr(document, exporter_name, None)
        if callable(exporter):
            return md_text, exporter()
    return md_text, {"note":"no-as_dict"}

def convert_pdf(pdf_path : Path,out_md: Path, out_json: Path):
    """Convert one PDF and write its Markdown and JSON metadata to disk.

    Args:
        pdf_path: PDF to convert.
        out_md: Destination of the Markdown export (UTF-8).
        out_json: Destination of the JSON metadata (UTF-8, non-ASCII kept as is).

    Returns:
        A ``ParseResult`` with the three paths as strings, the Markdown length
        in characters and ``used="docling"``.
    """
    md_text, meta = convert_pdf_docling(pdf_path)
    # Create the parent of *both* outputs: they are not required to share a
    # directory, and previously only the Markdown parent was created.
    for target in (out_md, out_json):
        target.parent.mkdir(parents=True, exist_ok=True)
    out_md.write_text(md_text, encoding="utf-8")
    # default=str: a value json cannot encode natively is written as its string
    # form instead of aborting after the Markdown file is already on disk.
    out_json.write_text(json.dumps(meta, ensure_ascii=False, indent=2, default=str), encoding="utf-8")
    return ParseResult(pdf_path=str(pdf_path), md_path=str(out_md), json_meta_path=str(out_json),
                       chars=len(md_text), used="docling")

def ingest_all(pdf_list: List[Path], processed_root: Path, pdf_root: Optional[Path] = None) -> List[ParseResult]:
    """Convert every PDF in ``pdf_list``, mirroring its folder layout under ``processed_root``.

    Args:
        pdf_list: PDFs to convert; they are processed in sorted order.
        processed_root: Directory that receives the ``.md`` / ``.json`` outputs.
        pdf_root: Root the PDFs are made relative to when building the output
            paths. Defaults to the notebook-level ``PDF_DIR``.

    Returns:
        One ``ParseResult`` per converted PDF. Paths that do not exist are
        reported with a warning and left out.
    """
    root = PDF_DIR if pdf_root is None else pdf_root
    results = []
    for pdf in sorted(pdf_list):
        if not pdf.exists():
            print(f"[WARN] No existe: {pdf}")
            continue
        rel = rel_to_base(pdf, root)
        # A PDF outside the root yields a relative path with ".." components,
        # which would place the outputs outside processed_root. Drop them so
        # everything stays inside the output tree.
        safe_rel = Path(*[part for part in rel.parts if part != ".."])
        out_md = processed_root / safe_rel.with_suffix(".md")
        out_json = processed_root / safe_rel.with_suffix(".json")
        r = convert_pdf(pdf, out_md, out_json)
        # Report the path actually written rather than a hard-coded "data/processed/".
        print(f"✓ docling  {rel} -> {out_md} ({r.chars} chars)")
        results.append(r)
    return results


# Convert every discovered PDF and report how many were processed.
ingest_summary = ingest_all(pdf_list=candidates, processed_root=PROCESSED_DIR)
print(f"\nListo. {len(ingest_summary)} PDFs procesados.")

  from .autonotebook import tqdm as notebook_tqdm
2025-12-26 00:19:22,859 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-26 00:19:22,893 - INFO - Going to convert document batch...
2025-12-26 00:19:22,894 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2025-12-26 00:19:22,900 - INFO - Loading plugin 'docling_defaults'
2025-12-26 00:19:22,902 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-26 00:19:22,908 - INFO - Loading plugin 'docling_defaults'
2025-12-26 00:19:22,911 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-26 00:19:24,165 - INFO - Auto OCR model selected ocrmac.
2025-12-26 00:19:24,170 - INFO - Loading plugin 'docling_defaults'
2025-12-26 00:19:24,173 - INFO - Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2025-12-26 00:19:24,177 - INFO - Accelerator device: 'mps'
2025-12-26 00

✓ docling  Toyota/CATALOGO_COROLLA_PERU.pdf -> data/processed/Toyota/CATALOGO_COROLLA_PERU.md (31159 chars)


2025-12-26 00:19:47,573 - INFO - Accelerator device: 'mps'
2025-12-26 00:19:48,106 - INFO - Processing document Ficha-Tecnica-Amarok-2025.pdf
2025-12-26 00:19:58,807 - INFO - Finished converting document Ficha-Tecnica-Amarok-2025.pdf in 12.17 sec.


✓ docling  Volkswagen/Ficha-Tecnica-Amarok-2025.pdf -> data/processed/Volkswagen/Ficha-Tecnica-Amarok-2025.md (41235 chars)

Listo. 2 PDFs procesados.


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import torch

# Now for the good part: load the converted Markdown back in.
def load_md_documents(processed_root: Path) -> list:
    """Load every ``*.md`` file found under ``processed_root`` (recursively).

    Files are visited in sorted path order. ``rglob`` alone yields them in
    filesystem order, which made the order of the returned documents (and of
    everything derived from them) differ between machines and runs.

    Args:
        processed_root: Directory tree containing the Markdown exports.

    Returns:
        A flat list with the documents produced by ``TextLoader`` for each file.
    """
    docs = []
    for md in sorted(processed_root.rglob("*.md")):
        docs.extend(TextLoader(str(md), encoding="utf-8").load())
    return docs

# Load the processed Markdown and cut it into overlapping chunks
# (500-character chunks, 150 characters shared between neighbours).
docs = load_md_documents(processed_root=PROCESSED_DIR)
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=500,
)
chunks = splitter.split_documents(docs)



In [5]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Embedding model used for the vector index, loaded on the selected device.
# NOTE(review): E5-family models are usually fed "query: " / "passage: " prefixed
# text — confirm against the model card whether the prefixes should be added here.
embeddings = HuggingFaceEmbeddings(
    model_kwargs={"device": device},
    model_name="intfloat/multilingual-e5-base",
)


2025-12-26 00:20:00,208 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-base


In [7]:
# Give every chunk a stable id: "<source path>::<position within that source>".
# Without explicit ids the store assigns fresh random ones on each call, so
# re-running this cell against the same persist_directory kept appending another
# copy of every chunk. With stable ids a re-run overwrites the existing entries.
# NOTE(review): an index built before this change still holds chunks under random
# ids — delete INDEX_DIR once to start clean.
chunk_ids = []
_position_by_source = {}
for chunk in chunks:
    chunk_source = chunk.metadata.get("source", "")
    position = _position_by_source.get(chunk_source, 0)
    _position_by_source[chunk_source] = position + 1
    chunk_ids.append(f"{chunk_source}::{position}")

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=str(INDEX_DIR),
)
# No explicit vector_db.persist() call is made (it was already disabled).
len(chunks)

2025-12-26 00:20:25,018 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


210

In [8]:
from langchain_community.retrievers import BM25Retriever
from FlagEmbedding import FlagReranker

# Re-opening the persisted index directly was disabled (version/deprecation error):
#vector_db = Chroma(persist_directory=str(INDEX_DIR), embedding_function=embeddings)

# Dense retriever over the Chroma index: 12 candidates per query.
vec_retriever = vector_db.as_retriever(search_kwargs={"k":12})

# Lexical BM25 retriever over a fresh load + split of the processed Markdown,
# also limited to 12 candidates per query.
base_docs = load_md_documents(PROCESSED_DIR)
base_chunks = splitter.split_documents(base_docs)
bm25 = BM25Retriever.from_documents(base_chunks)
bm25.k = 12

# Reranker applied to the merged candidates; fp16 is requested only on CUDA.
reranker = FlagReranker(
    "BAAI/bge-reranker-v2-m3",
    use_fp16=(device == "cuda"),
)

def retrieve(query: str, topk:int=5):
    """Hybrid retrieval: vector + BM25 candidates, de-duplicated and reranked.

    Args:
        query: User question.
        topk: Maximum number of documents to return.

    Returns:
        Up to ``topk`` documents ordered from highest to lowest reranker score.
        An empty list when neither retriever returns anything.
    """
    # invoke() is the current retriever entry point; get_relevant_documents()
    # is its deprecated predecessor.
    vec_docs = vec_retriever.invoke(query)
    bm_docs = bm25.invoke(query)

    # Merge both result lists, keeping the first occurrence of each
    # (first 200 characters of content, source) pair.
    pool, seen = [], set()
    for d in list(vec_docs) + list(bm_docs):
        key = (d.page_content[:200], d.metadata.get("source"))
        if key in seen:
            continue
        seen.add(key)
        pool.append(d)

    # Nothing retrieved: do not call the reranker with an empty batch.
    if not pool:
        return []

    pairs = [[query, d.page_content] for d in pool]
    raw_scores = reranker.compute_score(pairs, normalize=True)
    # A reranker may hand back a bare number for a single pair; normalise to a
    # list so indexing below always works.
    try:
        scores = list(raw_scores)
    except TypeError:
        scores = [raw_scores]

    order = sorted(range(len(pool)), key=lambda i: -scores[i])
    return [pool[i] for i in order[:topk]]

In [9]:
from langchain_ollama import ChatOllama

# Chat model used to write the answers; the bare name on the last line makes
# the notebook display the configured object.
llm = ChatOllama(
    model="gpt-oss:20b",
)
llm

ChatOllama(model='gpt-oss:20b')

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text (kept in Spanish on purpose): answer only from the supplied
# context and say so explicitly when the context lacks the requested data.
# Placeholders: {context} and {question}.
PROMPT_TEMPLATE = """Eres un asesor experto en autos para Perú y responde en Español siempre.
Responde SOLO con el contexto. Especifica versión/año si aplica.
Si no está en el contexto, di claramente que no tienes ese dato.

Contexto:
{context}

Pregunta: {question}
Respuesta:"""

PROMPT = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

def format_sources(docs):
    """List the distinct sources of ``docs`` in first-seen order.

    Each document's ``metadata["source"]`` is shown relative to
    ``PROCESSED_DIR`` when it lies under that directory, and unchanged
    otherwise. Documents without a source are ignored.

    Args:
        docs: Documents exposing a ``metadata`` dict.

    Returns:
        A list of unique source strings.
    """
    out = []
    for d in docs:
        src = d.metadata.get("source","")
        if not src:
            continue
        try:
            # relative_to() strips the prefix only when the path really starts
            # with PROCESSED_DIR. The previous textual replace() removed the
            # prefix wherever it appeared in the string and relied on "/" being
            # the path separator.
            src_rel = Path(src).relative_to(PROCESSED_DIR).as_posix()
        except ValueError:
            # Not under PROCESSED_DIR: show the path as it is.
            src_rel = str(Path(src))
        except TypeError:
            # Source is not path-like: pass it through untouched.
            src_rel = src
        out.append(src_rel)
    return list(dict.fromkeys(out))  # unique, order preserved

def ask(question: str):
    """Answer ``question`` from the indexed documents.

    Retrieves the five best documents, joins their text (each capped at 3000
    characters) with ``---`` separators as the context, fills the prompt and
    sends it to the LLM.

    Returns:
        ``(answer, cites)``: the model's reply text and the list of sources of
        the retrieved documents.
    """
    retrieved = retrieve(question, topk=5)
    snippets = [doc.page_content[:3000] for doc in retrieved]
    prompt_text = PROMPT.format(context="\n\n---\n\n".join(snippets), question=question)
    reply = llm.invoke(prompt_text)
    return reply.content, format_sources(retrieved)


ModuleNotFoundError: No module named 'langchain.prompts'

In [None]:
# Sample question: run the whole pipeline and print the answer with its sources.
q = "Cual es la capacidad total que tiene para poner GLP en un Toyota Corolla?"
ans, cites = ask(q)
print(ans, "\n\nFuentes:")
for source in cites:
    print(" -", source)


hasta 52 litros. 

Fuentes:
 - Volkswagen/Ficha-Tecnica-Amarok-2025.md
 - Toyota/CATALOGO_COROLLA_PERU.md
