In [7]:
import os
from pathlib import Path

from langchain_core import documents

# Project root. It can be redirected with the PROYECTOS_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original location is used.
BASE = Path(os.environ.get("PROYECTOS_DIR", "/home/pibezx/Documents/Proyectos")).expanduser()
PDF_DIR = BASE  # PDFs are searched recursively starting at the project root
PROCESSED_DIR = BASE / "data" / "processed"  # converted markdown / JSON output
INDEX_DIR = BASE / "index" / "chroma_autos"  # directory handed to Chroma for persistence

# Create the output folders up front so later cells can write into them.
for p in [PROCESSED_DIR, INDEX_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Path components that exclude a file from PDF discovery.
SKIP_DIR_NAMES = {
    "data",
    "index",
    ".git",
    ".ipynb_checkpoints",
    ".idea",
    ".venv",
    "env",
    "venv",
}


def should_skip(path: Path) -> bool:
    """Tell whether ``path`` must be left out of PDF discovery.

    Returns True as soon as one component of ``path.parts`` either is listed
    in ``SKIP_DIR_NAMES`` or begins with a dot; returns False otherwise.
    """
    for component in path.parts:
        if component in SKIP_DIR_NAMES or component.startswith("."):
            return True
    return False

# Discover PDFs under PDF_DIR.
# The skip rules are applied to the path *relative to PDF_DIR*: only folders
# inside the project decide whether a file is ignored. Checking the absolute
# path would drop every PDF whenever a directory above PDF_DIR is hidden or
# happens to be named like an entry of SKIP_DIR_NAMES.
candidates = []
for p in PDF_DIR.rglob("*.pdf"):
    if should_skip(p.relative_to(PDF_DIR)):
        continue
    # Extra guard: never re-ingest anything stored under the output folder.
    if PROCESSED_DIR in p.parents:
        continue
    candidates.append(p.resolve())

# rglob yields files in no guaranteed order; sort so the listing is reproducible.
candidates.sort()

print("PDFs detectados:", len(candidates))
for i, p in enumerate(candidates, 1):
    print(f"{i:>2}. {p} | exists={p.exists()}")


PDFs detectados: 2
 1. /home/pibezx/Documents/Proyectos/Toyota/CATALOGO_COROLLA_PERU.pdf | exists=True
 2. /home/pibezx/Documents/Proyectos/Volkswagen/Ficha-Tecnica-Amarok-2025.pdf | exists=True


In [8]:
import json,os
from pathlib import Path
from typing import List
from pydantic import BaseModel
from docling.document_converter import DocumentConverter

# Re-declared with the same members as in the PDF discovery cell.
SKIP_DIR_NAMES = {
    "data",
    "index",
    ".git",
    ".ipynb_checkpoints",
    ".idea",
    ".venv",
    "env",
    "venv",
}

class ParseResult(BaseModel):
    """Summary of one PDF conversion: the input path, the files written and the markdown size."""

    pdf_path: str        # source PDF that was converted
    md_path: str         # markdown file written for the PDF
    json_meta_path: str  # JSON metadata file written for the PDF
    chars: int           # length of the exported markdown text
    used: str            # label of the conversion backend, e.g. "docling"

def rel_to_base(path: Path, base: Path) -> Path:
    """Return ``path`` expressed relative to ``base``.

    When ``path`` is located under ``base`` the result is the plain sub-path.
    Otherwise the function does not fail: it falls back to
    ``os.path.relpath``, whose result may contain ``..`` components.

    Args:
        path: Path to express relatively (a string is accepted as well).
        base: Directory the result is relative to (a string is accepted as well).

    Returns:
        The relative path as a ``Path``.
    """
    # Coerce first so string arguments keep working without relying on a
    # catch-all exception handler.
    path, base = Path(path), Path(base)
    try:
        return path.relative_to(base)
    except ValueError:
        # relative_to raises ValueError when ``path`` is not under ``base``;
        # any other error is a real problem and is allowed to propagate.
        return Path(os.path.relpath(path, base))

def convert_pdf_docling(pdf_path : Path):
    """Convert one PDF with Docling.

    Args:
        pdf_path: Location of the PDF to convert.

    Returns:
        A ``(md_text, meta)`` tuple: the document exported as markdown and a
        dictionary representation of the document. When the document offers
        no dictionary export, ``meta`` is ``{"note": "no-as_dict"}``.
    """
    conv = DocumentConverter()
    res = conv.convert(str(pdf_path))
    document = res.document
    md_text = document.export_to_markdown()

    # ``as_dict`` is tried first to keep the original behaviour where it
    # exists; ``export_to_dict`` is the dictionary export documented for
    # Docling documents. Without the second probe the metadata silently
    # degrades to the placeholder note.
    # NOTE(review): confirm the method name against the installed Docling version.
    for exporter_name in ("as_dict", "export_to_dict"):
        exporter = getattr(document, exporter_name, None)
        if callable(exporter):
            meta = exporter()
            break
    else:
        meta = {"note":"no-as_dict"}
    return md_text, meta

def convert_pdf(pdf_path : Path,out_md: Path, out_json: Path):
    """Convert a PDF and write its markdown and JSON metadata to disk.

    Args:
        pdf_path: PDF to convert.
        out_md: Destination of the markdown export (UTF-8).
        out_json: Destination of the metadata, serialised as indented JSON
            with non-ASCII characters kept as-is.

    Returns:
        A ``ParseResult`` holding the three paths, the markdown length in
        characters and the backend label ``"docling"``.
    """
    md_text, meta = convert_pdf_docling(pdf_path)

    # Both destinations get their folder created; previously only the markdown
    # folder was, so a JSON path in another directory failed on write.
    for target in (out_md, out_json):
        target.parent.mkdir(parents=True, exist_ok=True)

    out_md.write_text(md_text, encoding="utf-8")
    out_json.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    return ParseResult(pdf_path=str(pdf_path), md_path=str(out_md), json_meta_path=str(out_json),
                       chars=len(md_text), used="docling")

def ingest_all(pdf_list: List[Path], processed_root: Path, pdf_root: "Path | None" = None) -> List[ParseResult]:
    """Convert every existing PDF in ``pdf_list`` and mirror its layout under ``processed_root``.

    Args:
        pdf_list: PDFs to convert; they are processed in sorted order.
        processed_root: Folder that receives the ``.md`` and ``.json`` files.
        pdf_root: Directory the PDF paths are made relative to when building
            the output layout. Defaults to the notebook-level ``PDF_DIR``, which
            is what the function always used before this parameter existed.

    Returns:
        One ``ParseResult`` per converted PDF. Paths that do not exist are
        reported with a warning and skipped.
    """
    base = PDF_DIR if pdf_root is None else pdf_root
    results = []
    for pdf in sorted(pdf_list):          # iterate over EACH Path
        if not pdf.exists():
            print(f"[WARN] No existe: {pdf}")
            continue
        rel = rel_to_base(pdf, base)      # relative to the PDF root
        rel_md = rel.with_suffix(".md")
        out_md = processed_root / rel_md
        out_json = processed_root / rel.with_suffix(".json")
        r = convert_pdf(pdf, out_md, out_json)
        print(f"✓ docling  {rel} -> data/processed/{rel_md} ({r.chars} chars)")
        results.append(r)
    return results

# Convert every discovered PDF and report how many were processed.
ingest_summary = ingest_all(
    pdf_list=candidates,
    processed_root=PROCESSED_DIR,
)
print(f"\nListo. {len(ingest_summary)} PDFs procesados.")


Downloading detection model, please wait. This may take several minutes depending upon your network connection.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


✓ docling  Toyota/CATALOGO_COROLLA_PERU.pdf -> data/processed/Toyota/CATALOGO_COROLLA_PERU.md (39324 chars)
✓ docling  Volkswagen/Ficha-Tecnica-Amarok-2025.pdf -> data/processed/Volkswagen/Ficha-Tecnica-Amarok-2025.md (39315 chars)

Listo. 2 PDFs procesados.


In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import torch

#ahora viene lo chido
def load_md_documents(processed_root: Path):
    """Load every ``*.md`` file found recursively under ``processed_root``.

    Files are visited in sorted path order: ``rglob`` yields entries in no
    guaranteed order, and the order of the returned documents determines the
    order of the chunks built from them.

    Args:
        processed_root: Folder that holds the converted markdown files.

    Returns:
        A list with the documents produced by ``TextLoader`` for each file
        (read as UTF-8); empty when no markdown file exists.
    """
    docs = []
    for md in sorted(processed_root.rglob("*.md")):
        docs.extend(TextLoader(str(md), encoding="utf-8").load())
    return docs

# Load the processed markdown and cut it into overlapping chunks
# (chunk_size=500 with chunk_overlap=150 between neighbouring chunks).
docs = load_md_documents(processed_root=PROCESSED_DIR)
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=500,
)
chunks = splitter.split_documents(docs)



In [10]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [11]:
# Embedding model used for the vector index, loaded onto the selected device.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes; none are added here — confirm whether retrieval needs them.
embeddings = HuggingFaceEmbeddings(
    model_kwargs=dict(device=device),
    model_name="intfloat/multilingual-e5-base",
)


In [13]:
# Build deterministic ids: the chunk's source file plus its running position
# within that file. With the default random ids, every re-run of this cell
# appends a full second copy of all chunks to the persisted collection.
# NOTE(review): relies on the Chroma wrapper writing entries under the given
# ids so a re-run overwrites them — confirm for the installed version.
_chunks_seen_per_source = {}
chunk_ids = []
for chunk in chunks:
    source = chunk.metadata.get("source", "unknown")
    position = _chunks_seen_per_source.get(source, 0)
    _chunks_seen_per_source[source] = position + 1
    chunk_ids.append(f"{source}::{position}")

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=str(INDEX_DIR),
)
#vector_db.persist()
# Last expression of the cell: shows how many chunks were indexed.
len(chunks)

238

In [15]:
from langchain_community.retrievers import BM25Retriever
from FlagEmbedding import FlagReranker

#vector_db = Chroma(persist_directory=str(INDEX_DIR), embedding_function=embeddings) #error de versiones deprecated

# Keyword retriever: BM25 over chunks rebuilt from the processed markdown files.
base_docs = load_md_documents(PROCESSED_DIR)
base_chunks = splitter.split_documents(base_docs)
bm25 = BM25Retriever.from_documents(base_chunks)
bm25.k = 12  # same k as the vector retriever below

# Vector retriever on top of the Chroma index.
vec_retriever = vector_db.as_retriever(search_kwargs=dict(k=12))

# Reranker model; fp16 is enabled only when running on CUDA.
reranker = FlagReranker(
    "BAAI/bge-reranker-v2-m3",
    use_fp16=device == "cuda",
)
