In [18]:
from pathlib import Path

In [19]:
from langchain_core import documents
BASE = Path("/home/pibezx/Documents/Proyectos")
PDF_DIR = BASE
PROCESSED_DIR = BASE / "data" / "processed"
INDEX_DIR = BASE / "index" / "chroma_autos"

for p in [PROCESSED_DIR, INDEX_DIR]:
    p.mkdir(parents=True, exist_ok=True)

SKIP_DIR_NAMES = {"data", "index", ".git", ".ipynb_checkpoints", ".idea", ".venv", "env", "venv"}

def should_skip(path: Path) -> bool:
    return any(part.startswith(".") or part in SKIP_DIR_NAMES for part in path.parts)

# Descubrir PDFs
candidates = []
for p in PDF_DIR.rglob("*.pdf"):
    if should_skip(p):
        continue
    if PROCESSED_DIR in p.parents:
        continue
    candidates.append(p.resolve())

print("PDFs detectados:", len(candidates))
for i, p in enumerate(candidates, 1):
    print(f"{i:>2}. {p} | exists={p.exists()}")


PDFs detectados: 2
 1. /home/pibezx/Documents/Proyectos/Toyota/CATALOGO_COROLLA_PERU.pdf | exists=True
 2. /home/pibezx/Documents/Proyectos/Volkswagen/Ficha-Tecnica-Amarok-2025.pdf | exists=True


In [20]:
import json,os
from pathlib import Path
from typing import List
from pydantic import BaseModel
from docling.document_converter import DocumentConverter

SKIP_DIR_NAMES = {"data", "index", ".git", ".ipynb_checkpoints", ".idea", ".venv", "env", "venv"}

class ParseResult(BaseModel):
    pdf_path: str
    md_path: str
    json_meta_path: str
    chars : int
    used : str   #docling

def rel_to_base(path: Path, base: Path) -> Path:
    # Devuelve path relativo a base sin reventar si no es subpath directo
    try:
        return path.relative_to(base)
    except Exception:
        return Path(os.path.relpath(path, base))

def convert_pdf_docling(pdf_path : Path):
    conv = DocumentConverter()
    res = conv.convert(str(pdf_path))
    md_text = res.document.export_to_markdown()
    meta = res.document.as_dict() if hasattr(res.document, "as_dict") else {"note":"no-as_dict"}
    return md_text, meta

def convert_pdf(pdf_path : Path,out_md: Path, out_json: Path):
    md_text, meta = convert_pdf_docling(pdf_path)
    out_md.parent.mkdir(parents=True, exist_ok=True)
    out_md.write_text(md_text, encoding="utf-8")
    out_json.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    return ParseResult(pdf_path=str(pdf_path), md_path=str(out_md), json_meta_path=str(out_json),
                       chars=len(md_text), used="docling")

def ingest_all(pdf_list: List[Path], processed_root: Path) -> List[ParseResult]:
    results = []
    for pdf in sorted(pdf_list):          # <<< iteramos por CADA Path
        if not pdf.exists():
            print(f"[WARN] No existe: {pdf}")
            continue
        rel = rel_to_base(pdf, PDF_DIR)   # relativo a la raíz del proyecto
        out_md = processed_root / rel.with_suffix(".md")
        out_json = processed_root / rel.with_suffix(".json")
        r = convert_pdf(pdf, out_md, out_json)
        print(f"✓ docling  {rel} -> data/processed/{rel.with_suffix('.md')} ({r.chars} chars)")
        results.append(r)
    return results


ingest_summary = ingest_all(candidates, PROCESSED_DIR)
print(f"\nListo {len(ingest_summary)} PDFs procesados.")

✓ docling  Toyota/CATALOGO_COROLLA_PERU.pdf -> data/processed/Toyota/CATALOGO_COROLLA_PERU.md (39324 chars)
✓ docling  Volkswagen/Ficha-Tecnica-Amarok-2025.pdf -> data/processed/Volkswagen/Ficha-Tecnica-Amarok-2025.md (39315 chars)

Listo 2 PDFs procesados.


In [21]:
from langchain_text_splitters import MarkdownTextSplitter

from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import torch

# Para aprovechar la estructura Markdown
markdown_splitter = MarkdownTextSplitter(chunk_size=400,chunk_overlap=100
)

#ahora viene lo chido
def load_md_documents(processed_root: Path):
    docs = []
    for md in processed_root.rglob("*.md"):
        docs.extend(TextLoader(str(md), encoding="utf-8").load())
    return docs

docs = load_md_documents(PROCESSED_DIR)
chunks = markdown_splitter.split_documents(docs)



In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create a context manager to help with GPU memory management
class GPUMemoryManager:
    def __enter__(self):
        # Nothing to do on enter
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Clean up CUDA cache on exit if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def clear(self):
        # Manually clear GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Create a global instance for use throughout the notebook
gpu_memory_mgr = GPUMemoryManager()

In [23]:
# Configure tokenizer to avoid warnings
import transformers
transformers.logging.set_verbosity_error()  # Reduce warning messages

embeddings = HuggingFaceEmbeddings(
    model_name = "intfloat/multilingual-e5-base",
    model_kwargs = {"device": device}
)


In [24]:
vector_db = Chroma.from_documents(
    documents= chunks,
    embedding = embeddings,
    persist_directory=str(INDEX_DIR)
)
#vector_db.persist()
len(chunks)

283

In [25]:
from langchain_community.retrievers import BM25Retriever
from FlagEmbedding import FlagReranker

#vector_db = Chroma(persist_directory=str(INDEX_DIR), embedding_function=embeddings) #error de versiones deprecated

base_docs = load_md_documents(PROCESSED_DIR)
base_chunks = markdown_splitter.split_documents(base_docs)
bm25 = BM25Retriever.from_documents(base_chunks)
bm25.k = 12

vec_retriever = vector_db.as_retriever(search_kwargs={"k":12})

reranker = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=(device == "cuda"))

def retrieve(query: str, topk:int=2):
    vec_docs = vec_retriever.invoke(query)
    bm_docs = bm25.invoke(query)
    # merge + dedupe por (contenido corto, fuente)
    pool, seen = [], set()
    for d in vec_docs + bm_docs:
        key = (d.page_content[:200], d.metadata.get("source"))
        if key not in seen:
            pool.append(d); seen.add(key)
    # rerank
    pairs = [[query, d.page_content] for d in pool]
    scores = reranker.compute_score(pairs, normalize=True)
    order = sorted(range(len(pool)), key=lambda i: -scores[i])
    return [pool[i] for i in order[:topk]]

In [26]:
from langchain_ollama import ChatOllama

llm = ChatOllama(model = "llama3.2:3b")

In [27]:
from openai import OpenAI

cliente = OpenAI(api_key="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", base_url="https://openrouter.ai/api/v1")

In [34]:


def format_sources(docs):
    out = []
    for d in docs:
        src = d.metadata.get("source","")
        if src:
            try:
                src_rel = str(Path(src)).replace(str(PROCESSED_DIR)+"/", "")
            except Exception:
                src_rel = src
            out.append(src_rel)
    return list(dict.fromkeys(out))  # únicos

def ask(question: str):
    docs = retrieve(question, topk=1)
    context = "\n\n---\n\n".join(d.page_content[:3000] for d in docs)
    
    # Usar la API externa en lugar de Ollama
    try:
        response = cliente.chat.completions.create(
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": "Eres un asesor experto en autos para Perú y responde en Español siempre. "
                                            "Responde SOLO con el contexto proporcionado. "
                                            "Especifica versión/año si aplica. "
                                            "Si no está en el contexto, di claramente que no tienes ese dato."},
                {"role": "user", "content": f"Contexto:\n{context}\n\nPregunta: {question}"}
            ],
            temperature=0.3,
            max_tokens=800
        )
        answer = response.choices[0].message.content
    except Exception as e:
        answer = f"Error al procesar la consulta: {str(e)}"
    
    cites = format_sources(docs)
    return answer, cites


In [35]:
q = "Cuales son las medidas o dimensiones que tiene la Volkswagen Amarok? "
ans, cites = ask(q)
print(ans, "\n\nFuentes:")
for c in cites:
    print(" -", c)


Según el contexto proporcionado, las dimensiones de la Volkswagen Amarok mencionadas son:

- **Altura**: 1,455 mm  
- **Ancho**: 1,780 mm  
- **Longitud**: 4,630 mm  
- **Distancia al suelo**: 148 mm  

*Nota*: Los asteriscos (*) en "Altura" y "Distancia al suelo" están presentes en el contexto original, pero no se incluye información adicional sobre su significado. No se especifica la versión o año del modelo al que corresponden estas medidas. Si necesitas detalles técnicos adicionales (como peso, batalla, capacidad de carga, etc.), no están disponibles en el contexto compartido. 

Fuentes:
 - Toyota/CATALOGO_COROLLA_PERU.md
