# Fase 1 

In [None]:
import json
from pathlib import Path
from PIL import Image
import pytesseract
import fitz  # PyMuPDF

# ——— Path configuration ———
# PDF_ROOT_DIR: root folder that is searched recursively for input PDFs.
# IMG_DIR:      root folder where the rendered page images are written.
# RESULT_DIR:   root folder where the per-document OCR JSON files are written.
# NOTE(review): these are absolute, machine-specific Windows paths; adjust
# them before running on another machine.
PDF_ROOT_DIR = Path(r"C:\Users\juans\Documents\resource\historias\_01_historias")
IMG_DIR      = Path(r"C:\Users\juans\Documents\proarchitecg\version_2_docker\imagenes_por_doc")
RESULT_DIR   = Path(r"C:\Users\juans\Documents\proarchitecg\version_2_docker\ocr_por_doc")


In [None]:

# Point pytesseract at the Tesseract executable explicitly, for the case
# where it is not available on the PATH.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Create both output roots up front (no error if they already exist).
for _output_root in (IMG_DIR, RESULT_DIR):
    _output_root.mkdir(parents=True, exist_ok=True)

def convertir_pdf_a_imagenes(
    pdf_path: Path, out_dir: Path, dpi: int = 300
) -> list[tuple[int, Path]]:
    """Render every page of a PDF to a PNG file.

    Pages are saved as ``pagina_<n>.png`` inside *out_dir*, where ``n`` is the
    1-based page number. *out_dir* is created if it does not exist.

    Args:
        pdf_path: Path of the PDF document to open.
        out_dir: Directory that receives the page images.
        dpi: Rendering resolution passed to ``get_pixmap``.

    Returns:
        A list of ``(page_number, image_path)`` tuples in page order.
    """
    rutas: list[tuple[int, Path]] = []
    doc = fitz.open(pdf_path)
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        for num, page in enumerate(doc, start=1):
            img_path = out_dir / f"pagina_{num}.png"
            page.get_pixmap(dpi=dpi).save(str(img_path))
            rutas.append((num, img_path))
    finally:
        # Release the document even when rendering or saving a page fails,
        # so the PDF file handle is not leaked.
        doc.close()
    return rutas

def aplicar_ocr(img_path: Path, lang: str = "spa") -> str:
    """Run Tesseract OCR on an image and return the recognised text.

    This is best-effort: any failure (unreadable image, Tesseract error, ...)
    is reported on stdout and an empty string is returned, so a single bad
    page does not abort the caller.

    Args:
        img_path: Path of the image file to read.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The OCR text with leading/trailing whitespace stripped, or ``""`` if
        OCR failed.
    """
    try:
        # The context manager closes the image file as soon as OCR is done;
        # otherwise one file handle per page would stay open.
        with Image.open(img_path) as img:
            texto = pytesseract.image_to_string(img, lang=lang)
        return texto.strip()
    except Exception as e:  # deliberate catch-all: keep the batch running
        print(f"❌ Error OCR en {img_path.name}: {e}")
        return ""

def procesar_pdf(pdf_path: Path, rel_path: Path) -> None:
    """Convert one PDF to page images, OCR each page and store the result.

    Page images go to ``IMG_DIR / rel_path``. The OCR output is written to
    ``RESULT_DIR / rel_path`` with a ``.json`` suffix, as a list of objects
    with the keys ``pagina``, ``imagen`` and ``texto``.

    Args:
        pdf_path: Path of the PDF to process.
        rel_path: Path of the PDF relative to the input root; it determines
            where the images and the JSON file are placed.
    """
    print(f"📄 Procesando {rel_path} …")
    paginas = convertir_pdf_a_imagenes(pdf_path, IMG_DIR / rel_path)

    resultados = [
        {
            "pagina": num,
            "imagen": str(img_path),
            "texto": aplicar_ocr(img_path),
        }
        for num, img_path in paginas
    ]

    json_path = RESULT_DIR / rel_path.with_suffix(".json")
    json_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a temporary sibling file and rename it into place. The final
    # ".json" file therefore only ever exists complete; an interrupted run
    # cannot leave a truncated result that a later "already exists" check
    # would mistake for a finished document.
    tmp_path = json_path.with_name(json_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(resultados, f, ensure_ascii=False, indent=2)
    tmp_path.replace(json_path)

    print(f"✅ OCR completado: {rel_path}")

# ——— Walk the input tree and process every PDF ———
# A PDF whose JSON result already exists is reported and skipped, so the
# cell can be re-run without repeating finished work.
for pdf_path in PDF_ROOT_DIR.rglob("*.pdf"):
    rel = pdf_path.relative_to(PDF_ROOT_DIR)
    _resultado_json = RESULT_DIR / rel.with_suffix(".json")
    if not _resultado_json.exists():
        procesar_pdf(pdf_path, rel)
    else:
        print(f"⏩ Ya procesado: {rel}")

# ——— Show a sample of the output ———
# Pretty-print the first two page entries of the first JSON result found,
# or warn when the results folder contains no JSON files.
json_files = list(RESULT_DIR.rglob("*.json"))
if not json_files:
    print(f"⚠️ No se generaron resultados en {RESULT_DIR}")
else:
    muestra = json_files[0]
    print(f"\n📄 Primer resultado OCR: {muestra}")
    datos = json.loads(muestra.read_text(encoding="utf-8"))
    import pprint
    pprint.pprint(datos[:2])
