In [1]:
import re
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import layoutparser as lp
from layoutparser.elements import Rectangle
from dataclasses import dataclass
from typing import List, Dict

In [3]:
# -------------------------
# Configuraci√≥n
# -------------------------
RENDER_DPI = 200
NUM_COLUMNS_HINT = 1
MERGE_VGAP = 15

# Regex para detectar encabezados de art√≠culos
RE_ARTICULO = re.compile(
    r'^(art[√≠i]culo|art\.)\s*([0-9]+(?:\s*(bis|ter|quater))?)',
    re.IGNORECASE
)

# Layout detection using basic image processing (detectron2 alternative)
# Since detectron2 installation failed, we'll use a simpler approach
class SimpleLayoutDetector:
    def __init__(self):
        self.label_map = {0:"text",1:"title",2:"list",3:"table",4:"figure"}
    
    def detect(self, image_array):
        """
        Simple layout detection using basic image processing.
        This is a fallback when detectron2 is not available.
        """
        import cv2
        from layoutparser.elements import TextBlock, Rectangle
        
        # Convert PIL image to opencv format if needed
        if len(image_array.shape) == 3:
            gray = cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = image_array
            
        # Use simple contour detection to find text regions
        # Apply threshold
        _, thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)
        
        # Find contours
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        blocks = []
        h, w = gray.shape
        
        # If no contours found, create a single block covering the whole page
        if not contours:
            blocks.append(TextBlock(
                block=Rectangle(x_1=0, y_1=0, x_2=w, y_2=h),
                type="text",
                id=0,
                score=0.9
            ))
        else:
            # Filter and process contours
            for i, contour in enumerate(contours):
                x, y, w_cont, h_cont = cv2.boundingRect(contour)
                
                # Filter out very small regions
                if w_cont > 20 and h_cont > 10:
                    blocks.append(TextBlock(
                        block=Rectangle(x_1=x, y_1=y, x_2=x+w_cont, y_2=y+h_cont),
                        type="text",
                        id=i,
                        score=0.8
                    ))
        
        # Sort by reading order (top to bottom, left to right)
        blocks.sort(key=lambda b: (b.block.y_1, b.block.x_1))
        
        return blocks

# Initialize the simple detector
detector = SimpleLayoutDetector()

In [4]:
# -------------------------
# Clases de datos
# -------------------------
@dataclass
class Parrafo:
    texto: str
    bbox: tuple  # (x1,y1,x2,y2) en coordenadas PDF

@dataclass
class Articulo:
    numero: str
    parrafos: List[Parrafo]

In [5]:
# -------------------------
# Utilidades
# -------------------------
def page_to_image(page, dpi=RENDER_DPI):
    scale = dpi / 72.0
    mat = fitz.Matrix(scale, scale)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return img, scale

def detect_paragraph_blocks(pil_img: Image.Image, num_columns_hint=NUM_COLUMNS_HINT):
    layout = detector.detect(np.asarray(pil_img))
    blocks = [b for b in layout if b.type in ("text", "title")]
    w, h = pil_img.size

    # Orden de lectura
    if num_columns_hint <= 1:
        blocks.sort(key=lambda b: (b.block.y_1, b.block.x_1))
    else:
        col_w = w / num_columns_hint
        cols = {}
        for b in blocks:
            cx = (b.block.x_1 + b.block.x_2) / 2
            idx = min(int(cx // col_w), num_columns_hint-1)
            cols.setdefault(idx, []).append(b)
        blocks = sum([sorted(c, key=lambda b: (b.block.y_1, b.block.x_1))
                      for _, c in sorted(cols.items())], [])

    # Fusionar verticalmente bloques cercanos
    merged = []
    for b in blocks:
        if not merged:
            merged.append(b); continue
        prev = merged[-1]
        same_col = abs(prev.block.x_1 - b.block.x_1) < 20 and abs(prev.block.x_2 - b.block.x_2) < 20
        close_v = (b.block.y_1 - prev.block.y_2) <= MERGE_VGAP
        if same_col and close_v:
            new_rect = Rectangle(
                x_1=min(prev.block.x_1, b.block.x_1),
                y_1=min(prev.block.y_1, b.block.y_1),
                x_2=max(prev.block.x_2, b.block.x_2),
                y_2=max(prev.block.y_2, b.block.y_2),
            )
            merged[-1] = lp.TextBlock(block=new_rect, type=prev.type, id=prev.id, score=max(prev.score, b.score))
        else:
            merged.append(b)
    return merged

def extract_text_by_bbox(pdf_page: fitz.Page, bbox_img_space: Rectangle, scale: float) -> str:
    rect_pdf = fitz.Rect(
        bbox_img_space.x_1 / scale,
        bbox_img_space.y_1 / scale,
        bbox_img_space.x_2 / scale,
        bbox_img_space.y_2 / scale,
    )
    txt = pdf_page.get_textbox(rect_pdf)
    txt = txt.replace('\r\n', '\n').replace('\r', '\n').strip()
    # Limpieza b√°sica: unir l√≠neas que no terminan en puntuaci√≥n fuerte
    lines = [ln.strip() for ln in txt.split("\n") if ln.strip()]
    merged = []
    for ln in lines:
        if merged and not merged[-1].endswith(tuple(".;:?!)]")):
            merged[-1] = (merged[-1] + " " + ln).strip()
        else:
            merged.append(ln)
    return " ".join(merged).strip()

In [6]:
# -------------------------
# Flujo principal
# -------------------------
def extract_paragraphs(pdf_path: str) -> List[Parrafo]:
    doc = fitz.open(pdf_path)
    all_parrafos: List[Parrafo] = []
    for i, page in enumerate(doc, start=1):
        pil_img, scale = page_to_image(page, dpi=RENDER_DPI)
        blocks = detect_paragraph_blocks(pil_img, num_columns_hint=NUM_COLUMNS_HINT)
        for b in blocks:
            txt = extract_text_by_bbox(page, b.block, scale)
            if txt:
                rect_pdf = (
                    b.block.x_1/scale, b.block.y_1/scale,
                    b.block.x_2/scale, b.block.y_2/scale
                )
                all_parrafos.append(Parrafo(texto=txt, bbox=rect_pdf))
    return all_parrafos

def group_into_articulos(parrafos: List[Parrafo]) -> List[Articulo]:
    articulos: List[Articulo] = []
    current_art = None

    for p in parrafos:
        m = RE_ARTICULO.match(p.texto)
        if m:
            # Nuevo art√≠culo
            num = m.group(2)
            current_art = Articulo(numero=num, parrafos=[p])
            articulos.append(current_art)
        elif current_art is not None:
            current_art.parrafos.append(p)
        else:
            # Texto previo a primer art√≠culo (puede ser considerandos, etc.)
            continue
    return articulos

In [7]:
# -------------------------
# Funciones para guardar resultados
# -------------------------
import json
import pandas as pd
from datetime import datetime
import os

def save_results_to_files(articulos: List[Articulo], pdf_name: str, output_dir: str = "../data/extracted"):
    """
    Guarda los resultados extra√≠dos en m√∫ltiples formatos.
    
    Args:
        articulos: Lista de art√≠culos extra√≠dos
        pdf_name: Nombre del archivo PDF procesado
        output_dir: Directorio donde guardar los archivos
    """
    # Crear directorio de salida si no existe
    os.makedirs(output_dir, exist_ok=True)
    
    # Timestamp para archivos √∫nicos
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = f"{pdf_name}_{timestamp}"
    
    # 1. Guardar como JSON estructurado
    json_data = {
        "metadata": {
            "source_pdf": pdf_name,
            "extraction_date": datetime.now().isoformat(),
            "total_articles": len(articulos),
            "total_paragraphs": sum(len(art.parrafos) for art in articulos)
        },
        "articles": []
    }
    
    for art in articulos:
        article_data = {
            "numero": art.numero,
            "paragraphs": []
        }
        for i, p in enumerate(art.parrafos):
            article_data["paragraphs"].append({
                "paragraph_id": i + 1,
                "text": p.texto,
                "bbox": p.bbox
            })
        json_data["articles"].append(article_data)
    
    json_path = os.path.join(output_dir, f"{base_name}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    print(f"‚úì JSON guardado: {json_path}")
    
    # 2. Guardar como CSV para an√°lisis
    rows = []
    for art in articulos:
        for i, p in enumerate(art.parrafos):
            rows.append({
                'article_number': art.numero,
                'paragraph_id': i + 1,
                'text': p.texto,
                'bbox_x1': p.bbox[0],
                'bbox_y1': p.bbox[1],
                'bbox_x2': p.bbox[2],
                'bbox_y2': p.bbox[3],
                'text_length': len(p.texto)
            })
    
    if rows:
        df = pd.DataFrame(rows)
        csv_path = os.path.join(output_dir, f"{base_name}.csv")
        df.to_csv(csv_path, index=False, encoding='utf-8')
        print(f"‚úì CSV guardado: {csv_path}")
    
    # 3. Guardar resumen en texto plano
    txt_path = os.path.join(output_dir, f"{base_name}_summary.txt")
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(f"RESUMEN DE EXTRACCI√ìN\n")
        f.write(f"==================\n\n")
        f.write(f"Archivo fuente: {pdf_name}\n")
        f.write(f"Fecha de extracci√≥n: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total de art√≠culos: {len(articulos)}\n")
        f.write(f"Total de p√°rrafos: {sum(len(art.parrafos) for art in articulos)}\n\n")
        
        for art in articulos:
            f.write(f"ART√çCULO {art.numero}\n")
            f.write(f"{'='*50}\n")
            for i, p in enumerate(art.parrafos, 1):
                f.write(f"[{i}] {p.texto}\n\n")
            f.write(f"\n")
    
    print(f"‚úì Resumen guardado: {txt_path}")
    
    return {
        'json': json_path,
        'csv': csv_path,
        'summary': txt_path
    }

def debug_article_detection(parrafos: List[Parrafo], limit: int = 20):
    """
    Funci√≥n de debug para ver qu√© p√°rrafos podr√≠an ser art√≠culos.
    """
    print(f"\nüîç DEBUGGING: Primeros {limit} p√°rrafos para detectar art√≠culos:")
    print("="*60)
    
    for i, p in enumerate(parrafos[:limit]):
        # Probar diferentes patrones
        matches = []
        
        # Patr√≥n actual
        if RE_ARTICULO.search(p.texto):
            matches.append("REGEX_ACTUAL")
        
        # Patrones alternativos
        if re.search(r'\bart[√≠i]culo\s+\d+', p.texto, re.IGNORECASE):
            matches.append("ARTICULO_NUM")
        
        if re.search(r'\bart\.?\s*\d+', p.texto, re.IGNORECASE):
            matches.append("ART_NUM")
        
        if re.search(r'^\s*\d+\s*[.-]', p.texto):
            matches.append("NUM_DASH")
            
        match_str = f" [{', '.join(matches)}]" if matches else ""
        
        print(f"{i+1:2d}: {p.texto[:100]}...{match_str}")
        
    print("="*60)


In [8]:
# -------------------------
# Ejemplo de uso con guardado de resultados
# -------------------------
def process_legal_document_with_save(pdf_path, save_results=True, debug=True):
    """
    Procesa un documento legal PDF, extrae art√≠culos estructurados y guarda los resultados.
    
    Args:
        pdf_path (str): Ruta al archivo PDF
        save_results (bool): Si guardar los resultados en archivos
        debug (bool): Si mostrar informaci√≥n de debug
        
    Returns:
        tuple: (articulos, saved_files_paths)
    """
    try:
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        print(f"Procesando: {pdf_path}")
        
        # Extraer p√°rrafos
        parrafos = extract_paragraphs(pdf_path)
        print(f"‚úì Extra√≠dos {len(parrafos)} p√°rrafos")
        
        # Debug: mostrar algunos p√°rrafos para entender el formato
        if debug and parrafos:
            debug_article_detection(parrafos, limit=15)
        
        # Agrupar en art√≠culos
        articulos = group_into_articulos(parrafos)
        print(f"‚úì Identificados {len(articulos)} art√≠culos")
        
        # Si no se encontraron art√≠culos, probar con patrones alternativos
        if len(articulos) == 0 and debug:
            print("‚ö†Ô∏è  No se encontraron art√≠culos con el patr√≥n actual.")
            print("Probando patrones alternativos...")
            
            # Probar patr√≥n m√°s simple
            alternative_articles = group_into_articulos_alternative(parrafos)
            if len(alternative_articles) > 0:
                print(f"‚úì Patr√≥n alternativo encontr√≥ {len(alternative_articles)} art√≠culos")
                articulos = alternative_articles
        
        # Guardar resultados si se solicita
        saved_paths = {}
        if save_results and articulos:
            saved_paths = save_results_to_files(articulos, pdf_name)
            print(f"üìÅ Resultados guardados en {len(saved_paths)} archivos")
        elif save_results:
            print("‚ö†Ô∏è  No se guardaron resultados (no se encontraron art√≠culos)")
        
        return articulos, saved_paths
        
    except Exception as e:
        print(f"‚úó Error procesando {pdf_path}: {e}")
        import traceback
        if debug:
            traceback.print_exc()
        return [], {}

def group_into_articulos_alternative(parrafos: List[Parrafo]) -> List[Articulo]:
    """
    Funci√≥n alternativa para agrupar art√≠culos con patrones m√°s flexibles.
    """
    articulos: List[Articulo] = []
    current_art = None

    # Patrones alternativos m√°s flexibles
    patterns = [
        re.compile(r'\b(art[√≠i]culo)\s+(\d+)', re.IGNORECASE),
        re.compile(r'\b(art\.?)\s*(\d+)', re.IGNORECASE),
        re.compile(r'^(\d+)[\.\-\s]', re.MULTILINE),  # N√∫meros al inicio
    ]

    for p in parrafos:
        found_article = False
        
        for pattern in patterns:
            match = pattern.search(p.texto)
            if match:
                # Extraer n√∫mero del art√≠culo
                if len(match.groups()) >= 2:
                    num = match.group(2)
                else:
                    num = match.group(1) if match.group(1).isdigit() else "?"
                
                current_art = Articulo(numero=num, parrafos=[p])
                articulos.append(current_art)
                found_article = True
                break
        
        if not found_article and current_art is not None:
            current_art.parrafos.append(p)
        elif not found_article and current_art is None:
            # Texto previo al primer art√≠culo
            continue
    
    return articulos


In [9]:
# -------------------------
# EJECUCI√ìN PRINCIPAL CON GUARDADO
# -------------------------

# Configuraci√≥n
SAVE_RESULTS = True  # Cambiar a False si no quieres guardar archivos
DEBUG_MODE = True    # Mostrar informaci√≥n de debug

# Buscar archivos PDF en la carpeta del proyecto
project_root = "/Users/alexa/Projects/cdmx_kg"
pdf_files = []

# Buscar en m√∫ltiples ubicaciones
search_paths = [
    os.path.join(project_root, "pdfs"),
    os.path.join(project_root, "Mexico_City", "laws"),
    os.path.join(project_root, "Mexico_City", "laws_1"),
    project_root
]

for search_path in search_paths:
    if os.path.exists(search_path):
        for root, dirs, files in os.walk(search_path):
            for file in files:
                if file.endswith('.pdf'):
                    pdf_files.append(os.path.join(root, file))

if pdf_files:
    print(f"üìÑ Encontrados {len(pdf_files)} archivos PDF:")
    for i, pdf in enumerate(pdf_files[:5], 1):  # Mostrar primeros 5
        print(f"  {i}. {os.path.basename(pdf)}")
    if len(pdf_files) > 5:
        print(f"  ... y {len(pdf_files) - 5} m√°s")
    
    print(f"\nüöÄ Procesando el primer archivo con guardado autom√°tico...")
    print("="*60)
    
    # Procesar el primer PDF con guardado
    articulos, saved_files = process_legal_document_with_save(
        pdf_files[0], 
        save_results=SAVE_RESULTS,
        debug=DEBUG_MODE
    )
    
    # Mostrar resumen de resultados
    print(f"\nüìä RESUMEN FINAL:")
    print(f"‚úì Art√≠culos extra√≠dos: {len(articulos)}")
    print(f"‚úì Total de p√°rrafos: {sum(len(art.parrafos) for art in articulos)}")
    
    if saved_files:
        print(f"\nüìÅ Archivos guardados:")
        for file_type, path in saved_files.items():
            print(f"  ‚Ä¢ {file_type.upper()}: {path}")
    
    # Mostrar preview de los primeros art√≠culos
    if articulos:
        print(f"\nüìñ PREVIEW (primeros 3 art√≠culos):")
        print("="*60)
        for a in articulos[:3]:
            print(f"\nüîπ ART√çCULO {a.numero} ({len(a.parrafos)} p√°rrafos)")
            for i, p in enumerate(a.parrafos[:2], 1):  # Solo primeros 2 p√°rrafos por art√≠culo
                text_preview = p.texto[:150] + '...' if len(p.texto) > 150 else p.texto
                print(f"   [{i}] {text_preview}")
            if len(a.parrafos) > 2:
                print(f"   ... y {len(a.parrafos) - 2} p√°rrafos m√°s")
    else:
        print(f"\n‚ö†Ô∏è  No se pudieron extraer art√≠culos del documento.")
        print("Esto puede deberse a:")
        print("‚Ä¢ Formato de documento no est√°ndar")
        print("‚Ä¢ Patrones de art√≠culos diferentes")
        print("‚Ä¢ Problemas en la detecci√≥n de layout")
        
else:
    print("‚ùå No se encontraron archivos PDF en el proyecto.")
    print("Ubicaciones buscadas:")
    for path in search_paths:
        exists = "‚úì" if os.path.exists(path) else "‚úó"
        print(f"  {exists} {path}")
    print("\nPara probar el c√≥digo, coloca un archivo PDF en alguna de estas carpetas.")


üìÑ Encontrados 2 archivos PDF:
  1. LEY_DE_EDUCACION_DE_LA_CDMX_3.4.pdf
  2. LEY_DE_EDUCACION_DE_LA_CDMX_3.4.pdf

üöÄ Procesando el primer archivo con guardado autom√°tico...
Procesando: /Users/alexa/Projects/cdmx_kg/pdfs/LEY_DE_EDUCACION_DE_LA_CDMX_3.4.pdf
‚úì Extra√≠dos 251 p√°rrafos

üîç DEBUGGING: Primeros 15 p√°rrafos para detectar art√≠culos:
 1: M...
 2: LA...
 3: CAD...
 4: LA...
 5: m...
 6: m...
 7: CAC...
 8: LA...
 9: RAL...
10: CA...
11: LA...
12: RA....
13: LAU...
14: BAU...
15: LAT...
‚úì Identificados 0 art√≠culos
‚ö†Ô∏è  No se encontraron art√≠culos con el patr√≥n actual.
Probando patrones alternativos...
‚ö†Ô∏è  No se guardaron resultados (no se encontraron art√≠culos)

üìä RESUMEN FINAL:
‚úì Art√≠culos extra√≠dos: 0
‚úì Total de p√°rrafos: 0

‚ö†Ô∏è  No se pudieron extraer art√≠culos del documento.
Esto puede deberse a:
‚Ä¢ Formato de documento no est√°ndar
‚Ä¢ Patrones de art√≠culos diferentes
‚Ä¢ Problemas en la detecci√≥n de layout
