Conversion de PDF a imagenes 
    

In [3]:
import os
from pdf2image import convert_from_path

def convert_pdf_to_images(pdf_path, output_dir, dpi=400):
    """
    Convierte todas las páginas de un PDF a imágenes PNG.
    Args:
        pdf_path (str): Ruta al archivo PDF.
        output_dir (str): Carpeta donde se guardarán las imágenes.
        dpi (int): Resolución para la conversión. 300 suele ser un buen valor.
    Returns:
        List[str]: Lista de rutas a las imágenes generadas.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # pages será una lista de PIL Images
    pages = convert_from_path(pdf_path, dpi=dpi)

    image_paths = []
    for i, page in enumerate(pages):
        image_name = f"page_{i+1}.png"
        out_path = os.path.join(output_dir, image_name)
        # Guardar la imagen en PNG
        page.save(out_path, "PNG")
        image_paths.append(out_path)

    return image_paths


Carga el modelo de TATR de Hugging Face

In [4]:
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForObjectDetection

model_name = "microsoft/table-transformer-detection"
processor = AutoImageProcessor.from_pretrained(model_name, revision = "no_timm")  #no_timm se usa para evitar problemas 
model = AutoModelForObjectDetection.from_pretrained(model_name, revision = "no_timm")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


TableTransformerForObjectDetection(
  (model): TableTransformerModel(
    (backbone): TableTransformerConvModel(
      (conv_encoder): TableTransformerConvEncoder(
        (model): ResNetBackbone(
          (embedder): ResNetEmbeddings(
            (embedder): ResNetConvLayer(
              (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
              (normalization): TableTransformerFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (encoder): ResNetEncoder(
            (stages): ModuleList(
              (0): ResNetStage(
                (layers): Sequential(
                  (0): ResNetBasicLayer(
                    (shortcut): Identity()
                    (layer): Sequential(
                      (0): ResNetConvLayer(
                        (convolution): Conv2d(64, 64, kernel_size=(3, 3), stride=(

Deteccion de tablas en cada imagen

In [5]:
import numpy as np

def detect_tables_in_image(image_path, confidence_threshold=0.3):
    """
    Detecta tablas en una imagen usando Table Transformer (detección).
    Args:
        image_path (str): Ruta a la imagen PNG/JPG.
        confidence_threshold (float): Umbral de confianza para filtrar predicciones.
    Returns:
        List[dict]: Lista de tablas detectadas, cada dict contiene 'score' y 'box' con coords (xmin, ymin, xmax, ymax).
    """
    image = Image.open(image_path).convert("RGB")
    # Preprocesar la imagen
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
    # Postprocesar
    result = processor.post_process_object_detection(outputs, threshold=confidence_threshold)[0]

    detections = []
    for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
        if label == 1:  # 1 es la clase 'table' en este modelo
            # box es [xmin, ymin, xmax, ymax]
            det = {
                "score": float(score.cpu().numpy()),
                "box": [float(x) for x in box.cpu().numpy()]
            }
            detections.append(det)
    return detections


In [6]:
def crop_table_images(image_path, detections, output_dir):
    """
    Recorta y guarda las regiones de tabla detectadas en 'detections'.
    Args:
        image_path (str): Ruta a la imagen original.
        detections (List[dict]): Lista de tablas con su 'box'.
        output_dir (str): Carpeta donde guardar los recortes.
    Returns:
        List[str]: Rutas de las imágenes recortadas de cada tabla.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image = Image.open(image_path).convert("RGB")
    table_paths = []
    for i, det in enumerate(detections):
        box = det["box"]  # [xmin, ymin, xmax, ymax]
        xmin, ymin, xmax, ymax = box
        # Recortar
        cropped = image.crop((xmin, ymin, xmax, ymax))
        table_path = os.path.join(output_dir, f"table_{i+1}.png")
        cropped.save(table_path, "PNG")
        table_paths.append(table_path)
    return table_paths


In [7]:
def process_pdf_for_tables(pdf_path, base_output_dir="output_tables", dpi=400):
    """
    Convierte PDF a imágenes, detecta tablas en cada página y recorta.
    """
    # 1) Convertir a imágenes
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pages_dir = os.path.join(base_output_dir, pdf_name, "pages")
    tables_dir = os.path.join(base_output_dir, pdf_name, "tables")

    page_images = convert_pdf_to_images(pdf_path, pages_dir, dpi=dpi)

    all_tables = []  # Para almacenar info de cada tabla
    for page_image in page_images:
        # 2) Detectar tablas
        detections = detect_tables_in_image(page_image, confidence_threshold=0.3)
        # 3) Recortar cada tabla detectada
        page_id = os.path.splitext(os.path.basename(page_image))[0]  # "page_1"
        page_tables_dir = os.path.join(tables_dir, page_id)
        table_paths = crop_table_images(page_image, detections, page_tables_dir)

        # Podrías almacenar metadatos (por ejemplo, la bounding box, etc.)
        for tpath, det in zip(table_paths, detections):
            all_tables.append({
                "pdf_name": pdf_name,
                "page": page_id,
                "score": det["score"],
                "box": det["box"],
                "table_image": tpath
            })

    return all_tables

# Ejemplo de ejecución
if __name__ == "__main__":
    pdf_file = "/home/pibezx/Documents/Proyectos/PaginaWeb_Automoviles/Chatbot_Cars/Mazda" 
    result_tables = process_pdf_for_tables(pdf_file, base_output_dir="l200_output", dpi=400)
    print("Tablas detectadas:", result_tables)


PDFPageCountError: Unable to get page count.
Syntax Warning: May not be a PDF file (continuing anyway)
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table


In [None]:
import os
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Configuración inicial del modelo (mantenemos esto igual)
model_name = "microsoft/table-transformer-detection"
processor = AutoImageProcessor.from_pretrained(model_name, revision="no_timm")
model = AutoModelForObjectDetection.from_pretrained(model_name, revision="no_timm")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Función para convertir PDF a imágenes (la misma)
def convert_pdf_to_images(pdf_path, output_dir, dpi=400):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pages = convert_from_path(pdf_path, dpi=dpi)
    image_paths = []
    for i, page in enumerate(pages):
        image_name = f"page_{i+1}.png"
        out_path = os.path.join(output_dir, image_name)
        page.save(out_path, "PNG")
        image_paths.append(out_path)
    return image_paths

# Función para detectar tablas (la misma)
def detect_tables_in_image(image_path, confidence_threshold=0.3):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    result = processor.post_process_object_detection(outputs, threshold=confidence_threshold)[0]
    detections = []
    for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
        if label == 1:
            detections.append({
                "score": float(score.cpu().numpy()),
                "box": [float(x) for x in box.cpu().numpy()]
            })
    return detections

# Función para recortar tablas (la misma)
def crop_table_images(image_path, detections, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    image = Image.open(image_path).convert("RGB")
    table_paths = []
    for i, det in enumerate(detections):
        xmin, ymin, xmax, ymax = det["box"]
        cropped = image.crop((xmin, ymin, xmax, ymax))
        table_path = os.path.join(output_dir, f"table_{i+1}.png")
        cropped.save(table_path, "PNG")
        table_paths.append(table_path)
    return table_paths

# Función modificada para procesar múltiples PDFs
def process_all_pdfs(root_dir, base_output_dir="output_tables", dpi=400):
    all_tables = []
    
    # Recorrer recursivamente todos los directorios
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                try:
                    print(f"Procesando: {pdf_path}")
                    # Procesar cada PDF
                    pdf_name = os.path.splitext(file)[0]
                    pages_dir = os.path.join(base_output_dir, pdf_name, "pages")
                    tables_dir = os.path.join(base_output_dir, pdf_name, "tables")
                    
                    # Convertir a imágenes
                    page_images = convert_pdf_to_images(pdf_path, pages_dir, dpi)
                    
                    # Procesar cada página
                    for page_image in page_images:
                        detections = detect_tables_in_image(page_image)
                        page_id = os.path.splitext(os.path.basename(page_image))[0]
                        page_tables_dir = os.path.join(tables_dir, page_id)
                        table_paths = crop_table_images(page_image, detections, page_tables_dir)
                        
                        # Guardar metadatos
                        for tpath, det in zip(table_paths, detections):
                            all_tables.append({
                                "pdf_name": pdf_name,
                                "page": page_id,
                                "table_image": tpath,
                                "metadata": det
                            })
                            
                except Exception as e:
                    print(f"Error procesando {pdf_path}: {str(e)}")
    
    return all_tables

# Ejecución principal
if __name__ == "__main__":
    # Especificar el directorio raíz que contiene tus carpetas con PDFs
    root_directory = "/home/pibezx/Documents/Proyectos/PaginaWeb_Automoviles/Chatbot_Cars"
    
    # Procesar todos los PDFs
    results = process_all_pdfs(
        root_dir=root_directory,
        base_output_dir="pdf_a_images",
        dpi=400
    )
    
    print(f"Procesamiento completado. Tablas detectadas: {len(results)}")