In [7]:
from surya.detection import batch_text_detection
from surya.layout import batch_layout_detection
from surya.model.detection.segformer import load_model, load_processor
from surya.settings import settings
from PIL import Image
import os
import io
import fitz
import matplotlib.pyplot as plt
from typing import List, Tuple
from pathlib import Path

## PDF Layout Detection and Processing Tool

A robust tool for processing PDFs to detect and extract text/layout elements using 
the Surya detection framework. The script handles multiple PDFs, extracting and analyzing
each page while maintaining a structured output format.
  
The script expects PDFs in a `paper_example` directory and creates separate output 
directories for each processed PDF.

In [8]:
# Initialize models and processors for text detection and layout recognition
det_processor = load_processor()
det_model = load_model()
rec_model = load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
rec_processor = load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)

Loaded detection model vikp/surya_det2 on device cpu with dtype torch.float32
Loaded detection model vikp/surya_layout2 on device cpu with dtype torch.float32


In [9]:
def load_pdf_pages(pdf_path: str) -> List[Image.Image]:

    """
    Convert PDF pages to PIL Images for processing.
    Uses PyMuPDF to handle PDF conversion while maintaining image quality.
    """
    
    pdf_document = fitz.open(pdf_path)
    images = []
    for page in pdf_document:
        # Convert page to pixmap for high-quality rendering
        pix = page.get_pixmap()
        # Convert to PNG format for consistent image processing
        img = Image.open(io.BytesIO(pix.tobytes(output="png")))
        images.append(img)
    return images

In [None]:
def process_images(images: List[Image.Image],
                  det_model, det_processor,
                  rec_model, rec_processor,
                  output_dir: Path) -> None:
    
    """
    Two-stage pipeline for document analysis:
    1. Text region detection using detection model
    2. Layout analysis of detected regions
    
    Processes each image through both stages and saves results.
    """
    
    for i, image in enumerate(images):
        try:
            # Detect text regions in the image
            line_predictions = batch_text_detection([image], det_model, det_processor)
            
            # Analyze layout structure of detected regions
            layout_predictions = batch_layout_detection(
                [image], rec_model, rec_processor, line_predictions
            )
            layout = layout_predictions[0]
            
            # Save detected regions if bounding boxes exist
            if layout.bboxes:
                save_images(image, i, layout.bboxes, output_dir)
        except Exception as e:
            print(f"Error processing image {i}: {str(e)}")

In [11]:
def save_images(image: Image.Image, page_num: int, boxes: List[any], output_dir: Path) -> None:

    """
    Extract and save detected figures from document.
    Creates separate PNG files for each detected figure region.
    """
    
    for i, box in enumerate(boxes):
        if box.label == "Figure":
            output_path = output_dir / f'image_page{page_num}_{i}.png'
            bbox = box.bbox
            # Crop and save the figure using bounding box coordinates
            fig = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
            fig.save(output_path)

In [12]:
# Validate input directory exists
pdf_dir = Path("paper_example")
if not pdf_dir.exists():
    raise FileNotFoundError(f"PDF directory {pdf_dir} not found")

# Process all PDFs in the directory
for pdf_file in pdf_dir.glob("*.pdf"):
    try:
        # Create separate output directory for each PDF's extracted images
        output_dir = pdf_file.with_suffix('') / "cropped_images"
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Process PDF pages and extract figures
        images = load_pdf_pages(str(pdf_file))
        process_images(images, det_model, det_processor,
                      rec_model, rec_processor, output_dir)
        print(f"Processed {pdf_file}")
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")

Detecting bboxes: 100%|██████████| 1/1 [00:15<00:00, 15.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.77s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:11<00:00, 11.86s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:13<00:00, 13.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.01s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:14<00:00, 14.94s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:09<00:00,  9.12s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:14<00:00, 14.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.48s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:14<00:00, 14.97s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.34s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.10s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/9.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:14<00:00, 14.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:10<00:00, 10.30s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:13<00:00, 13.50s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.59s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:16<00:00, 16.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.63s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:14<00:00, 14.18s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.58s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.87s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.23s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:13<00:00, 13.68s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.26s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:13<00:00, 13.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.25s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/8.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.16s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.50s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:13<00:00, 13.50s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:10<00:00, 10.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.04s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.92s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.42s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.67s/it]


Processed paper_example/16.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.07s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.78s/it]


Processed paper_example/17.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.21s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.11s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/15.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.06s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/14.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.52s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.45s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.63s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/10.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.30s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.91s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.34s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Processed paper_example/11.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/13.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.14s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.04s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.27s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.07s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]


Processed paper_example/12.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.77s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.58s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.27s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.36s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Processed paper_example/20.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.17s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.03s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.27s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.36s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.84s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]


Processed paper_example/21.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.85s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.22s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.84s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.14s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.17s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]


Processed paper_example/19.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.18s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.92s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.13s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.11s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


Processed paper_example/18.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.10s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.78s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.90s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.78s/it]


Processed paper_example/6.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.60s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
Detecting bboxes: 100%|██████████| 1/1 [

Processed paper_example/7.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.63s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]
Detecting bboxes: 100%|██████████| 1/1 [04:14<00:00, 254.61s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:34<00:00, 1054.63s/it]
Detecting bboxes: 100%|██████████| 1/1 [15:37<00:00, 937.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [06:10<00:00, 370.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:32<00:00, 1052.19s/it]
Detecting bboxes: 100%|██████████| 1/1 [15:41<00:00, 941.13s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:39<00:00, 999.17s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:29<00:00, 989.87s/it]
Detecting bboxes: 100%|██████████| 1/1 [12:15<00:00, 735.80s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:26<00:00, 986.95s/it]
Detecting bboxes: 100%|██████████| 1/1 [15:27<00:00, 927.94s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:57<00:00, 1077.52s/it]


Processed paper_example/5.pdf


Detecting bboxes: 100%|██████████| 1/1 [11:42<00:00, 702.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:59<00:00, 59.31s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:43<00:00, 1003.37s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:55<00:00, 1075.18s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:47<00:00, 1007.41s/it]
Detecting bboxes: 100%|██████████| 1/1 [11:41<00:00, 701.48s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:47<00:00, 1067.22s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.56s/it]
Detecting bboxes: 100%|██████████| 1/1 [32:44<00:00, 1964.27s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [32:06<00:00, 1926.70s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.31s/it]
Detecting bboxes: 100%|██████████| 1/1 [15:47<00:00, 947.14s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:20<00:00, 1040.08s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:15<00:00, 975.26s/it]
Detecting bboxes: 100%

Processed paper_example/4.pdf


Detecting bboxes: 100%|██████████| 1/1 [10:32<00:00, 632.85s/it]
Detecting bboxes: 100%|██████████| 1/1 [10:23<00:00, 623.42s/it]
Detecting bboxes: 100%|██████████| 1/1 [11:21<00:00, 681.45s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]
Detecting bboxes: 100%|██████████| 1/1 [09:40<00:00, 580.85s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:11<00:00, 1031.98s/it]
Detecting bboxes: 100%|██████████| 1/1 [05:28<00:00, 328.09s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:14<00:00, 1034.40s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:10<00:00, 1030.36s/it]
Detecting bboxes: 100%|██████████| 1/1 [17:20<00:00, 1040.30s/it]
Detecting bboxes: 100%|██████████| 1/1 [16:25<00:00, 985.89s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.03s/it]
Detecting bboxes: 100%|███

Processed paper_example/1.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.78s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00,  5.36s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:12<00:00, 12.00s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:10<00:00, 10.41s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:11<00:00, 11.38s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.90s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.85s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.85s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.07s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it]


Processed paper_example/3.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.62s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:03<00:00,  3.98s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.91s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.40s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.46s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.76s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


Processed paper_example/2.pdf


## Image File Cleanup

The following code removes images smaller than 4KB from specified directories.

In [16]:
def should_keep_image(image_path: Path, min_size_kb: int = 4) -> bool:
   
   """Check if image is larger than minimum size in KB"""
   
   try:
       size_kb = os.path.getsize(image_path) / 1024
       return size_kb > min_size_kb
   except Exception as e:
       print(f"Error checking size of {image_path}: {e}")
       return False

In [17]:
def cleanup_small_images(base_dir: Path):
   
   """Remove images smaller than 3KB from all subdirectories"""
   
   for pdf_dir in base_dir.glob("*/cropped_images"):
       for image_path in pdf_dir.glob("*.png"):
           if not should_keep_image(image_path):
               image_path.unlink()
               print(f"Removed {image_path}")

In [18]:
base_dir = Path("paper_example")
cleanup_small_images(base_dir)

Removed paper_example/6/cropped_images/image_page0_6.png
Removed paper_example/10/cropped_images/image_page0_8.png
Removed paper_example/19/cropped_images/image_page0_6.png
Removed paper_example/3/cropped_images/image_page0_6.png
