In [None]:
import io
import os
from pathlib import Path
from typing import Any, List, Tuple

import fitz
import matplotlib.pyplot as plt
from PIL import Image
from surya.detection import batch_text_detection
from surya.layout import batch_layout_detection
from surya.model.detection.segformer import load_model, load_processor
from surya.settings import settings

## PDF Layout Detection and Processing Tool

A robust tool for processing PDFs: it detects text and layout elements using
the Surya detection framework and extracts the regions labelled as figures. The script
handles multiple PDFs, rendering and analyzing each page while maintaining a structured output format.
  
The script expects PDFs in a `paper_example` directory and creates a separate output
directory for each processed PDF at `paper_example/<pdf name>/cropped_images`.

In [None]:
# Text-line detection: model and processor loaded with the loader defaults
det_model = load_model()
det_processor = load_processor()

# Layout detection: model and processor loaded from the layout checkpoint
# (kept under the rec_ names that the processing cells pass around)
rec_processor = load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
rec_model = load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)

In [None]:
def load_pdf_pages(pdf_path: str) -> List[Image.Image]:
    """
    Render every page of a PDF to a PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One PIL image per page, in the order the document yields its pages.
    """
    images = []
    # The context manager closes the document even when rendering a page fails
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # Rendered with get_pixmap() defaults (no zoom or DPI override)
            pix = page.get_pixmap()
            # Round-trip through PNG bytes to obtain a regular PIL image
            img = Image.open(io.BytesIO(pix.tobytes(output="png")))
            images.append(img)
    return images

In [None]:
def process_images(images: List[Image.Image],
                  det_model, det_processor,
                  rec_model, rec_processor,
                  output_dir: Path) -> None:
    """
    Run text detection and then layout detection on each image, one at a time.

    The text-detection output is fed into the layout detection call. When the
    resulting layout has bounding boxes, they are handed to save_images along
    with the image and its index. A failure on one image is printed and the
    loop moves on to the next image.
    """
    for page_index, page_image in enumerate(images):
        try:
            # Stage 1: text regions for this single image
            text_lines = batch_text_detection([page_image], det_model, det_processor)

            # Stage 2: layout detection, given the stage-1 predictions
            page_layout = batch_layout_detection(
                [page_image], rec_model, rec_processor, text_lines
            )[0]

            # Nothing to save when the layout has no boxes
            if not page_layout.bboxes:
                continue
            save_images(page_image, page_index, page_layout.bboxes, output_dir)
        except Exception as e:
            print(f"Error processing image {page_index}: {str(e)}")

In [None]:
def save_images(image: Image.Image, page_num: int, boxes: List[Any], output_dir: Path) -> None:
    """
    Crop every box labelled "Figure" out of a page image and save it as a PNG.

    Args:
        image: Page image the boxes refer to.
        page_num: Page index used in the output file name.
        boxes: Layout boxes; each must expose `label` and an indexable `bbox`
            with at least four items (assumed to be left, upper, right, lower
            in pixel coordinates of `image` -- confirm against the layout model).
        output_dir: Directory that receives the PNG files; it is not created here.
    """
    for i, box in enumerate(boxes):
        if box.label != "Figure":
            continue
        # i counts all boxes, not only figures, so the numbers in the file
        # names may have gaps
        output_path = output_dir / f'image_page{page_num}_{i}.png'
        bbox = box.bbox
        # Crop and save the figure using bounding box coordinates
        fig = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
        fig.save(output_path)

In [None]:
# Validate that the input location exists and is a directory (a plain file
# with this name would otherwise pass the check and silently yield no PDFs)
pdf_dir = Path("paper_example")
if not pdf_dir.is_dir():
    raise FileNotFoundError(f"PDF directory {pdf_dir} not found")

# Process all PDFs in the directory; sorted() makes the processing order
# deterministic, since glob() does not promise any particular order
for pdf_file in sorted(pdf_dir.glob("*.pdf")):
    try:
        # Create separate output directory for each PDF's extracted images:
        # paper_example/<pdf name without suffix>/cropped_images
        output_dir = pdf_file.with_suffix('') / "cropped_images"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Process PDF pages and extract figures
        images = load_pdf_pages(str(pdf_file))
        process_images(images, det_model, det_processor,
                      rec_model, rec_processor, output_dir)
        print(f"Processed {pdf_file}")
    except Exception as e:
        # Best effort: report the failure and continue with the next PDF
        print(f"Error processing {pdf_file}: {str(e)}")

## Image File Cleanup

The following code removes PNG images of 4 KB or less from every `cropped_images` directory under `paper_example`.

In [None]:
def should_keep_image(image_path: Path, min_size_kb: int = 4) -> bool:
    """
    Tell whether an image file is large enough to keep.

    Args:
        image_path: File whose on-disk size is checked.
        min_size_kb: Threshold in KB (1 KB = 1024 bytes). The file must be
            strictly larger than this to be kept.

    Returns:
        True when the file is larger than `min_size_kb`, False when it is
        that size or smaller. If the size cannot be read, the error is
        printed and True is returned, so that callers which delete rejected
        files never delete a file whose size is unknown.
    """
    try:
        size_kb = os.path.getsize(image_path) / 1024
    except OSError as e:
        print(f"Error checking size of {image_path}: {e}")
        return True
    return size_kb > min_size_kb

In [None]:
def cleanup_small_images(base_dir: Path, min_size_kb: int = 4) -> None:
    """
    Delete undersized PNG files from every `<base_dir>/*/cropped_images` directory.

    Args:
        base_dir: Directory whose immediate subdirectories are searched for a
            `cropped_images` folder.
        min_size_kb: Size threshold in KB handed to should_keep_image; files
            it rejects are deleted.

    Each deletion is printed. A file that cannot be deleted is reported and
    skipped, so one failure does not stop the rest of the sweep.
    """
    for pdf_dir in base_dir.glob("*/cropped_images"):
        # Snapshot the listing first so files are not deleted while the
        # directory is still being iterated
        for image_path in list(pdf_dir.glob("*.png")):
            if should_keep_image(image_path, min_size_kb):
                continue
            try:
                image_path.unlink()
            except OSError as e:
                print(f"Error removing {image_path}: {e}")
            else:
                print(f"Removed {image_path}")

In [None]:
# Sweep every per-PDF cropped_images directory under paper_example and
# remove the PNG files that should_keep_image rejects (default threshold)
base_dir = Path("paper_example")
cleanup_small_images(base_dir)