# PDF Layout Analysis using Layout-Parser

This notebook demonstrates how to apply layout detection to PDF documents using LayoutParser.
Adapted from the COCO layout annotations example to work with PDF files.

## Preparation

Before starting, make sure you have the required dependencies installed:

```bash
pip install pycocotools
pip install PyMuPDF  # for PDF processing
pip install layoutparser

# Install detectron2 from GitHub (recommended)
pip install git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2
```

### Alternative Installation Methods:
```bash
# Option 1: Try the bundled version first
pip install 'layoutparser[detectron2]'

# Option 2: If detectron2 fails, use paddledetection
pip install 'layoutparser[paddledetection]'

# Option 3: Manual wheel installation (if GitHub method fails)
pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html
```


In [1]:
# Import required libraries
import layoutparser as lp
import cv2
import numpy as np
import fitz  # PyMuPDF
from PIL import Image
import os
import io
from pathlib import Path
from IPython.display import Image as IPImage, display

# Try different import approaches for the model
try:
    from layoutparser.models import Detectron2LayoutModel
    MODEL_IMPORT_SUCCESS = True
    print("✅ Successfully imported Detectron2LayoutModel from layoutparser.models")
except ImportError:
    try:
        from layoutparser import Detectron2LayoutModel
        MODEL_IMPORT_SUCCESS = True
        print("✅ Successfully imported Detectron2LayoutModel from layoutparser")
    except ImportError:
        MODEL_IMPORT_SUCCESS = False
        print("⚠️  Detectron2LayoutModel not available. Will try alternative approach.")


✅ Successfully imported Detectron2LayoutModel from layoutparser.models


In [2]:
# Configuration
PDF_PATH = '../pdfs/LEY_DE_EDUCACION_DE_LA_CDMX_3.4.pdf'
OUTPUT_DIR = '../data/debug/'

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Color mapping for different layout elements (adapted from COCO example)
color_map = {
    'text':   'red',
    'title':  'blue', 
    'list':   'green',
    'table':  'purple',
    'figure': 'pink',
}


## Helper Functions

Adapted from the original COCO layout example


In [4]:
def pdf_to_images(pdf_path, dpi=150):
    """
    Convert PDF pages to images.
    
    Args:
        pdf_path (str): Path to the PDF file
        dpi (int): Resolution for conversion
    
    Returns:
        list: List of numpy arrays representing images
    """
    doc = fitz.open(pdf_path)
    images = []
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        # Convert to image with specified DPI
        mat = fitz.Matrix(dpi/72, dpi/72)  # 72 is the default DPI
        pix = page.get_pixmap(matrix=mat)
        
        # Convert to numpy array
        img_data = pix.tobytes("ppm")
        img = Image.open(io.BytesIO(img_data))
        img_array = np.array(img)
        
        # Convert RGB to BGR for OpenCV
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        images.append(img_bgr)
        
        # Save debug image
        debug_path = f'{OUTPUT_DIR}/page_{page_num + 1}_original.png'
        cv2.imwrite(debug_path, img_bgr)
        print(f"Saved page {page_num + 1} to {debug_path}")
    
    doc.close()
    return images


In [5]:
def load_coco_annotations(annotations, coco=None):
    """
    Convert COCO annotations to LayoutParser Layout objects.
    (Original function from COCO example)
    
    Args:
        annotations (List): List of COCO annotations for the current image
        coco (optional): COCO annotation object instance
    
    Returns:
        lp.Layout: LayoutParser Layout object
    """
    layout = lp.Layout()

    for ele in annotations:
        x, y, w, h = ele['bbox']

        layout.append(
            lp.TextBlock(
                block=lp.Rectangle(x, y, w+x, h+y),
                type=ele['category_id'] if coco is None else coco.cats[ele['category_id']]['name'],
                id=ele['id']
            )
        )

    return layout


In [6]:
def visualize_layout(image, layout, title="Layout Detection", save_path=None):
    """
    Visualize layout detection results.
    (Enhanced version of the COCO example visualization)
    
    Args:
        image (np.array): Input image
        layout (lp.Layout): Layout object with detected elements
        title (str): Title for the visualization
        save_path (str, optional): Path to save the visualization
    
    Returns:
        np.array: Visualization image
    """
    viz = lp.draw_box(
        image,
        [b.set(id=f'{b.type}/{b.score:.2f}' if hasattr(b, 'score') else f'{b.type}') for b in layout],
        color_map=color_map,
        show_element_id=True, 
        id_font_size=10,
        id_text_background_color='grey',
        id_text_color='white'
    )
    
    if save_path:
        cv2.imwrite(save_path, viz)
        print(f"Saved visualization to {save_path}")
    
    return viz


## Load and Convert PDF to Images


In [7]:
# Convert PDF to images
print(f"Converting PDF: {PDF_PATH}")
try:
    images = pdf_to_images(PDF_PATH, dpi=150)
    print(f"Successfully converted {len(images)} pages")
except Exception as e:
    print(f"Error converting PDF: {e}")
    images = []


Converting PDF: ../pdfs/LEY_DE_EDUCACION_DE_LA_CDMX_3.4.pdf
Saved page 1 to ../data/debug//page_1_original.png
Saved page 2 to ../data/debug//page_2_original.png
Saved page 3 to ../data/debug//page_3_original.png
Saved page 4 to ../data/debug//page_4_original.png
Saved page 5 to ../data/debug//page_5_original.png
Saved page 6 to ../data/debug//page_6_original.png
Saved page 7 to ../data/debug//page_7_original.png
Saved page 8 to ../data/debug//page_8_original.png
Saved page 9 to ../data/debug//page_9_original.png
Saved page 10 to ../data/debug//page_10_original.png
Saved page 11 to ../data/debug//page_11_original.png
Saved page 12 to ../data/debug//page_12_original.png
Saved page 13 to ../data/debug//page_13_original.png
Saved page 14 to ../data/debug//page_14_original.png
Saved page 15 to ../data/debug//page_15_original.png
Saved page 16 to ../data/debug//page_16_original.png
Saved page 17 to ../data/debug//page_17_original.png
Saved page 18 to ../data/debug//page_18_original.png
Save

In [8]:
# Check layoutparser installation and available models
print("🔍 Checking LayoutParser installation...")
print(f"LayoutParser version: {lp.__version__}")

# Check what's available in layoutparser
available_models = []
for attr in dir(lp):
    if 'Model' in attr:
        available_models.append(attr)

print(f"📋 Available model classes: {available_models}")

# Check if detectron2 backend is available
try:
    import detectron2
    print("✅ Detectron2 backend is available")
except ImportError:
    print("⚠️  Detectron2 not installed - some models may not work")
    print("   Install with: pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html")

# Check if paddlepaddle backend is available  
try:
    import paddle
    print("✅ PaddlePaddle backend is available")
except ImportError:
    print("ℹ️  PaddlePaddle not installed - alternative backend not available")


🔍 Checking LayoutParser installation...
LayoutParser version: 0.3.4
📋 Available model classes: ['AutoLayoutModel']
⚠️  Detectron2 not installed - some models may not work
   Install with: pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html
ℹ️  PaddlePaddle not installed - alternative backend not available


## Load Layout Detection Model

We'll use a pre-trained model to detect layout elements in the PDF pages.
(Same model as in the original COCO example)


In [None]:
# Load the layout detection model (from original COCO example)
if MODEL_IMPORT_SUCCESS:
    try:
        print("Loading layout detection model...")
        model = Detectron2LayoutModel(
            'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
            label_map={0: "text", 1: "title", 2: "list", 3: "table", 4: "figure"}
        )
        print("✅ Model loaded successfully!")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print("💡 Trying alternative model loading approach...")
        try:
            # Alternative approach using lp.AutoLayoutModel
            model = lp.AutoLayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config')
            print("✅ Alternative model loaded successfully!")
        except Exception as e2:
            print(f"❌ Alternative approach also failed: {e2}")
            model = None
else:
    print("❌ Cannot load model - Detectron2LayoutModel not available")
    print("💡 Trying to use AutoLayoutModel instead...")
    try:
        model = lp.AutoLayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config')
        print("✅ AutoLayoutModel loaded successfully!")
    except Exception as e:
        print(f"❌ AutoLayoutModel also failed: {e}")
        print("🔧 You may need to install additional dependencies:")
        print("   pip install 'layoutparser[paddledetection]'")
        print("   or")
        print("   pip install 'layoutparser[detectron2]'")
        model = None


Loading layout detection model...
Error loading model: module layoutparser has no attribute Detectron2LayoutModel


## Process PDF Pages with Layout Detection

Apply the layout detection model to your PDF pages (similar to the COCO example)


In [None]:
# Process each page (adapted from COCO example loop)
if images and model:
    for i, image in enumerate(images[:3]):  # Process first 3 pages as example
        page_num = i + 1
        print(f"\nProcessing page {page_num}...")
        
        try:
            # Run layout detection (same as COCO example)
            layout_predicted = model.detect(image)
            
            print(f"Detected {len(layout_predicted)} layout elements")
            
            # Create visualization (enhanced from COCO example)
            viz_path = f'{OUTPUT_DIR}/page_{page_num}_layout_detected.png'
            viz = visualize_layout(
                image, 
                layout_predicted, 
                title=f"Page {page_num} Layout Detection",
                save_path=viz_path
            )
            
            # Display the result (for Jupyter)
            display(IPImage(viz_path))
            
            # Print detected elements (similar to COCO example output)
            print("\nDetected elements:")
            for j, block in enumerate(layout_predicted):
                print(f"  {j+1}. Type: {block.type}, Score: {block.score:.3f}, "
                      f"Bbox: {block.block.coordinates}")
                      
        except Exception as e:
            print(f"Error processing page {page_num}: {e}")
            
else:
    print("Cannot process pages - either images not loaded or model not available")


## Extract Text from Detected Layout Elements

Combine layout detection with text extraction for your legal documents


In [None]:
def extract_text_from_pdf_with_layout(pdf_path, layout_results):
    """
    Extract text from PDF using layout information.
    
    Args:
        pdf_path (str): Path to PDF file
        layout_results (list): List of layout detection results for each page
    
    Returns:
        dict: Organized text content by page and element type
    """
    doc = fitz.open(pdf_path)
    extracted_content = {}
    
    for page_num, layout in enumerate(layout_results):
        page = doc.load_page(page_num)
        page_content = {
            'titles': [],
            'text': [],
            'lists': [],
            'tables': [],
            'figures': []
        }
        
        for block in layout:
            # Get coordinates
            x1, y1, x2, y2 = block.block.coordinates
            
            # Create rectangle for text extraction
            rect = fitz.Rect(x1, y1, x2, y2)
            
            # Extract text from this region
            text = page.get_text("text", clip=rect).strip()
            
            if text:
                element_info = {
                    'text': text,
                    'bbox': [x1, y1, x2, y2],
                    'score': block.score if hasattr(block, 'score') else 1.0
                }
                
                # Categorize by type
                if block.type == 'title':
                    page_content['titles'].append(element_info)
                elif block.type == 'text':
                    page_content['text'].append(element_info)
                elif block.type == 'list':
                    page_content['lists'].append(element_info)
                elif block.type == 'table':
                    page_content['tables'].append(element_info)
                elif block.type == 'figure':
                    page_content['figures'].append(element_info)
        
        extracted_content[f'page_{page_num + 1}'] = page_content
    
    doc.close()
    return extracted_content


In [None]:
# Example: Extract text for the first page if we have layout results
if images and model:
    # Get layout for first page
    first_page_layout = model.detect(images[0])
    
    # Extract text with layout information
    content = extract_text_from_pdf_with_layout(PDF_PATH, [first_page_layout])
    
    # Display results
    print("\n=== EXTRACTED CONTENT FOR PAGE 1 ===")
    page_1_content = content['page_1']
    
    if page_1_content['titles']:
        print("\n📋 TITLES:")
        for i, title in enumerate(page_1_content['titles']):
            print(f"  {i+1}. {title['text'][:100]}...")
    
    if page_1_content['text']:
        print("\n📄 TEXT BLOCKS:")
        for i, text_block in enumerate(page_1_content['text'][:3]):  # Show first 3
            print(f"  {i+1}. {text_block['text'][:100]}...")
    
    if page_1_content['lists']:
        print("\n📝 LISTS:")
        for i, list_item in enumerate(page_1_content['lists']):
            print(f"  {i+1}. {list_item['text'][:100]}...")
            
    if page_1_content['tables']:
        print("\n📊 TABLES:")
        for i, table in enumerate(page_1_content['tables']):
            print(f"  {i+1}. {table['text'][:100]}...")
            
    if page_1_content['figures']:
        print("\n🖼️ FIGURES:")
        for i, figure in enumerate(page_1_content['figures']):
            print(f"  {i+1}. {figure['text'][:100]}...")
