## Gemini Document Structure Extraction

This notebook uses OpenRouter with Gemini models to extract document structure from research papers:
1. **Gemini 2.5 Pro** (`google/gemini-2.5-pro`): Analyzes the first 20 pages to understand the document structure (element types, levels, recognition patterns)
2. **Gemini 2.5 Flash** (`google/gemini-2.5-flash`): Extracts the actual headers from each page using that structure understanding

References: [OpenRouter](https://openrouter.ai/), [Gemini Models](https://ai.google.dev/gemini-api)


### 1) Setup
Configure OpenRouter client, paths, and constants for document processing.


In [None]:
"""
Setup OpenRouter client and define constants for Gemini document processing.
- Keep code simple and explicit; raise if required variables are missing.
"""

### CONSTANTS ###
from pathlib import Path
# Absolute path of the PDF to analyse. Machine-specific: edit before running on another machine.
PDF_PATH: Path = Path("/Users/Focus/Downloads/2212.14024v2.pdf")
STRUCTURE_MODEL: str = "google/gemini-2.5-pro"  # Model ID passed to the structure-analysis call
EXTRACTION_MODEL: str = "google/gemini-2.5-flash"  # Model ID passed to each per-page header-extraction call
MAX_STRUCTURE_PAGES: int = 20  # Number of leading pages used for the structure analysis

### DEPENDENCIES ###
import os
import base64
import json
from typing import Dict, List, Any
from dotenv import load_dotenv
import openai  # OpenRouter uses OpenAI-compatible API
from pydantic import BaseModel, Field
import fitz  # PyMuPDF for PDF processing

### SETUP CLIENT ###
# Read a local .env file (if present) before looking up the API key.
load_dotenv()
openrouter_key = os.getenv("OPENROUTER_API_KEY")
if openrouter_key is None or openrouter_key == "":
    raise RuntimeError("OPENROUTER_API_KEY is not set in environment.")

# The stock OpenAI client is pointed at the OpenRouter endpoint.
client = openai.OpenAI(api_key=openrouter_key, base_url="https://openrouter.ai/api/v1")

# Fail fast when the input document is missing, before any API call is made.
if not PDF_PATH.exists():
    raise FileNotFoundError(f"PDF not found: {PDF_PATH}")

# Echo the effective configuration.
for label, value in (
    ("Ready. Structure model:", STRUCTURE_MODEL),
    ("Extraction model:", EXTRACTION_MODEL),
    ("PDF:", PDF_PATH),
    ("Max pages for structure analysis:", MAX_STRUCTURE_PAGES),
):
    print(label, value)


### 2) PDF Processing Helpers
Helper functions to extract pages from PDF as images for processing.


In [None]:
"""
Helper functions for PDF processing and image conversion.
"""

### HELPER FUNCTIONS ###
def extract_pdf_pages_as_images(pdf_path: Path, max_pages: int = None) -> List[str]:
    """
    Render PDF pages to PNG and return them as base64-encoded strings.

    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum number of leading pages to render. ``None`` renders
            every page; ``0`` or a negative value renders no pages.

    Returns:
        List of base64-encoded PNG images, one per rendered page, in page order
    """
    images: List[str] = []
    # 2x scaling in both directions for clearer text; built once, reused per page.
    zoom = fitz.Matrix(2.0, 2.0)

    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        if max_pages is None:
            pages_to_process = total_pages
        else:
            # Clamp into [0, total_pages]. An explicit 0 means "no pages",
            # not "all pages" as a truthiness test would make it.
            pages_to_process = max(0, min(max_pages, total_pages))

        print(f"Extracting {pages_to_process} pages from PDF...")

        for page_num in range(pages_to_process):
            pix = doc[page_num].get_pixmap(matrix=zoom)
            img_data = pix.tobytes("png")
            images.append(base64.b64encode(img_data).decode("utf-8"))

            if (page_num + 1) % 5 == 0:  # Progress update every 5 pages
                print(f"  Processed {page_num + 1}/{pages_to_process} pages")

    print(f"Extracted {len(images)} page images")
    return images

def create_vision_messages(images: List[str], system_prompt: str, user_prompt: str) -> List[Dict[str, Any]]:
    """
    Build a two-message chat payload: a system message, then a user message
    holding the text prompt followed by every image.

    Args:
        images: List of base64-encoded PNG images
        system_prompt: System instruction
        user_prompt: User instruction

    Returns:
        ``[system_message, user_message]``; the user message's content is a
        list whose first part is the text and whose remaining parts are the
        images as ``data:image/png;base64,...`` URLs, in input order
    """
    image_parts = [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}}
        for encoded in images
    ]
    user_content = [{"type": "text", "text": user_prompt}, *image_parts]

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

# Render every page once up front; the structure analysis uses a leading
# slice of these images and the per-page extraction iterates over all of them.
all_page_images = extract_pdf_pages_as_images(PDF_PATH)
structure_images = all_page_images[:MAX_STRUCTURE_PAGES]  # Leading MAX_STRUCTURE_PAGES pages (fewer if the PDF is shorter)

print(f"Total pages: {len(all_page_images)}")
print(f"Pages for structure analysis: {len(structure_images)}")


### 3) Document Structure Analysis
Use the structure model (`STRUCTURE_MODEL`, Gemini 2.5 Pro) to analyze the first 20 pages and understand the document structure.


In [None]:
"""
Analyze document structure using the structure model (STRUCTURE_MODEL).
Extract element types, levels, recognition patterns, and examples.
"""

### SCHEMA ###
# NOTE(review): in the code visible in this notebook these models are not used
# to validate the model reply (it is handled as a plain dict); they document
# the expected JSON shape.
class DocumentElement(BaseModel):
    """One kind of structural element (e.g. a heading level) and how to recognise it."""
    element_type: str = Field(..., description="Type of document element (e.g., title, section_header, subsection_header, figure_caption)")
    level: int = Field(..., description="Hierarchical level (1=highest, 6=lowest)")
    recognition_pattern: str = Field(..., description="How to identify this element (font size, style, formatting, position)")
    examples: List[str] = Field(..., description="2-3 actual examples from the document")

class DocumentStructure(BaseModel):
    """Overall structure description of a document: its type, its element kinds, and free-form notes."""
    document_type: str = Field(..., description="Type of research paper (e.g., conference paper, journal article, preprint)")
    elements: List[DocumentElement] = Field(..., description="All document structure elements found")
    notes: str = Field(..., description="Additional observations about the document structure")

### PROMPTS ###
# System prompt for the structure-analysis call.
# NOTE(review): the text hard-codes "first 20 pages", while the number of
# pages actually sent is controlled by MAX_STRUCTURE_PAGES; keep them in sync.
STRUCTURE_SYSTEM_PROMPT = """
You are an expert in analyzing research paper document structure. You will receive the first 20 pages of a research paper and need to identify all structural elements.

Focus on identifying:
1. Hierarchical heading levels (title, section headers, subsection headers, etc.)
2. Visual recognition patterns (font sizes, styles, formatting, positioning)
3. Specific examples from the document
4. Any unique structural patterns in this paper

Be precise and detailed in your analysis. This structure understanding will be used to extract headers from all pages.
"""

# User prompt for the structure-analysis call. The JSON template at the end
# uses the same keys as the DocumentStructure / DocumentElement schema.
STRUCTURE_USER_PROMPT = """
Please analyze the structure of this research paper. I need you to identify:

1. **Document type**: What kind of research paper is this?
2. **All structural elements**: Every type of heading/title/caption you can see
3. **Hierarchy levels**: How are elements organized (level 1, 2, 3, etc.)
4. **Recognition patterns**: How can each element be identified? (font size, bold/italic, positioning, numbering, etc.)
5. **Concrete examples**: 2-3 actual text examples for each element type

Focus on headers, titles, section names, subsection names, and any other structural text elements that organize the content.

Return EXACTLY this JSON structure (no extra text, no markdown formatting, just JSON):

{
  "document_type": "string describing type of research paper",
  "elements": [
    {
      "element_type": "main_title",
      "level": 1,
      "recognition_pattern": "description of how to identify this element",
      "examples": ["example 1", "example 2"]
    },
    {
      "element_type": "section_header",
      "level": 2,
      "recognition_pattern": "description of how to identify this element",
      "examples": ["example 1", "example 2"]
    }
  ],
  "notes": "additional observations about document structure"
}
"""

### PROCESSING ###
import re  # only needed to salvage a JSON object from a noisy model reply


def _fallback_structure() -> Dict[str, Any]:
    """Return a fresh generic structure, used when the model reply is unusable."""
    return {
        "document_type": "research_paper",
        "elements": [
            {
                "element_type": "main_title",
                "level": 1,
                "recognition_pattern": "Large bold text at top of first page",
                "examples": ["Document title"],
            },
            {
                "element_type": "section_header",
                "level": 2,
                "recognition_pattern": "Bold text with numbering (1., 2., etc.)",
                "examples": ["1. Introduction", "2. Methods"],
            },
            {
                "element_type": "subsection_header",
                "level": 3,
                "recognition_pattern": "Bold text with sub-numbering (1.1, 1.2, etc.)",
                "examples": ["1.1 Background", "2.1 Dataset"],
            },
        ],
        "notes": "Fallback structure due to JSON parsing error",
    }


def _parse_structure_response(raw: Any) -> Dict[str, Any]:
    """Parse the model reply into a dict, falling back to the generic structure."""
    if not isinstance(raw, str) or not raw.strip():
        print("Empty response from model. Using fallback structure.")
        return _fallback_structure()

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed at position {e.pos}: {e.msg}")
        print(f"Content around error: '{raw[max(0, e.pos-50):e.pos+50]}'")

        # The reply may wrap the object in prose or a markdown fence; take the
        # span from the first "{" to the last "}" and try again.
        json_match = re.search(r'\{.*\}', raw, re.DOTALL)
        if json_match is None:
            print("No JSON found in response. Using fallback structure.")
            return _fallback_structure()

        print("Attempting to parse extracted JSON...")
        try:
            parsed = json.loads(json_match.group(0))
        except json.JSONDecodeError as e2:
            print(f"Extracted JSON also failed: {e2.msg}")
            print("Creating fallback structure...")
            return _fallback_structure()
        print("Successfully parsed extracted JSON!")

    if not isinstance(parsed, dict):
        # Valid JSON, but not an object (e.g. a bare list or string).
        print("Response JSON is not an object. Using fallback structure.")
        return _fallback_structure()
    return parsed


def _clean_elements(raw_elements: Any) -> List[Dict[str, Any]]:
    """Keep only elements carrying every field that later code indexes directly."""
    if not isinstance(raw_elements, list):
        return []

    required = ("element_type", "level", "recognition_pattern")
    cleaned = []
    for position, element in enumerate(raw_elements, start=1):
        if not isinstance(element, dict) or any(element.get(key) is None for key in required):
            print(f"Skipping malformed element #{position}: {element!r}")
            continue
        examples = element.get("examples")
        # Examples are joined into the extraction prompt, so they must be strings.
        element["examples"] = [str(ex) for ex in examples] if isinstance(examples, list) else []
        cleaned.append(element)
    return cleaned


print("Analyzing document structure")

# Create messages for structure analysis
structure_messages = create_vision_messages(
    images=structure_images,
    system_prompt=STRUCTURE_SYSTEM_PROMPT,
    user_prompt=STRUCTURE_USER_PROMPT
)

# Call the structure model (STRUCTURE_MODEL) for structure analysis
structure_response = client.chat.completions.create(
    model=STRUCTURE_MODEL,
    messages=structure_messages,
    response_format={"type": "json_object"},
    max_tokens=4000,
    temperature=0.1
)

structure_choice = structure_response.choices[0]
if getattr(structure_choice, "finish_reason", None) == "length":
    # A reply cut off by max_tokens is usually truncated, unparseable JSON.
    print("Warning: response hit the max_tokens limit and may be truncated")

# The content can be None (e.g. an empty reply), so guard the length report.
structure_content = structure_choice.message.content
print(f"Raw response length: {len(structure_content or '')} characters")

structure_data = _parse_structure_response(structure_content)
structure_data["elements"] = _clean_elements(structure_data.get("elements"))

print("\n=== DOCUMENT STRUCTURE ANALYSIS ===")
print(f"Document type: {structure_data.get('document_type', 'Unknown')}")
print(f"Notes: {structure_data.get('notes', 'None')}")
print(f"\nFound {len(structure_data['elements'])} element types:")

for element in structure_data['elements']:
    print(f"\n**{element['element_type']}** (Level {element['level']})")
    print(f"  Recognition: {element['recognition_pattern']}")
    print(f"  Examples:")
    for example in element['examples']:
        print(f"    - {example}")

# Store for next step
document_structure = structure_data


### 4) Header Extraction Per Page
Use the extraction model (`EXTRACTION_MODEL`, Gemini 2.5 Flash) to extract the actual headers from each page using the structure understanding.


In [None]:
"""
Extract headers from each page using the extraction model (EXTRACTION_MODEL) and the structure understanding.
"""

### SCHEMA ###
# NOTE(review): in the code visible in this notebook these models are not used
# to validate the model reply; extract_headers_from_page checks plain dicts.
class PageHeader(BaseModel):
    """A single header found on a page."""
    text: str = Field(..., description="The header text exactly as it appears")
    element_type: str = Field(..., description="Type of element based on structure analysis")
    level: int = Field(..., description="Hierarchical level (1=highest, 6=lowest)")
    # NOTE(review): required here, but the JSON template built by
    # create_extraction_prompt does not ask the model for a confidence value.
    confidence: str = Field(..., description="Confidence level: high, medium, low")

class PageHeaders(BaseModel):
    """All headers found on one page, together with that page's index."""
    page_number: int = Field(..., description="Page number (0-indexed)")
    headers: List[PageHeader] = Field(..., description="All headers found on this page")

### HELPER FUNCTIONS ###
def create_extraction_prompt(structure_data: Dict[str, Any]) -> str:
    """
    Build the per-page header-extraction prompt from the structure analysis.

    The prompt has three parts: a fixed preamble, one reference entry per
    structure element (type, level, recognition pattern, at most two
    examples), and a fixed task description ending in a JSON template.

    Args:
        structure_data: The document structure analysis results; its
            ``elements`` list (optional) supplies the reference entries

    Returns:
        Formatted prompt string
    """
    preamble = """Based on the document structure analysis, extract all headers from this page.

DOCUMENT STRUCTURE REFERENCE:
"""

    task = """
TASK:
1. Identify ALL headers on this page that match the structure patterns above
2. For each header, determine its exact text, element type, and level
3. Return results as structured JSON

Be thorough - don't miss any headers, even if they're small or seem minor.

Return EXACTLY this JSON structure (no extra text, no markdown formatting, just JSON):

{
  "headers": [
    {
      "text": "exact header text as it appears",
      "element_type": "main_title",
      "level": 1
    },
    {
      "text": "another header text",
      "element_type": "section_header",
      "level": 2
    }
  ]
}
"""

    # One reference entry per element; only the first two examples are shown.
    reference_entries = []
    for entry in structure_data.get('elements', []):
        reference_entries.append("".join((
            f"\n**{entry['element_type']}** (Level {entry['level']})",
            f"\n  Recognition: {entry['recognition_pattern']}",
            f"\n  Examples: {', '.join(entry.get('examples', [])[:2])}",
            "\n",
        )))

    return preamble + "".join(reference_entries) + task

def _parse_page_json(content: Any, page_num: int) -> Dict[str, Any]:
    """Parse one page's model reply into a dict; return an empty result if unusable."""
    if not isinstance(content, str) or not content.strip():
        print(f"    Empty response on page {page_num + 1}, creating empty result")
        return {"headers": []}

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"    JSON error on page {page_num + 1}: {e.msg}")

        # The reply may wrap the object in prose or a markdown fence; take the
        # span from the first "{" to the last "}" and try again.
        import re
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match is None:
            print(f"    No JSON found, creating empty result for page {page_num + 1}")
            return {"headers": []}
        try:
            parsed = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            print(f"    Creating fallback empty result for page {page_num + 1}")
            return {"headers": []}
        print(f"    Successfully parsed extracted JSON for page {page_num + 1}")

    if not isinstance(parsed, dict):
        # Valid JSON, but not an object (e.g. a bare list or string).
        print(f"    Response on page {page_num + 1} is not a JSON object, creating empty result")
        return {"headers": []}
    return parsed


def _validate_headers(raw_headers: Any, page_num: int) -> List[Dict[str, Any]]:
    """Return the headers that are dicts with text, element_type and an integer level."""
    if not isinstance(raw_headers, list):
        print(f"    'headers' on page {page_num + 1} is not a list, creating empty list")
        return []

    required_fields = ("text", "element_type", "level")
    valid_headers = []
    for i, header in enumerate(raw_headers):
        if not isinstance(header, dict):
            print(f"    Header {i+1} on page {page_num + 1} is not a dict, skipping")
            continue

        missing_fields = [field for field in required_fields if header.get(field) is None]
        if missing_fields:
            print(f"    Header {i+1} on page {page_num + 1} missing fields {missing_fields}, skipping")
            continue

        level = header["level"]
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(level, bool) or not isinstance(level, int):
            print(f"    Header {i+1} on page {page_num + 1} has invalid level '{level}', skipping")
            continue

        valid_headers.append(header)
    return valid_headers


def extract_headers_from_page(page_image: str, page_num: int, extraction_prompt: str) -> Dict[str, Any]:
    """
    Extract headers from a single page using the extraction model (EXTRACTION_MODEL).

    An empty, unparseable, or wrongly-shaped reply yields an empty header
    list rather than an exception; exceptions raised by the API call itself
    propagate to the caller.

    Args:
        page_image: Base64-encoded page image
        page_num: Page number (0-indexed)
        extraction_prompt: Prompt for header extraction

    Returns:
        The parsed reply as a dictionary, with ``headers`` replaced by the
        validated header dicts and ``page_number`` set to ``page_num``
    """
    # Create messages for single page
    messages = create_vision_messages(
        images=[page_image],
        system_prompt="You are an expert at extracting headers from research paper pages. Use the provided structure analysis to identify all headers accurately.",
        user_prompt=extraction_prompt
    )

    # Call the extraction model
    response = client.chat.completions.create(
        model=EXTRACTION_MODEL,
        messages=messages,
        response_format={"type": "json_object"},
        max_tokens=1500,
        temperature=0.1
    )

    page_data = _parse_page_json(response.choices[0].message.content, page_num)

    if "headers" not in page_data:
        print(f"    Missing 'headers' field on page {page_num + 1}, creating empty list")
        page_data["headers"] = []

    page_data["headers"] = _validate_headers(page_data["headers"], page_num)

    # Always use the caller's page index, whatever the model returned.
    page_data['page_number'] = page_num

    return page_data

### MAIN PROCESSING ###
# The prompt depends only on the structure analysis, so build it once.
extraction_prompt = create_extraction_prompt(document_structure)
all_page_headers = []

total_pages = len(all_page_images)
total_headers = 0  # running count for the progress lines
# Report the model actually passed to the API rather than a hard-coded name.
print(f"Extracting headers from {total_pages} pages using {EXTRACTION_MODEL}...")

# Process each page
for page_num, page_image in enumerate(all_page_images):
    print(f"Processing page {page_num + 1}/{total_pages}...")

    try:
        page_headers = extract_headers_from_page(page_image, page_num, extraction_prompt)
    except Exception as e:
        # Deliberately broad: one failing page must not abort the whole run.
        # The error is recorded on a placeholder so page indexing stays intact.
        print(f"  Error processing page {page_num + 1}: {e}")
        all_page_headers.append({
            'page_number': page_num,
            'headers': [],
            'error': str(e)
        })
    else:
        all_page_headers.append(page_headers)
        headers_count = len(page_headers.get('headers', []))
        total_headers += headers_count
        print(f"  Found {headers_count} headers on page {page_num + 1}")

    # Progress update every 5 pages
    if (page_num + 1) % 5 == 0:
        print(f"  Progress: {page_num + 1}/{total_pages} pages, {total_headers} headers total")

print(f"\nCompleted header extraction from all {total_pages} pages")

failed_pages = sum(1 for p in all_page_headers if 'error' in p)
if failed_pages:
    print(f"  {failed_pages} page(s) failed; see the per-page errors above")


### 5) Results Display
Print all extracted headers organized by page and hierarchy level.


In [None]:
"""
Display all extracted headers in a readable format.
"""

### RESULTS DISPLAY ###
banner_rule = "=" * 80
section_rule = "-" * 80

print("\n" + banner_rule)
print("                    DOCUMENT HEADER EXTRACTION RESULTS")
print(banner_rule)

# Summary statistics: flatten every header's level, then count.
pages_with_headers = sum(1 for page in all_page_headers if page.get('headers', []))
all_levels = [
    hdr.get('level', 0)
    for page in all_page_headers
    for hdr in page.get('headers', [])
]
total_headers = len(all_levels)
headers_by_level = {}
for lvl in all_levels:
    headers_by_level[lvl] = headers_by_level.get(lvl, 0) + 1

print("\nSUMMARY:")
print(f"  Total pages: {len(all_page_headers)}")
print(f"  Pages with headers: {pages_with_headers}")
print(f"  Total headers: {total_headers}")
print(f"  Headers by level: {dict(sorted(headers_by_level.items()))}")

print("\nDETAILED RESULTS:")
print(section_rule)

# One section per page: an error line, a "no headers" line, or the header list.
for page_data in all_page_headers:
    display_page = page_data.get('page_number', 0) + 1  # shown 1-based
    headers = page_data.get('headers', [])
    error = page_data.get('error')

    if error:
        print(f"\nPage {display_page}: ERROR - {error}")
    elif not headers:
        print(f"\nPage {display_page}: No headers found")
    else:
        print(f"\nPage {display_page}: {len(headers)} headers")

        # Lowest level number first; headers without a level sort last.
        for header in sorted(headers, key=lambda h: h.get('level', 99)):
            level = header.get('level', 0)
            element_type = header.get('element_type', 'unknown')
            text = header.get('text', '').strip()

            # Two spaces of indent per level; non-positive levels get the
            # minimum indent and a "??" marker.
            if level > 0:
                indent = "  " * level
                level_marker = f"H{level}"
            else:
                indent = "  "
                level_marker = "??"

            print(f"{indent}[{level_marker}] {text}")
            print(f"{indent}     Type: {element_type}")

print("\n" + banner_rule)
print("                           EXTRACTION COMPLETE")
print(banner_rule)
