In [None]:
import pymupdf4llm

# Baseline: convert the sample PDF to Markdown with pymupdf4llm and print it.
# The path is repository-relative, like the other cells that read this same
# file, rather than a machine-specific absolute path, so the cell runs on any
# checkout whose working directory is the project root.
path = "inputs/2857439.pdf"

md_text = pymupdf4llm.to_markdown(path)
print(md_text)

In [None]:
import fitz  # PyMuPDF

def reconstruct_markdown(pdf_path):
    """Rebuild a rough Markdown rendering of a PDF from its text blocks.

    Each page's blocks are read in sorted order. A short text block (fewer
    than 50 characters after stripping) that starts left of x=100 is emitted
    as a ``###`` heading; every other non-empty text block is emitted as a
    plain paragraph. A ``---`` rule is appended after each page.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The Markdown text: all collected pieces joined with newlines.
    """
    full_md = []

    # Open the document as a context manager so it is closed even if text
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Text blocks sorted top-left to bottom-right; sort=True often
            # already yields the natural reading order.
            blocks = page.get_text("blocks", sort=True)

            for x0, _y0, _x1, _y1, text, _block_no, block_type in blocks:
                if block_type != 0:  # only type-0 blocks are text
                    continue

                clean_text = text.strip()
                if not clean_text:
                    continue

                # Simple styling heuristic: a short block far to the left
                # may be a heading or a bullet.
                if x0 < 100 and len(clean_text) < 50:
                    full_md.append(f"### {clean_text}\n")
                else:
                    full_md.append(f"{clean_text}\n")

            full_md.append("\n---\n")  # page separator

    return "\n".join(full_md)

# Run the heuristic reconstruction and print it. `path` is not defined in
# this cell; it must already have been assigned by an earlier cell.
md_output = reconstruct_markdown(path)
print(md_output)

In [None]:
from pdf_ocr import pdf_to_spatial_text

# Run pdf_to_spatial_text on the sample PDF and print what it returns.
output = pdf_to_spatial_text("inputs/2857439.pdf")
print(output)

In [None]:
import os

# For every PDF under inputs/ (sorted by file name), report how many lines
# its spatial text has and how wide the longest line is.
for fname in sorted(os.listdir("inputs")):
    if fname.endswith(".pdf"):
        path = os.path.join("inputs", fname)
        text = pdf_to_spatial_text(path)
        lines = text.split("\n")
        max_width = max(map(len, lines), default=0)
        print(f"{fname:50s}  lines={len(lines):4d}  max_width={max_width:4d}")

## Compressed Spatial Text

`compress_spatial_text()` produces a token-efficient representation by classifying page regions (tables, headings, text blocks, key-value pairs) and rendering them as markdown tables and flowing text instead of whitespace-heavy grids.

In [None]:
from pdf_ocr import compress_spatial_text, pdf_to_spatial_text

# Compare the spatial rendering of the sample PDF with its compressed form.
pdf = "inputs/2857439.pdf"

spatial = pdf_to_spatial_text(pdf)
compressed = compress_spatial_text(pdf)

print(spatial)
print(compressed)

# Guard the ratio against an empty spatial rendering (same rule as the
# per-file comparison loop further down in this notebook).
reduction = (1 - len(compressed) / len(spatial)) * 100 if len(spatial) > 0 else 0
print(f"\n--- Compression: {len(spatial)} chars → {len(compressed)} chars ({reduction:.0f}% reduction)")

In [None]:
import os

# Header row and rule, then one row per PDF comparing the character count of
# the spatial rendering with that of the compressed rendering.
print(f"{'File':<50} {'Spatial':>8} {'Compressed':>10} {'Reduction':>10}")
print("-" * 82)

for fname in sorted(os.listdir("inputs")):
    if fname.endswith(".pdf"):
        path = os.path.join("inputs", fname)
        s = pdf_to_spatial_text(path)
        c = compress_spatial_text(path)
        # An empty spatial rendering reports 0% instead of dividing by zero.
        if len(s) > 0:
            reduction = (1 - len(c) / len(s)) * 100
        else:
            reduction = 0
        print(f"{fname:<50} {len(s):>8} {len(c):>10} {reduction:>9.0f}%")

## Table Interpretation

`interpret_table()` takes compressed text and a **canonical schema** describing the columns your application expects, then uses an LLM pipeline to extract structured records.

The schema maps inconsistent PDF column names (e.g. "Ship Name", "Vessel", "Vessel Name") to stable canonical names via aliases. Two modes are available:

- **2-step** (`interpret_table`) — parse table structure first, then map to schema. Step 2 is **batched**: each page's parsed rows are split into chunks (default 20 rows) so the LLM produces complete output without truncation. All batches across all pages run concurrently.
- **Single-shot** (`interpret_table_single_shot`) — one LLM call per page. Faster for simple flat tables, but cannot batch and may truncate on dense pages (50+ rows).

Both modes **auto-split** multi-page input (pages joined by `\f`) and process all pages **concurrently** via `asyncio.gather()`. The return value is a `dict[int, MappedTable]` keyed by 1-indexed page number — each page gets its own complete result (records, unmapped columns, mapping notes, metadata). Records contain only canonical schema fields.

Use `to_records(result)` to flatten all pages into a single `list[dict]`, or `to_records_by_page(result)` for `{page: [dicts]}`.

### Vision-based schema inference (optional)

Some PDFs have dense tables with stacked/multi-line headers where text extraction produces **garbled or concatenated column names** (e.g. `"7:00:00 PM BUNGE"` or `"33020 WHEAT"` as single text runs). For these cases, pass `pdf_path=` to `interpret_table()` to enable a vision pre-step:

```
Step 0 (vision):  page image + compressed text → InferredTableSchema
Step 1 (guided):  compressed text + InferredTableSchema → ParsedTable
Step 2 (unchanged): ParsedTable → MappedTable
```

The vision step renders each PDF page as an image and uses a vision-capable LLM to read the correct column headers from the visual layout, then step 1 uses that schema to correctly split compound values. When `pdf_path` is omitted, the pipeline behaves exactly as before (no vision overhead).

In [None]:

# Define the canonical schema as a plain dict (e.g. loaded from a JSON file).
# CanonicalSchema.from_dict() converts it into the typed dataclass.
#
# Each column lists the PDF header names ("aliases") that should map onto its
# canonical name. Every column in this particular schema declares at least
# one alias (load_port uses "Port").
# NOTE(review): an earlier version of this comment said "port" has no aliases
# and would be inferred from context (section headers, document title, or
# repeated contextual values). That does not match the dict below — confirm
# whether load_port was meant to have an empty alias list.
schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": "load_port", "type": "string", "description": "Loading port name", "aliases": ["Port"], "format": "uppercase"},
        {"name": "vessel_name", "type": "string", "description": "Name of the vessel", "aliases": ["Name of Ship"], "format": "uppercase"},
        {"name": "unique_shipping_slot_id", "type": "string", "description": "Reference number", "aliases": ["Unique Slot Reference Number"]},
        {"name": "shipper", "type": "string", "description": "Exporting company", "aliases": ["Exporter"], "format": "uppercase"},
        {"name": "commodity", "type": "string", "description": "Type of commodity", "aliases": ["Commodity"], "format": "titlecase"},
        {"name": "tons", "type": "int", "description": "Quantity in metric tonnes", "aliases": ["Quantity(tonnes)"], "format": "#,###"},
        {"name": "eta", "type": "string", "description": "Estimated time of arrival", "aliases": ["Date ETA of Ship To"], "format": "YYYY-MM-DD HH:mm"},
        {"name": "status", "type": "string", "description": "Loading status", "aliases": ["Status", "Load Status"], "format": "titlecase"},
    ],
}

# CanonicalSchema is not imported in this cell; it comes from the notebook's
# `from pdf_ocr import (...)` cell, which must have been run first.
schema = CanonicalSchema.from_dict(schema_dict)

# Summarise the schema: description, then one aligned line per column.
# `col.format or 'None'` prints the literal text None for columns whose
# format is falsy (e.g. unique_shipping_slot_id, which sets no "format").
print(f"Schema: {schema.description}")
print(f"Columns ({len(schema.columns)}):")
for col in schema.columns:
    print(f"  {col.name:20s}  {col.type:6s}  format={col.format or 'None':20s}  aliases={col.aliases}")


## Multi-page auto-split with batching

`compress_spatial_text()` joins pages with `\f` (form-feed). When `interpret_table()` receives multi-page input, it splits on `\f` and processes all pages **concurrently**.

Step 2 (schema mapping) is **batched** — each page's parsed rows are split into chunks of `batch_size` rows (default 20) before calling the LLM. This prevents truncation on dense pages with many data rows. All batches across all pages run concurrently via `asyncio.gather()`.

The result is a `dict[int, MappedTable]` keyed by 1-indexed page number. Each page has its own `records`, `unmapped_columns`, `mapping_notes`, and `metadata`. Use `to_records()` to flatten or `to_records_by_page()` for page-grouped dicts.

Below we run the full pipeline on `shipping-stem-2025-11-13.pdf` (3 pages, 180+ records) — no manual splitting needed.

## Vision-based interpretation (garbled-header PDFs)

The Bunge loading statement has dense stacked headers where text extraction produces concatenated spans. Passing `pdf_path=` enables the vision pipeline: each page is rendered as an image, a vision LLM infers the correct column structure, and the guided parser uses that schema to split compound values.

In [1]:
import logging
# Configure the root logger at INFO level for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

# Names used by the interpretation and filtering cells of this notebook.
from pdf_ocr import (
    # Core functions
    compress_spatial_text,
    pdf_to_spatial_text,
    # Table interpretation
    interpret_table,
    interpret_table_single_shot,
    CanonicalSchema,
    ColumnDef,
    to_records,
    to_records_by_page,
    # PDF filtering
    filter_pdf_by_table_titles,
    extract_table_titles,
    FilterMatch,
)

In [None]:
# Vision-enabled pipeline on the Newcastle PDF.
# The only difference from normal usage is pdf_path= which enables step 0 (vision).

newcastle_pdf = "inputs/2857439.pdf"
# Named after this document (not "bunge") so that running this cell never
# overwrites the compressed text held by the Bunge cell.
compressed_newcastle = compress_spatial_text(newcastle_pdf)
print(f"Compressed chars: {len(compressed_newcastle)}")
print(compressed_newcastle[:500])
print("...")

# Define a schema suitable for Newcastle loading statements
newcastle_schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": "load_port", "type": "string", "description": "Loading port name", "aliases": ["port"], "format": "uppercase"},
        {"name": "vessel_name", "type": "string", "description": "Name of the vessel", "aliases": ["ship name"], "format": "uppercase"},
        {"name": "unique_shipping_slot_id", "type": "string", "description": "Reference number", "aliases": ["unique slot reference number"]},
        {"name": "shipper", "type": "string", "description": "Exporting company", "aliases": ["exporter"], "format": "uppercase"},
        {"name": "commodity", "type": "string", "description": "Type of commodity", "aliases": ["commodity"], "format": "titlecase"},
        {"name": "tons", "type": "int", "description": "Quantity in metric tonnes", "aliases": ["quantity(tonnes)"], "format": "#,###"},
        {"name": "eta", "type": "string", "description": "Estimated time of arrival", "aliases": ["eta"], "format": "YYYY-MM-DD HH:mm"},
        {"name": "status", "type": "string", "description": "Loading status", "aliases": ["load status"], "format": "titlecase"},
    ],
}

newcastle_schema = CanonicalSchema.from_dict(newcastle_schema_dict)

# Run WITH vision (pdf_path= enables step 0)
result_newcastle = interpret_table(
    compressed_newcastle,
    newcastle_schema,
    model="openai/gpt-4o",
    pdf_path=newcastle_pdf,
)

# Result is dict[int, MappedTable] — one entry per page
records_newcastle = to_records(result_newcastle)
print(f"Records extracted (vision): {len(records_newcastle)}")
for page, mt in sorted(result_newcastle.items()):
    print(f"Page {page}: {len(mt.records)} records, unmapped={mt.unmapped_columns}")
print(f"\n--- First 5 records ---\n")
for i, rec in enumerate(records_newcastle[:5], 1):
    print(f"[{i}] {rec}")

# Inspect per-page structure: each page has its own records, unmapped_columns, metadata
{page: mt.model_dump() for page, mt in result_newcastle.items()}


In [None]:
# Bunge loading statement (garbled stacked headers): run interpret_table with
# the vision pre-step enabled.

bunge_pdf = "inputs/Bunge_loadingstatement_2025-09-25.pdf"
compressed_bunge = compress_spatial_text(bunge_pdf)

# Preview: total length, the first 500 characters, then an ellipsis marker.
print(f"Compressed chars: {len(compressed_bunge)}", compressed_bunge[:500], "...", sep="\n")

# Canonical columns and the header aliases expected in Bunge statements.
bunge_schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": "load_port", "type": "string", "description": "Loading port name", "aliases": ["Port"], "format": "uppercase"},
        {"name": "vessel_name", "type": "string", "description": "Name of the vessel", "aliases": ["Name of Ship"], "format": "uppercase"},
        {"name": "unique_shipping_slot_id", "type": "string", "description": "Reference number", "aliases": ["Unique Slot Reference Number"]},
        {"name": "shipper", "type": "string", "description": "Exporting company", "aliases": ["Exporter"], "format": "uppercase"},
        {"name": "commodity", "type": "string", "description": "Type of commodity", "aliases": ["Commodity"], "format": "titlecase"},
        {"name": "tons", "type": "int", "description": "Quantity in metric tonnes", "aliases": ["Quantity(tonnes)"], "format": "#,###"},
        {"name": "eta", "type": "string", "description": "Estimated time of arrival", "aliases": ["Date ETA of Ship To"], "format": "YYYY-MM-DD HH:mm"},
        {"name": "status", "type": "string", "description": "Loading status", "aliases": ["Status", "Load Status"], "format": "titlecase"},
    ],
}

bunge_schema = CanonicalSchema.from_dict(bunge_schema_dict)

# Supplying pdf_path= is what switches on step 0 (vision).
result_bunge = interpret_table(compressed_bunge, bunge_schema, model="openai/gpt-4o", pdf_path=bunge_pdf)

# result_bunge holds one MappedTable per page; to_records() flattens them.
records_bunge = to_records(result_bunge)
print(f"Records extracted (vision): {len(records_bunge)}")
for page in sorted(result_bunge):
    mt = result_bunge[page]
    print(f"Page {page}: {len(mt.records)} records, unmapped={mt.unmapped_columns}")
print(f"\n--- First 5 records ---\n")
for i, rec in enumerate(records_bunge[:5], start=1):
    print(f"[{i}] {rec}")

# Cell result: the full per-page dump (records, unmapped_columns, metadata).
{page_no: table.model_dump() for page_no, table in result_bunge.items()}


In [None]:
# CBH shipping stem: run interpret_table with the vision pre-step enabled.

cbh_pdf = "inputs/CBH Shipping Stem 26092025.pdf"
compressed_cbh = compress_spatial_text(cbh_pdf)

# Preview: total length, the first 500 characters, then an ellipsis marker.
print(f"Compressed chars: {len(compressed_cbh)}", compressed_cbh[:500], "...", sep="\n")

# Canonical columns and the header aliases expected in CBH stems. load_port
# has an empty alias list: per its description it is not a table column but
# appears inside the header.
cbh_schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": "load_port", "type": "string", "description": "Loading port name, not captured in the original schema but present inside the header", "aliases": [], "format": "uppercase"},
        {"name": "vessel_name", "type": "string", "description": "Name of the vessel", "aliases": ["vessel name"], "format": "uppercase"},
        {"name": "unique_shipping_slot_id", "type": "string", "description": "Reference number", "aliases": ["vna #"]},
        {"name": "shipper", "type": "string", "description": "Exporting company", "aliases": ["client"], "format": "uppercase"},
        {"name": "commodity", "type": "string", "description": "Type of commodity", "aliases": ["Commodity"], "format": "titlecase"},
        {"name": "tons", "type": "int", "description": "Quantity in metric tonnes", "aliases": ["volume"], "format": "#,###"},
        {"name": "eta", "type": "string", "description": "Estimated time of arrival", "aliases": ["ETA"], "format": "YYYY-MM-DD"},
        {"name": "status", "type": "string", "description": "Loading status", "aliases": ["Status", "Loading Status"], "format": "titlecase"},
    ],
}

cbh_schema = CanonicalSchema.from_dict(cbh_schema_dict)

# Supplying pdf_path= is what switches on step 0 (vision).
result_cbh = interpret_table(compressed_cbh, cbh_schema, model="openai/gpt-4o", pdf_path=cbh_pdf)

# result_cbh holds one MappedTable per page; to_records() flattens them.
records_cbh = to_records(result_cbh)
print(f"Records extracted (vision): {len(records_cbh)}")
for page in sorted(result_cbh):
    mt = result_cbh[page]
    print(f"Page {page}: {len(mt.records)} records, unmapped={mt.unmapped_columns}")
print(f"\n--- First 5 records ---\n")
for i, rec in enumerate(records_cbh[:5], start=1):
    print(f"[{i}] {rec}")

# Cell result: the full per-page dump (records, unmapped_columns, metadata).
{page_no: table.model_dump() for page_no, table in result_cbh.items()}


In [None]:
# Queensland document: run interpret_table with the vision pre-step enabled.

queensland_pdf = "inputs/document (1).pdf"
compressed_queensland = compress_spatial_text(queensland_pdf)

# Preview: total length, the first 500 characters, then an ellipsis marker.
print(f"Compressed chars: {len(compressed_queensland)}", compressed_queensland[:500], "...", sep="\n")

# Canonical columns and the header aliases expected in Queensland statements.
queensland_schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": "load_port", "type": "string", "description": "Loading port name", "aliases": ["port"], "format": "uppercase"},
        {"name": "vessel_name", "type": "string", "description": "Name of the vessel", "aliases": ["name of ship"], "format": "uppercase"},
        {"name": "unique_shipping_slot_id", "type": "string", "description": "Reference number", "aliases": ["unique slot reference number"]},
        {"name": "shipper", "type": "string", "description": "Exporting company", "aliases": ["exporter"], "format": "uppercase"},
        {"name": "commodity", "type": "string", "description": "Type of commodity", "aliases": ["commodity"], "format": "titlecase"},
        {"name": "tons", "type": "int", "description": "Quantity in metric tonnes", "aliases": ["quantity(tonnes)"], "format": "#,###"},
        {"name": "eta", "type": "string", "description": "Estimated time of arrival", "aliases": ["date of eta of ship to"], "format": "YYYY-MM-DD HH:mm"},
        {"name": "status", "type": "string", "description": "Loading status", "aliases": ["Status", "loading ' commenced' or ' completed'"], "format": "titlecase"},
    ],
}

queensland_schema = CanonicalSchema.from_dict(queensland_schema_dict)

# Supplying pdf_path= is what switches on step 0 (vision).
result_queensland = interpret_table(compressed_queensland, queensland_schema, model="openai/gpt-4o", pdf_path=queensland_pdf)

# result_queensland holds one MappedTable per page; to_records() flattens them.
records_queensland = to_records(result_queensland)
print(f"Records extracted (vision): {len(records_queensland)}")
for page in sorted(result_queensland):
    mt = result_queensland[page]
    print(f"Page {page}: {len(mt.records)} records, unmapped={mt.unmapped_columns}")
print(f"\n--- First 5 records ---\n")
for i, rec in enumerate(records_queensland[:5], start=1):
    print(f"[{i}] {rec}")

# Cell result: the full per-page dump (records, unmapped_columns, metadata).
{page_no: table.model_dump() for page_no, table in result_queensland.items()}


In [None]:
acea_pdf = "inputs/Press_release_car_registrations_December_2025.pdf"
# The other calls to filter_pdf_by_table_titles in this notebook unpack its
# result as a pair, so unpack it here too. Binding the whole tuple to
# acea_pdf_filtered would hand a tuple to whatever consumes it next.
acea_pdf_filtered, matches = filter_pdf_by_table_titles(
    acea_pdf,
    ["new car registrations by market and power source, monthly"],
)


In [None]:
# ACEA car registrations press release: keep only the monthly
# "by market and power source" table, then run interpret_table with the
# vision pre-step enabled.

acea_pdf = "inputs/Press_release_car_registrations_December_2025.pdf"
acea_pdf_filtered, matches = filter_pdf_by_table_titles(
    acea_pdf, ["new car registrations by market and power source, monthly"]
)
compressed_acea = compress_spatial_text(acea_pdf_filtered)

# Preview: total length, the first 500 characters, then an ellipsis marker.
print(f"Compressed chars: {len(compressed_acea)}", compressed_acea[:500], "...", sep="\n")

# Schema notes:
# - car_motorization aliases match header parts, which triggers the unpivot.
# - date has no aliases; the LLM infers year values from the headers.
acea_schema_dict = {
    "description": "ACEA new car registrations by market and power source, monthly",
    "columns": [
        {"name": "country", "type": "string", "description": "Country of registration", "aliases": [], "format": "titlecase"},
        {"name": "car_motorization", "type": "string", "description": "Car motorization type", "aliases": ["battery electric", "plug-in hybrid", "hybrid electric", "others", "petrol", "diesel"], "format": "titlecase"},
        {"name": "new_car_registration", "type": "int", "description": "Number of new car registrations", "aliases": [], "format": "#,###"},
        {"name": "date", "type": "string", "description": "Registration period (year from column header, month from document context)", "aliases": [], "format": "YYYY-MM"},
    ],
}

acea_schema = CanonicalSchema.from_dict(acea_schema_dict)

# Supplying pdf_path= is what switches on step 0 (vision).
# NOTE(review): compressed_acea is built from the filtered PDF, but pdf_path
# points at the unfiltered acea_pdf — confirm the vision step looks at the
# same pages as the compressed text.
result_acea = interpret_table(compressed_acea, acea_schema, model="openai/gpt-4o", pdf_path=acea_pdf)

# result_acea holds one MappedTable per page; to_records() flattens them.
records_acea = to_records(result_acea)
print(f"Records extracted (vision): {len(records_acea)}")
for page in sorted(result_acea):
    mt = result_acea[page]
    print(f"Page {page}: {len(mt.records)} records, unmapped={mt.unmapped_columns}")
print(f"\n--- First 5 records ---\n")
for i, rec in enumerate(records_acea[:5], start=1):
    print(f"[{i}] {rec}")

# Cell result: the full per-page dump (records, unmapped_columns, metadata).
{page_no: table.model_dump() for page_no, table in result_acea.items()}


In [None]:
# Re-display the per-page ACEA result as plain dicts keyed by page.
{page_no: table.model_dump() for page_no, table in result_acea.items()}

## Serialization

After interpreting tables, export results to CSV, TSV, Parquet, pandas or polars DataFrames using the `serialize` module. All functions validate records against the schema and coerce OCR artifacts (e.g., `"1,234"` → `1234`).

In [3]:
# Demonstrate the serializers on the ACEA result: CSV string, CSV file, pandas.
from pdf_ocr import to_csv, to_tsv, to_pandas

# Without path=, the return value of to_csv is kept and its head previewed.
csv_str = to_csv(result_acea, acea_schema)
print("=== CSV (first 500 chars) ===", csv_str[:500], "...", sep="\n")

# With path=, the CSV goes to a file; include_page=True adds the page column.
to_csv(result_acea, acea_schema, path="/tmp/acea_output.csv", include_page=True)
print("\nWrote /tmp/acea_output.csv")

# Same records as a pandas DataFrame (with proper nullable dtypes).
df = to_pandas(result_acea, acea_schema, include_page=True)
print("\n=== pandas DataFrame ===", df.head(10), sep="\n")
print(f"\nShape: {df.shape}")
print(f"\nDtypes:\n{df.dtypes}")

=== CSV (first 500 chars) ===
country,car_motorization,new_car_registration,date
Austria,Battery Electric,4621,2025-12
Austria,Plug-in Hybrid,2776,2025-12
Austria,Hybrid Electric,7253,2025-12
Austria,Others,0,2025-12
Austria,Petrol,5750,2025-12
Austria,Diesel,1976,2025-12
Belgium,Battery Electric,11333,2025-12
Belgium,Plug-in Hybrid,3345,2025-12
Belgium,Hybrid Electric,3591,2025-12
Belgium,Others,114,2025-12
Belgium,Petrol,9900,2025-12
Belgium,Diesel,594,2025-12
Bulgaria,Battery Electric,236,2025-12
Bulgaria,Pl
...

Wrote /tmp/acea_output.csv

=== pandas DataFrame ===
   page  country  car_motorization new_car_registration     date
0     1  Austria  Battery Electric                 4621  2025-12
1     1  Austria    Plug-in Hybrid                 2776  2025-12
2     1  Austria   Hybrid Electric                 7253  2025-12
3     1  Austria            Others                    0  2025-12
4     1  Austria            Petrol                 5750  2025-12
5     1  Austria            Diesel 

In [None]:
# Display the first 50 rows of `df`, which must exist from an earlier cell.
df.head(50)

In [None]:
from pdf_ocr.interpret import analyze_and_parse
from pdf_ocr import compress_spatial_text, filter_pdf_by_table_titles

# Debug aid: call analyze_and_parse directly on the compressed text of page 2
# of the ACEA PDF to see what step 1 produces.
acea_pdf = "inputs/Press_release_car_registrations_December_2025.pdf"
filtered, _ = filter_pdf_by_table_titles(acea_pdf, pages=[2])
compressed = compress_spatial_text(filtered)

# Check what Step 1 outputs
parsed = analyze_and_parse(compressed, model="openai/gpt-4o")
for field in ("table_type", "headers", "notes"):
    print(f"{field}: {getattr(parsed, field)}")
print(f"data_rows count: {len(parsed.data_rows)}")