In [None]:
import pymupdf4llm

# Sample report, addressed relative to the notebook's working directory (the
# same "inputs/" folder the other cells read) instead of an absolute path that
# only exists on one machine.
path = "inputs/2857439.pdf"

# Baseline: convert the PDF with pymupdf4llm and print the Markdown it returns.
md_text = pymupdf4llm.to_markdown(path)
print(md_text)

In [None]:
import fitz  # PyMuPDF

def reconstruct_markdown(pdf_path, heading_max_x=100, heading_max_len=50):
    """Rebuild a rough Markdown rendering of a PDF from its text blocks.

    Every page is read block by block in PyMuPDF's sorted order. A short
    block that starts near the left edge is emitted as a ``###`` heading;
    any other non-empty text block is emitted as a plain paragraph. A
    ``---`` rule is appended after each page.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        heading_max_x: A block whose left edge ``x0`` is below this value is
            a heading candidate.
        heading_max_len: A heading candidate must be shorter than this many
            characters (after stripping) to be emitted as a heading.

    Returns:
        str: All Markdown fragments joined with newlines.
    """
    full_md = []

    # Open the document as a context manager so it is closed even if a page
    # raises while being read.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Text blocks sorted top-left to bottom-right; sort=True usually
            # already yields the natural reading order.
            blocks = page.get_text("blocks", sort=True)

            for x0, _y0, _x1, _y1, text, _block_no, block_type in blocks:
                if block_type != 0:  # only type 0 (text blocks) is rendered
                    continue

                clean_text = text.strip()
                if not clean_text:
                    continue

                # Simple styling heuristic: a short block close to the left
                # edge is probably a heading or a bullet.
                if x0 < heading_max_x and len(clean_text) < heading_max_len:
                    full_md.append(f"### {clean_text}\n")
                else:
                    full_md.append(f"{clean_text}\n")

            full_md.append("\n---\n")  # page separator

    return "\n".join(full_md)

# Run the block-based reconstruction on the PDF that `path` currently points
# to and print the resulting Markdown.
md_output = reconstruct_markdown(path)
print(md_output)

In [None]:
from pdf_ocr import pdf_to_spatial_text

# Render the sample PDF with pdf_to_spatial_text and print the text it returns.
output = pdf_to_spatial_text("inputs/2857439.pdf")
print(output)

In [None]:
import os

# Per-file summary of the spatial rendering: number of lines and widest line.
# The loop uses its own `pdf_file` name rather than `path`, because `path` is
# the notebook-level variable read by the reconstruct_markdown cell; reusing it
# here would silently repoint that cell at the last PDF listed on a re-run.
for fname in sorted(os.listdir("inputs")):
    if not fname.endswith(".pdf"):
        continue
    pdf_file = os.path.join("inputs", fname)
    text = pdf_to_spatial_text(pdf_file)
    lines = text.split("\n")
    # default=0 keeps max() from raising if there are no lines to measure.
    max_width = max((len(line) for line in lines), default=0)
    print(f"{fname:50s}  lines={len(lines):4d}  max_width={max_width:4d}")

## Compressed Spatial Text

`compress_spatial_text()` produces a token-efficient representation by classifying page regions (tables, headings, text blocks, key-value pairs) and rendering them as markdown tables and flowing text instead of whitespace-heavy grids.

In [None]:
from pdf_ocr import compress_spatial_text, pdf_to_spatial_text

pdf = "inputs/2857439.pdf"

# Render the same PDF both ways so the two outputs can be compared directly.
spatial = pdf_to_spatial_text(pdf)
compressed = compress_spatial_text(pdf)

print(spatial)
print(compressed)

# Guard the ratio: an empty spatial rendering would otherwise divide by zero.
reduction = (1 - len(compressed) / len(spatial)) * 100 if len(spatial) > 0 else 0
print(f"\n--- Compression: {len(spatial)} chars → {len(compressed)} chars ({reduction:.0f}% reduction)")

                                                                               Shipping Stem Report
                                                                                  Date Generated: 15/09/2025
                                                                                                                Date of    Date of
                                                                                                     Quantity
   Port             Ship Name              Ref #             Exporter               Commodity                 Nomination  Nomination   ETA        ETB         ETS         Load Status
                                                                                                     (tonnes)
                                                                                                               Received   Accepted
                                                                                                             10/07/2025 10/07/2

In [None]:
import os

# Size comparison of the spatial and compressed renderings for every PDF in
# the inputs/ folder, printed as a fixed-width table.
header = f"{'File':<50} {'Spatial':>8} {'Compressed':>10} {'Reduction':>10}"
print(header)
print("-" * len(header))  # rule exactly as wide as the header and the rows

for fname in sorted(os.listdir("inputs")):
    if not fname.endswith(".pdf"):
        continue
    # Loop-local name: `path` is the notebook-level variable read by the
    # reconstruct_markdown cell and must not be overwritten here.
    pdf_file = os.path.join("inputs", fname)
    spatial_len = len(pdf_to_spatial_text(pdf_file))
    compressed_len = len(compress_spatial_text(pdf_file))
    # An empty spatial rendering is reported as 0% instead of dividing by zero.
    reduction = (1 - compressed_len / spatial_len) * 100 if spatial_len > 0 else 0
    print(f"{fname:<50} {spatial_len:>8} {compressed_len:>10} {reduction:>9.0f}%")

## Table Interpretation

`interpret_table()` takes compressed text and a **canonical schema** describing the columns your application expects, then uses an LLM pipeline to extract structured records.

The schema maps inconsistent PDF column names (e.g. "Ship Name", "Vessel", "Vessel Name") to stable canonical names via aliases. Two modes are available:

- **2-step** (`interpret_table`) — parse table structure first, then map to schema. Step 2 is **batched**: each page's parsed rows are split into chunks (default 20 rows) so the LLM produces complete output without truncation. All batches across all pages run concurrently.
- **Single-shot** (`interpret_table_single_shot`) — one LLM call per page. Faster for simple flat tables, but cannot batch and may truncate on dense pages (50+ rows).

Both modes **auto-split** multi-page input (pages joined by `\f`) and process all pages **concurrently** via `asyncio.gather()`, then merge results into a single `MappedTable`. Single-page input with few rows is processed synchronously with no event loop overhead.

Each output record carries a `_page` extra field (1-indexed) indicating which source page it originated from.

In [None]:
from pdf_ocr import (
    compress_spatial_text,
    interpret_table,
    interpret_table_single_shot,
    CanonicalSchema,
    ColumnDef,
    to_records,
    to_records_by_page,
)

# Canonical schema written as plain data (the same shape a JSON file would
# provide); CanonicalSchema.from_dict() turns it into the typed schema object.
#
# "port" deliberately has no aliases: it is meant to be inferred from context
# (section headers, document title, or repeated contextual values) instead of
# being matched to a column name.
#
# Each tuple below is (name, type, description, aliases).
schema_dict = {
    "description": "Shipping stem vessel loading records",
    "columns": [
        {"name": name, "type": type_name, "description": description, "aliases": aliases}
        for name, type_name, description, aliases in (
            (
                "port",
                "string",
                "Loading port name — may appear as a section header or document-level context rather than a table column",
                [],
            ),
            ("vessel_name", "string", "Name of the vessel", ["Ship Name", "Vessel"]),
            ("ref", "string", "Reference number", ["Ref #", "Reference"]),
            ("exporter", "string", "Exporting company", ["Exporter"]),
            ("commodity", "string", "Type of commodity", ["Commodity"]),
            ("quantity_tonnes", "int", "Quantity in metric tonnes", ["Quantity", "Quantity(tonnes)", "Total"]),
            ("eta", "string", "Estimated time of arrival", ["ETA", "Date ETA of Ship"]),
            ("etb", "string", "Estimated time of berthing", ["ETB"]),
            ("ets", "string", "Estimated time of sailing", ["ETS"]),
            ("status", "string", "Loading status", ["Status", "Load Status"]),
        )
    ],
}

schema = CanonicalSchema.from_dict(schema_dict)

# Summary: schema description, column count, then one line per column.
print(
    f"Schema: {schema.description}",
    f"Columns ({len(schema.columns)}):",
    sep="\n",
)
for col in schema.columns:
    print(f"  {col.name:20s}  {col.type:6s}  aliases={col.aliases}")

## Multi-page auto-split with batching

`compress_spatial_text()` joins pages with `\f` (form-feed). When `interpret_table()` receives multi-page input, it splits on `\f` and processes all pages **concurrently**.

Step 2 (schema mapping) is **batched** — each page's parsed rows are split into chunks of `batch_size` rows (default 20) before calling the LLM. This prevents truncation on dense pages with many data rows. All batches across all pages run concurrently via `asyncio.gather()`, then results are merged into a single `MappedTable`.

Each record in the output carries a `_page` field (1-indexed) so you can trace which source page it came from.

Below we run the full pipeline on `shipping-stem-2025-11-13.pdf` (3 pages, 180+ records) — no manual splitting needed.

In [None]:
# Multi-page PDF: 3 pages, 180+ records.
# interpret_table() auto-splits on \f, batches step 2 (~20 rows per LLM call),
# and runs all batches concurrently.

compressed_mp = compress_spatial_text("inputs/shipping-stem-2025-11-13.pdf")
# Split on form-feed only to report the page count; the unsplit text is what
# gets passed to interpret_table() below.
pages = [p for p in compressed_mp.split("\f") if p.strip()]
print(f"Pages detected: {len(pages)}")
print(f"Total compressed chars: {len(compressed_mp)}")

result_mp = interpret_table(compressed_mp, schema, model="openai/gpt-4o")

# to_records() returns schema-clean dicts (no internal metadata like _page)
records_mp = to_records(result_mp)
print(f"\nRecords extracted:  {len(records_mp)}")
print(f"Unmapped columns:   {result_mp.unmapped_columns}")
print(f"Sections detected:  {result_mp.metadata.sections_detected}")
if result_mp.mapping_notes:
    print(f"Mapping notes:      {result_mp.mapping_notes}")

# to_records_by_page() groups records by source page (1-indexed)
by_page = to_records_by_page(result_mp)
per_page_counts = {page_no: len(recs) for page_no, recs in sorted(by_page.items())}
print(f"\nRecords per page:   {per_page_counts}")

# Preview the first five records of pages 1 and 2; a page with no records
# prints only its heading.
for page_no in (1, 2):
    print(f"\n--- First 5 records (page {page_no}) ---\n")
    for i, rec in enumerate(by_page.get(page_no, [])[:5], 1):
        print(f"[{i}] {rec}")

In [6]:
# Show the complete interpretation result as plain Python data. The bare
# expression is kept as the last line so the notebook displays its value.
result_mp.model_dump()


{'records': [{'port': 'Newcastle',
   'vessel_name': 'ADAGIO',
   'ref': 'NT25084',
   'exporter': 'ARROW COMMODITIES',
   'commodity': 'Wheat',
   'quantity_tonnes': 26914,
   'eta': '2025-07-10',
   'etb': '2025-07-10',
   'ets': '2025-08-06',
   'status': 'Completed',
   '_page': 1},
  {'port': 'Newcastle',
   'vessel_name': 'QC ISABELLA',
   'ref': 'NT25082',
   'exporter': 'QUBE GRAINS',
   'commodity': 'Wheat',
   'quantity_tonnes': 20333,
   'eta': '2025-07-10',
   'etb': '2025-07-10',
   'ets': '2025-08-17',
   'status': 'Completed',
   '_page': 1},
  {'port': 'Newcastle',
   'vessel_name': 'BRIGHTEN TRADER',
   'ref': 'NT25085',
   'exporter': 'ARROW COMMODITIES',
   'commodity': 'Wheat',
   'quantity_tonnes': 33000,
   'eta': '2025-08-01',
   'etb': '2025-08-04',
   'ets': '2024-08-20',
   'status': 'Completed',
   '_page': 1},
  {'port': 'Newcastle',
   'vessel_name': 'ARUNA NAZIK',
   'ref': 'NT25083',
   'exporter': 'CHS BROADBENT',
   'commodity': 'Wheat',
   'quantity_to