## Combined Mistral OCR + Annotations (Single Pass, Chunked)

This notebook reproduces the functionality of the clean notebook using only the Annotations API (bbox + document annotations), processing the PDF in chunks of at most 8 pages and then merging the results.

Outputs:
- Per-page markdown
- Image bboxes + crops + bbox annotations
- Document annotations (language, title, authors, chapter_titles, urls, outline)
- Headings from markdown and aligned outline with page/line

References: [Basic OCR](https://docs.mistral.ai/capabilities/document_ai/basic_ocr/), [Annotations](https://docs.mistral.ai/capabilities/document_ai/annotations/)


### 1) Setup
Configure the client and constants, and encode the PDF as a base64 data URL so it can be passed inline with each request.


In [None]:
"""
Set up minimal constants and the Mistral client.
"""

# ### CONSTANTS ###
from pathlib import Path
# Prefix used for the JSON files written to ./outputs by the last cell.
NOTEBOOK_NAME: str = "2025.09.08-test_combined_mistral_ocr_annotations"
# Local PDF to process; the setup cell raises FileNotFoundError if it is missing.
PDF_PATH: Path = Path("/Users/Focus/Downloads/2212.14024v2.pdf")
# Model name passed to every client.ocr.process call.
MODEL: str = "mistral-ocr-latest"
# Number of page indices requested per OCR call (the chunk size of the pass).
DOC_ANNOTATION_MAX_PAGES: int = 8  # per docs

# ### DEPENDENCIES ###
import os
import base64
from dotenv import load_dotenv
from mistralai import Mistral

# ### CLIENT ###
# Load variables from a .env file (if any) before looking up the API key.
load_dotenv()
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    raise RuntimeError("MISTRAL_API_KEY not set.")
client = Mistral(api_key=api_key)

# ### DOCUMENT ###
# Fail early with a clear message instead of a bare OSError on read.
if not PDF_PATH.exists():
    raise FileNotFoundError(f"PDF not found: {PDF_PATH}")

# The whole PDF is inlined as a base64 data URL, so every OCR request
# carries the document itself and no separate upload step is needed.
pdf_bytes = PDF_PATH.read_bytes()
DOCUMENT_SPEC = {
    "type": "document_url",
    "document_url": f"data:application/pdf;base64,{base64.b64encode(pdf_bytes).decode('utf-8')}",
}

print("Setup complete.")


### 2) Define Schemas
Describe bbox annotation and document annotation formats (document includes outline so no extra OCR call is needed).


In [None]:
"""
Define simple Pydantic schemas for annotations and outline.
"""

from pydantic import BaseModel, Field
from typing import List
from mistralai.extra import response_format_from_pydantic_model

# Schema for the annotation requested for each image bounding box.
# All three fields are required (Field(...)); the description strings are part
# of the schema handed to response_format_from_pydantic_model below.
class BBoxImageAnnotation(BaseModel):
    image_type: str = Field(..., description="Type of the image (plot/table/diagram/etc)")
    short_description: str = Field(..., description="Short description in English")
    summary: str = Field(..., description="Longer summary of the image contents")

# One entry of the document outline: a heading title plus its nesting level.
# Both fields are required; DocumentAnnotation.outline is a list of these.
class OutlineItem(BaseModel):
    title: str = Field(..., description="Heading text. Only include clear headers dividng the text in sections. These are typically recognized by numbering, and are often in a larger font / in bold.")
    level: int = Field(..., description="Heading level 1..6")

# Schema for the document-level annotation requested with each OCR batch.
# `title` is optional (defaults to None) and `outline` defaults to an empty
# list; every other field is required.
class DocumentAnnotation(BaseModel):
    language: str = Field(..., description="Language of the document")
    title: str | None = Field(None, description="Document title if present")
    authors: list[str] = Field(..., description="Author names")
    chapter_titles: list[str] = Field(..., description="Chapter titles in order")
    urls: list[str] = Field(..., description="URLs referenced in the document")
    # Heading list (level + title); aligned to the markdown headings in a later cell.
    outline: List[OutlineItem] = Field(default_factory=list, description="Document outline (level + title)")

# Turn the Pydantic models into the response-format objects that the OCR
# call takes as document_annotation_format / bbox_annotation_format.
doc_rf = response_format_from_pydantic_model(DocumentAnnotation)
bbox_rf = response_format_from_pydantic_model(BBoxImageAnnotation)

print("Schemas ready.")


### 3) Single Chunked OCR + Annotations Pass
Request both bbox annotations and document annotations in each batch, merge pages and document fields (one pass).


In [None]:
"""
Run chunked OCR+Annotations and produce two variables:
- ocr: container with .pages (markdown + images with bbox + crops)
- combined: merged doc-level fields (language/title/authors/chapters/urls/outline)
"""

from typing import List, Dict, Any
import json as _json

# Merge helper

def merge_doc_annotations(parts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge per-chunk document annotations into a single dict.

    Args:
        parts: One annotation dict per OCR chunk, in chunk order. Entries
            that are ``None`` or empty (a chunk that produced no document
            annotation) are skipped instead of raising.

    Returns:
        A dict with the keys ``language``, ``title``, ``authors``,
        ``chapter_titles``, ``urls`` and ``outline``:

        * ``language`` / ``title``: the first truthy value seen, else ``None``.
        * ``authors`` / ``chapter_titles`` / ``urls``: values in first-seen
          order with exact duplicates removed.
        * ``outline``: all outline items concatenated in chunk order
          (not de-duplicated).
    """
    result: Dict[str, Any] = {
        "language": None,
        "title": None,
        "authors": [],
        "chapter_titles": [],
        "urls": [],
        "outline": [],
    }
    # One "already seen" set per de-duplicated list field.
    seen: Dict[str, set] = {"authors": set(), "chapter_titles": set(), "urls": set()}

    for part in parts or []:
        if not part:
            # None / {}: this chunk contributed no annotation.
            continue

        # Scalars: first truthy value wins, later chunks never override it.
        for key in ("language", "title"):
            if not result[key] and part.get(key):
                result[key] = part.get(key)

        # List fields: append unseen values only, preserving order.
        # `or []` covers keys that are present but explicitly null.
        for key, seen_values in seen.items():
            for value in part.get(key) or []:
                if value not in seen_values:
                    seen_values.add(value)
                    result[key].append(value)

        result["outline"].extend(part.get("outline") or [])

    return result

collected_pages = []
annotation_parts: List[Dict[str, Any]] = []

# A single loop handles every batch, the first one included, so the request
# and the annotation decoding exist in exactly one place.
start = 0
while True:
    batch = list(range(start, start + DOC_ANNOTATION_MAX_PAGES))
    resp = client.ocr.process(
        model=MODEL,
        document=DOCUMENT_SPEC,
        pages=batch,
        bbox_annotation_format=bbox_rf,
        document_annotation_format=doc_rf,
        include_image_base64=True,
    )
    pages_batch = getattr(resp, "pages", None) or []
    if not pages_batch:
        if start == 0:
            # Nothing came back at all: treat as an error rather than
            # silently producing an empty result.
            raise RuntimeError("First OCR batch returned no pages.")
        # An empty later batch is the signal that the document is exhausted.
        break
    collected_pages.extend(pages_batch)

    # The annotation may be a JSON string, an object with model_dump(), or
    # already a plain dict; normalise all of them.
    raw = getattr(resp, "document_annotation", None)
    if isinstance(raw, str):
        # Blank strings are treated as "no annotation" instead of being
        # passed to json.loads, which would raise on them.
        parsed = _json.loads(raw) if raw.strip() else None
    elif hasattr(raw, "model_dump"):
        parsed = raw.model_dump()
    else:
        parsed = raw
    # Skip chunks without an annotation so the merge step only sees real data.
    if parsed is not None:
        annotation_parts.append(parsed)

    start += DOC_ANNOTATION_MAX_PAGES

class _OCR:
    def __init__(self, pages):
        self.pages = pages

# Fold the per-chunk annotations into one dict and wrap the collected pages
# so later cells can use `ocr.pages`.
combined = merge_doc_annotations(annotation_parts)
ocr = _OCR(collected_pages)

print("Pages (len):", len(ocr.pages))
print("Combined keys:", list(combined.keys()))


### 4) Per-page markdown and image crops + bbox coordinates
Replicates the clean notebook output: prints markdown, bbox coordinates, and displays cropped images.


In [None]:
"""
Print per-page markdown and bbox coordinates; display cropped images.
"""

from IPython.display import display
from PIL import Image as PILImage
import io
import base64 as _b64

print("Pages (len):", len(ocr.pages))
for page in ocr.pages:
    # Page header: index plus whatever dimension info the page carries.
    dims = getattr(page, "dimensions", None)
    print(f"\n## Page {page.index} | dims: {getattr(dims,'width',None)}x{getattr(dims,'height',None)} dpi={getattr(dims,'dpi',None)}")
    print(getattr(page, "markdown", "") or "")

    images = getattr(page, "images", []) or []
    if not images:
        print("(no image bboxes)")
    for i, img in enumerate(images, start=1):
        tlx = getattr(img, "top_left_x", None)
        tly = getattr(img, "top_left_y", None)
        brx = getattr(img, "bottom_right_x", None)
        bry = getattr(img, "bottom_right_y", None)
        # A missing coordinate yields size None instead of a TypeError on
        # the subtraction, so one incomplete bbox cannot abort the loop.
        w = brx - tlx if tlx is not None and brx is not None else None
        h = bry - tly if tly is not None and bry is not None else None
        print(f"- Image {i}: id={getattr(img,'id',None)} bbox=({tlx},{tly})→({brx},{bry}) size=({w}x{h})")

        # Show the bbox annotation that was requested via
        # bbox_annotation_format, when the image carries one.
        annotation = getattr(img, "image_annotation", None)
        if annotation:
            print(f"  annotation: {annotation}")

        data_str = getattr(img, "image_base64", None)
        if not data_str:
            continue
        # Accept both a bare base64 payload and a "data:...;base64,<payload>" URL.
        b64 = data_str.split(",", 1)[1] if data_str.startswith("data:") else data_str
        image_bytes = _b64.b64decode(b64)
        pil_img = PILImage.open(io.BytesIO(image_bytes))
        display(pil_img)


### 5) Document annotation summary
Print language, title, authors, chapter titles, URLs.


In [None]:
"""
Print merged document-level fields.
"""

import json as _json

# Scalar fields first, then each list field as a bulleted block.
print("language:", combined.get("language"))
print("title:", combined.get("title"))
for label, key in (
    ("authors:", "authors"),
    ("chapter_titles:", "chapter_titles"),
    ("urls:", "urls"),
):
    print(label)
    for entry in combined.get(key, []):
        print(" -", entry)


### 6) Headings from Markdown (by page)
Extract headings (ATX + setext) with page index and line number.


In [None]:
"""
Parse markdown headings from OCR pages.
"""

import re
from typing import List, Dict, Any

# Compiled once: these patterns are applied to every line of every page.
_ATX_RE = re.compile(r"^(#{1,6})\s+(.*)$")
_SETEXT_H1_RE = re.compile(r"^={3,}$")
_SETEXT_H2_RE = re.compile(r"^-{3,}$")

# Each entry: {"page_index", "line" (1-based, within the page), "level", "text"}.
# Headings are collected in a single pass, so they appear in document order.
markdown_headings: List[Dict[str, Any]] = []
for page in ocr.pages:
    lines = (getattr(page, "markdown", "") or "").splitlines()

    for i, line in enumerate(lines, start=1):
        # ATX (#..######)
        m = _ATX_RE.match(line)
        if m:
            markdown_headings.append({
                "page_index": page.index,
                "line": i,
                "level": len(m.group(1)),
                "text": m.group(2).strip(),
            })
            continue

        # Setext (=== or --- underline): needs a preceding text line.
        if i == 1:
            continue
        underline = line.strip()
        if _SETEXT_H1_RE.match(underline):
            level = 1
        elif _SETEXT_H2_RE.match(underline):
            level = 2
        else:
            continue

        prev_raw = lines[i - 2]
        prev = prev_raw.strip()
        # A rule after a blank line is a thematic break, not a heading; a rule
        # directly under an ATX heading or under another underline must not
        # turn that line into a second (bogus) heading either.
        if (
            not prev
            or _ATX_RE.match(prev_raw)
            or _SETEXT_H1_RE.match(prev)
            or _SETEXT_H2_RE.match(prev)
        ):
            continue

        markdown_headings.append({
            "page_index": page.index,
            "line": i - 1,  # the heading text sits on the line above the underline
            "level": level,
            "text": prev,
        })

print(f"Found {len(markdown_headings)} markdown headings")


### 7) Align Outline to Markdown (RapidFuzz) + Print
Align combined outline (level + title) to markdown headings, print readable mapping and save JSON.


In [None]:
"""
Align outline with markdown headings using RapidFuzz; print and save.
"""

from typing import List, Dict, Any
from rapidfuzz import fuzz
import json as _json
from pathlib import Path as _Path

def _norm(text: str) -> str:
    return " ".join((text or "").lower().split())

ocr_outline = combined.get("outline", []) or []
all_md: List[Dict[str, Any]] = list(markdown_headings)

aligned: List[Dict[str, Any]] = []
# Minimum token_set_ratio score for an outline title to count as matched.
THRESHOLD: int = 85

# The guard does not depend on the outline item, so check it once up front.
# An empty outline with no headings is still fine (nothing to align).
if ocr_outline and not all_md:
    raise RuntimeError("No markdown headings to align to.")

# Normalise every markdown heading once instead of once per outline item.
md_norm_titles: List[str] = [_norm(md_h["text"]) for md_h in all_md]

for item in ocr_outline:
    title = str(item.get("title") or "")
    level = int(item.get("level"))

    # Best-scoring markdown heading; on ties the earliest heading wins
    # because only a strictly higher score replaces the current best.
    query = _norm(title)
    best_score = -1
    best_idx = None
    for idx, candidate in enumerate(md_norm_titles):
        score = fuzz.token_set_ratio(query, candidate)
        if score > best_score:
            best_score = score
            best_idx = idx

    # The score is always recorded; the markdown_* fields stay None unless
    # the best match reaches THRESHOLD.
    rec: Dict[str, Any] = {
        "ocr_title": title,
        "ocr_level": level,
        "markdown_page_index": None,
        "markdown_line": None,
        "markdown_title": None,
        "score": int(best_score) if best_score >= 0 else None,
    }

    if best_idx is not None and best_score >= THRESHOLD:
        md_h = all_md[best_idx]
        rec["markdown_page_index"] = md_h["page_index"]
        rec["markdown_line"] = md_h["line"]
        rec["markdown_title"] = md_h["text"]

    aligned.append(rec)

print("\nAligned outline with markdown page/line:")
for a in aligned:
    print(f"[page {a.get('markdown_page_index')} line {a.get('markdown_line')}] h{a.get('ocr_level')}: {a.get('ocr_title')} (score={a.get('score')})")

# Save outputs with filename prefix
out_dir = _Path.cwd() / "outputs"
out_dir.mkdir(exist_ok=True)

headers_index_file = out_dir / f"{NOTEBOOK_NAME}_headers_index.json"
headers_normalized_file = out_dir / f"{NOTEBOOK_NAME}_headers_index_normalized.json"

# Raw markdown headings go to the index file, the aligned outline to the
# "normalized" file; both are written as pretty-printed UTF-8 JSON.
for target, payload in (
    (headers_index_file, markdown_headings),
    (headers_normalized_file, aligned),
):
    with open(target, "w", encoding="utf-8") as f:
        _json.dump(payload, f, ensure_ascii=False, indent=2)
print(f"Saved {headers_index_file} and {headers_normalized_file}")
