In [None]:
# Setup: imports, client, and constants
import os
import json
import base64
from pathlib import Path
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model

# Load environment and init client
# load_dotenv() runs first so values from a local .env file are visible in os.environ below.
load_dotenv()
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

# Paths and model
NOTEBOOK_DIR = Path.cwd()
# The default below is a machine-specific absolute path. Set PDF_PATH in the
# environment (or in .env) to point at a different file without editing the cell.
PDF_PATH = Path(
    os.environ.get(
        "PDF_PATH",
        "/Users/Focus/Documents/Codebases/papersummarizer/testscripts/2507.18071v2.pdf",
    )
).expanduser()
MODEL = "mistral-ocr-latest"
# Document annotations have an 8-page limit (per docs); adjust as needed
PAGES = list(range(8))

print(f"Using PDF: {PDF_PATH}")
# is_file() also rejects a directory, which would otherwise only fail later when the PDF is read.
assert PDF_PATH.is_file(), "PDF not found. Update PDF_PATH if needed."


In [None]:
# Define Pydantic schemas for annotations per docs
# Schema requested for each image bounding box. It is converted to a response
# format later in this cell and sent as `bbox_annotation_format` in the OCR request.
# NOTE(review): documented with comments instead of a class docstring, because a
# docstring would presumably be emitted into the generated JSON schema - confirm
# before adding one.
class BBoxImageAnnotation(BaseModel):
    # All three fields are required (`...` default); each description is part of the schema.
    image_type: str = Field(..., description="The type of the image.")
    short_description: str = Field(..., description="A description in English describing the image.")
    summary: str = Field(..., description="Summarize the image.")

# Schema requested for the document as a whole. It is converted to a response
# format later in this cell and sent as `document_annotation_format` in the OCR request.
# NOTE(review): documented with comments instead of a class docstring, because a
# docstring would presumably be emitted into the generated JSON schema - confirm
# before adding one.
class DocumentAnnotation(BaseModel):
    # All three fields are required (`...` default); each description is part of the schema.
    language: str = Field(..., description="Language of the document")
    chapter_titles: list[str] = Field(..., description="Chapter titles in order")
    urls: list[str] = Field(..., description="URLs referenced in the document")

# Build one response-format spec per schema, in the same order as before
# (bbox schema first, then the document schema).
bbox_rf, doc_rf = (
    response_format_from_pydantic_model(schema)
    for schema in (BBoxImageAnnotation, DocumentAnnotation)
)

# Echo both specs as the cell output for inspection
(bbox_rf, doc_rf)


In [None]:
# Read PDF as bytes and call OCR with annotations
pdf_bytes = PDF_PATH.read_bytes()


def _run_ocr(document_spec):
    """Send one annotated OCR request for ``document_spec``.

    Both attempts below use the same model, page selection and annotation
    formats, so those settings are written once here.
    """
    return client.ocr.process(
        model=MODEL,
        document=document_spec,
        pages=PAGES,
        bbox_annotation_format=bbox_rf,
        document_annotation_format=doc_rf,
        include_image_base64=True,
    )


# Prefer document_bytes if supported by SDK; fallback to data URL
# NOTE(review): confirm that the installed SDK accepts a "document_bytes" chunk at all;
# if it never does, the first attempt can be dropped.
try:
    document_spec = {"type": "document_bytes", "document_bytes": pdf_bytes}
    response = _run_ocr(document_spec)
except Exception as exc:
    # The best-effort fallback is kept, but the reason is reported so that an
    # unrelated failure (bad key, network) is not hidden by the retry.
    print(f"document_bytes request failed ({type(exc).__name__}: {exc}); retrying with a data URL")
    # Fallback: embed as data URL
    data_url = "data:application/pdf;base64," + base64.b64encode(pdf_bytes).decode("utf-8")
    document_spec = {"type": "document_url", "document_url": data_url}
    response = _run_ocr(document_spec)

response


In [None]:
# Print bounding box coordinates and annotations inline
import json as _json

def _get(obj, key, default=None):
    """Read ``key`` from a mapping or an attribute-style object, else return ``default``."""
    # Mappings are checked first so a key can never be shadowed by a dict
    # method of the same name (e.g. "items", "values").
    if isinstance(obj, dict):
        return obj.get(key, default)
    return getattr(obj, key, default)


pages = getattr(response, "pages", []) or []
for page in pages:
    page_index = getattr(page, "index", None)
    images = getattr(page, "images", []) or []
    print(f"\n=== Page {page_index} — {len(images)} image bbox(es) ===")
    for i, img in enumerate(images, start=1):
        # Access coordinates from object or dict
        tlx = _get(img, "top_left_x")
        tly = _get(img, "top_left_y")
        brx = _get(img, "bottom_right_x")
        bry = _get(img, "bottom_right_y")
        # Size is only computed when both edges of that axis are present
        width = (brx - tlx) if (tlx is not None and brx is not None) else None
        height = (bry - tly) if (tly is not None and bry is not None) else None
        img_id = _get(img, "id") or _get(img, "name")

        print(f"- Image {i} | id: {img_id}")
        print(f"  bbox: top_left=({tlx}, {tly}), bottom_right=({brx}, {bry}), size=({width} x {height})")

        # Parse bbox annotation if present (stringified JSON in some SDK versions)
        ann = _get(img, "image_annotation")
        if isinstance(ann, str):
            try:
                ann = _json.loads(ann)
            except ValueError:
                # Not valid JSON (JSONDecodeError is a ValueError): keep the raw string and print it as-is
                pass
        if ann:
            print("  annotation:")
            try:
                print(_json.dumps(ann, ensure_ascii=False, indent=2))
            except (TypeError, ValueError):
                # Not JSON-serializable (e.g. a model object): fall back to its plain repr
                print(f"    {ann}")
        else:
            print("  annotation: <none>")


In [None]:
# Save raw response JSON and a compact summary
# Both output files are written into NOTEBOOK_DIR
out_dir = NOTEBOOK_DIR
raw_json_path = out_dir.joinpath("annotations_raw.json")
summary_path = out_dir.joinpath("annotations_summary.json")

# The SDK returns models; convert to dict where possible

def model_to_dict(obj):
    """Return a plain-data view of ``obj`` for serialization.

    Objects exposing ``model_dump`` are converted by calling it. Otherwise the
    object's own attribute dictionary is returned (the same dict, not a copy),
    and anything without one is passed through unchanged.
    """
    if not hasattr(obj, "model_dump"):
        # No dump method: expose the attribute dict when there is one
        return vars(obj) if hasattr(obj, "__dict__") else obj
    return obj.model_dump()

# Persist the full response first, then a small summary derived from it
resp_dict = model_to_dict(response)

with open(raw_json_path, "w", encoding="utf-8") as f:
    json.dump(resp_dict, f, ensure_ascii=False, indent=2)

response_pages = getattr(response, "pages", []) or []

# Bbox annotations are read per image (`image_annotation` on each entry of
# `page.images`) elsewhere in this notebook, so the flag checks that field.
# The previous `page.bboxes` lookup matched nothing this notebook reads.
# NOTE(review): confirm against the SDK's page model.
has_bbox_annotations = any(
    (img.get("image_annotation") if isinstance(img, dict) else getattr(img, "image_annotation", None))
    for p in response_pages
    for img in (getattr(p, "images", []) or [])
)

summary = {
    "num_pages": len(response_pages),
    "has_bbox_annotations": has_bbox_annotations,
    # getattr with a default already covers a missing attribute
    "has_document_annotation": bool(getattr(response, "document_annotation", None)),
}

with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

raw_json_path, summary_path, summary


In [None]:
# Display cropped bbox images inline
from IPython.display import display
from PIL import Image as PILImage
import io
import re
import base64 as _b64


def _extract_base64_and_ext(data_str: str):
    if not data_str or not isinstance(data_str, str):
        return None, None
    if data_str.startswith("data:"):
        header, b64_data = data_str.split(",", 1)
        mime = header.split(";")[0]
        ext = mime.split("/")[-1] if "/" in mime else "jpeg"
        return b64_data, ext
    return data_str, "jpeg"

# Walk every OCR page and render each returned image crop under a page header
pages = getattr(response, "pages", []) or []
for ocr_page in pages:
    page_no = getattr(ocr_page, "index", None)
    page_images = getattr(ocr_page, "images", []) or []
    print(f"\n=== Cropped images for Page {page_no} ===")
    for position, entry in enumerate(page_images, start=1):
        # Images may be attribute-style objects or plain dicts
        encoded = getattr(entry, "image_base64", None)
        if encoded is None and isinstance(entry, dict):
            encoded = entry.get("image_base64")
        if not encoded:
            print(f"- Image {position}: <no image_base64>")
            continue

        payload, ext = _extract_base64_and_ext(encoded)
        if payload:
            try:
                picture = PILImage.open(io.BytesIO(_b64.b64decode(payload)))
                print(f"- Image {position} ({ext}), size={picture.size}")
                display(picture)
            except Exception as err:
                # One undecodable image is reported and the loop moves on
                print(f"- Image {position}: failed to decode/display ({err})")
        else:
            print(f"- Image {position}: <invalid image data>")


### Notes
- This notebook uses both BBox and Document Annotation formats as described in Mistral Docs.
- Document Annotation supports up to 8 pages per request; adjust `PAGES` accordingly.
- Requires environment variable `MISTRAL_API_KEY`.
- Reference: [Mistral Annotations](https://docs.mistral.ai/capabilities/document_ai/annotations/)
