In [None]:
# Setup: imports, client, and constants (Basic OCR)
import os
import json
import base64
from pathlib import Path
from dotenv import load_dotenv
from mistralai import Mistral

# Load environment variables (python-dotenv reads a local .env file when present)
# and build the API client. MISTRAL_API_KEY is mandatory: a missing key raises
# KeyError here instead of surfacing later inside the first API call.
load_dotenv()
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

# Paths and model
NOTEBOOK_DIR = Path.cwd()
# The input PDF can be overridden without editing the notebook by setting
# OCR_PDF_PATH (in the shell or in .env). The original machine-specific path
# stays as the default so existing setups keep working unchanged.
DEFAULT_PDF_PATH = "/Users/Focus/Documents/Codebases/papersummarizer/testscripts/2507.18071v2.pdf"
# `or` (rather than a .get default) also falls back when the variable is set but empty.
PDF_PATH = Path(os.environ.get("OCR_PDF_PATH") or DEFAULT_PDF_PATH).expanduser()
MODEL = "mistral-ocr-latest"

print(f"Using PDF: {PDF_PATH}")
# Fail fast, before any bytes are read or any API call is made.
assert PDF_PATH.exists(), "PDF not found. Update PDF_PATH if needed."


In [None]:
# Run Basic OCR; request image crops
# The PDF is sent inline: its bytes are base64-encoded into a data URL and
# passed through the "document_url" field of the request.
pdf_bytes = PDF_PATH.read_bytes()
data_url = "data:application/pdf;base64," + base64.b64encode(pdf_bytes).decode("utf-8")

ocr_response = client.ocr.process(
    model=MODEL,
    document={
        "type": "document_url",
        "document_url": data_url,
    },
    include_image_base64=True,  # needed by the later cell that displays the crops
)

# Print a short summary rather than echoing the whole response object: with
# include_image_base64=True its repr embeds every image as base64 text, which
# floods the cell output and bloats the saved notebook.
print(f"OCR completed: {len(ocr_response.pages)} page(s)")


In [None]:
# Emit the OCR markdown page by page, each under its own "## Page <index>" heading
for page in ocr_response.pages:
    # One print call per page: heading line, then the page's markdown on the next line.
    print(f"\n## Page {page.index}", page.markdown, sep="\n")


In [None]:
# Report each page's dimensions, followed by the bounding box of every image on it.
# All attributes are read defensively with getattr, so a missing field prints as None.
pages = getattr(ocr_response, "pages", None) or []
for page in pages:
    dims = getattr(page, "dimensions", None)
    print(f"\n=== Page {page.index} dimensions: {getattr(dims, 'width', None)}x{getattr(dims, 'height', None)} (dpi={getattr(dims, 'dpi', None)}) ===")

    images = getattr(page, "images", None) or []
    if not images:
        print("(no image bboxes)")
        continue

    for i, img in enumerate(images, start=1):
        # Corner coordinates of the bounding box; any of them may be absent.
        tlx, tly, brx, bry = (
            getattr(img, corner, None)
            for corner in ("top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y")
        )
        # A side length is only defined when both of its edges are known.
        width = None if (tlx is None or brx is None) else (brx - tlx)
        height = None if (tly is None or bry is None) else (bry - tly)
        print(f"- Image {i}: top_left=({tlx},{tly}) bottom_right=({brx},{bry}) size=({width}x{height}) id={getattr(img,'id',None)}")


In [None]:
# Render every cropped image returned by OCR inline, grouped by page
import io

from IPython.display import display
from PIL import Image as PILImage

for page in ocr_response.pages:
    print(f"\n=== Cropped images for Page {page.index} ===")
    for i, img in enumerate(getattr(page, "images", None) or [], start=1):
        data_str = getattr(img, "image_base64", None)
        if data_str:
            # Best effort: a crop that cannot be decoded or rendered is reported
            # and skipped, so the remaining images are still shown.
            try:
                # The payload is either bare base64 or a "data:<mime>;base64,<payload>"
                # URL; in the latter case only the part after the first comma is decoded.
                b64_data = data_str
                if data_str.startswith("data:"):
                    _, b64_data = data_str.split(",", 1)
                image_bytes = base64.b64decode(b64_data)
                pil_img = PILImage.open(io.BytesIO(image_bytes))
                print(f"- Image {i}, size={pil_img.size}")
                display(pil_img)
            except Exception as e:
                print(f"- Image {i}: failed to decode/display ({e})")
        else:
            print(f"- Image {i}: <no image_base64>")
