## Imports

In [None]:
import os

from dotenv import load_dotenv
from loguru import logger
from mistralai import Mistral

from src.config import FilePaths, MistralConfig
from src.enums import Party

## Config

In [4]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a clear message instead of building a client with api_key=None,
# which would only surface as an authentication error on the first API call.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; export it or add it to your .env file."
    )

mistral_client = Mistral(api_key=api_key)

# Party selected for this run; later cells use it to build the PDF filename.
party = Party.VIJFTIG_PLUS

## Upload PDF to Mistral

In [5]:
# Locate the party's PDF inside the configured PDF directory.
filename = f"Verkiezingsprogramma {party}.pdf"
file = FilePaths.pdf_dir / filename

if not file.exists():
    raise ValueError(f"The file {file} does not exist.")

logger.info(f"Uploading {filename} to Mistral...")
# Open the PDF in a context manager so the handle is closed once the upload
# finishes (or fails); a bare open() would leak it for the kernel's lifetime.
with open(file, "rb") as pdf_stream:
    uploaded_pdf = mistral_client.files.upload(
        file={
            "file_name": filename,
            "content": pdf_stream,
        },
        purpose="ocr",
    )

# Signed-URL object for the uploaded file; the OCR cell reads its `.url` attribute.
document_url = mistral_client.files.get_signed_url(file_id=uploaded_pdf.id)

[32m2025-09-21 22:26:25.610[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mUploading Verkiezingsprogramma 50PLUS.pdf to Mistral...[0m


In [None]:
# Log the filename rather than the signed URL: the URL embeds a signature token,
# and anything logged here is persisted in the notebook's saved output.
logger.info(f"Running OCR on {filename}...")
ocr_result = mistral_client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        # Must be the plain string: passing the FileSignedURL object itself
        # fails OCRRequest validation ("Input should be a valid string").
        "document_url": document_url.url,
    },
    include_image_base64=False,
)

[32m2025-09-21 22:27:02.844[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mRunning OCR on document url='https://mistralaifilesapiprodswe.blob.core.windows.net/fine-tune/f8cb6264-0e00-4357-9f2d-687713d1ae0d/b858bf61-30f8-4434-8417-5fbc3e5bf35c/eaa07f4fe4e84f8cbd5983a26f8782cd.pdf?se=2025-09-22T20%3A26%3A26Z&sp=r&sv=2025-01-05&sr=b&sig=zI3zxV3VT2DsmpmINHkgtngt/S/8nk6BYhUM8hNFgkA%3D'...[0m


ValidationError: 5 validation errors for OCRRequest
document.FileChunk.file_id
  Field required [type=missing, input_value={'type': 'document_url', ...S/8nk6BYhUM8hNFgkA%3D')}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
document.FileChunk.type
  Input should be 'file' [type=literal_error, input_value='document_url', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
document.ImageURLChunk.image_url
  Field required [type=missing, input_value={'type': 'document_url', ...S/8nk6BYhUM8hNFgkA%3D')}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
document.ImageURLChunk.type
  Input should be 'image_url' [type=literal_error, input_value='document_url', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
document.DocumentURLChunk.document_url
  Input should be a valid string [type=string_type, input_value=FileSignedURL(url='https:.../S/8nk6BYhUM8hNFgkA%3D'), input_type=FileSignedURL]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type

In [None]:
# Concatenate the markdown of every OCR page, separated by a blank line, and show it.
response_markdown = "\n\n".join(page.markdown for page in ocr_result.pages)
print(response_markdown)