## Setup

### Imports

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
from loguru import logger
from mistralai import Mistral

### Config

In [None]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# The Mistral client cannot be built without a key, so stop right away when
# the variable is unset or empty.
mistral_api_key = os.environ.get("MISTRAL_API_KEY")
if mistral_api_key is None or mistral_api_key == "":
    raise ValueError("Mistral api key not present in .env")

mistral_client = Mistral(api_key=mistral_api_key)

# Party whose election programme PDF is handled by the cells below.
party = "50PLUS"

## Process PDF with Mistral OCR

### Upload PDF to Mistral

In [None]:
# Locate this party's election programme PDF under ../data/pdfs, relative to
# the current working directory.
filename = f"Verkiezingsprogramma {party}.pdf"
file = Path.cwd().parent / "data" / "pdfs" / filename

# Fail early with a clear message before any network call is made.
if not file.exists():
    raise ValueError(f"The file {file} does not exist.")

logger.info(f"Uploading {filename} to Mistral...")
# Open the PDF inside a context manager so the file handle is closed as soon
# as the upload call returns or raises, instead of staying open for the
# lifetime of the kernel.
with file.open("rb") as pdf_stream:
    uploaded_pdf = mistral_client.files.upload(
        file={
            "file_name": filename,
            "content": pdf_stream,
        },
        purpose="ocr",
    )

# Request a signed URL for the uploaded file; its `.url` attribute is what the
# OCR request is pointed at.
document_url = mistral_client.files.get_signed_url(file_id=uploaded_pdf.id)

### Process uploaded document

In [None]:
logger.info(f"Running OCR on document {document_url}...")
# Point the OCR model at the signed URL of the uploaded PDF. Base64 image
# payloads are switched off via include_image_base64=False.
ocr_result = mistral_client.ocr.process(
    include_image_base64=False,
    document=dict(type="document_url", document_url=document_url.url),
    model="mistral-ocr-latest",
)

### Extract markdown from results

In [None]:
# Stitch the per-page markdown into a single document, with a blank line
# between consecutive pages, and show it.
response_markdown = "\n\n".join(page.markdown for page in ocr_result.pages)
print(response_markdown)