In [None]:
%pip install python-dotenv azure.identity azure-ai-inference

In [9]:
import base64
import json
import mimetypes
import os
import urllib
import urllib.error
import urllib.request
from typing import Any

from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import (
    SystemMessage,
    UserMessage,
    TextContentItem,
    ImageContentItem,
    ImageUrl,
    ImageDetailLevel,
)
from azure.core.credentials import AzureKeyCredential

In [None]:
def encode_file(image_path):
    """Read a file and return its contents as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes as a UTF-8 ``str``. If the
        file is missing or any other error occurs, a message is printed and
        ``None`` is returned instead of raising.
    """
    encoded = None
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
    except FileNotFoundError:
        print(f"Error: The file {image_path} was not found.")
    except Exception as exc:  # any other failure is reported, not raised
        print(f"Error: {exc}")
    return encoded


def ocr_analysis(local_path: str, url: str, ocr_api_key: str) -> Any:
    """Send a local PDF or image to a Mistral OCR endpoint and return the parsed reply.

    The file is base64-encoded and embedded in the JSON request body as a
    ``data:`` URL. Paths ending in ``.pdf`` (any letter case) are sent as a
    ``document_url`` with the ``application/pdf`` media type. Everything else
    is sent as an ``image_url`` whose media type is guessed from the file
    extension, falling back to ``image/jpeg`` when it cannot be determined.

    Args:
        local_path: Path of the PDF or image file to analyse.
        url: Full URL of the OCR endpoint.
        ocr_api_key: Key or token sent as the ``Bearer`` credential.

    Returns:
        The decoded JSON response, or ``None`` when the file could not be
        read, the endpoint answered with an HTTP error, or the response body
        was not valid JSON (a message is printed in each of those cases).
    """
    base64_string = encode_file(local_path)
    if base64_string is None:
        # encode_file has already printed the reason; do not post a
        # meaningless "data:...;base64,None" payload to the service.
        return None

    if local_path.lower().endswith(".pdf"):
        document = {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_string}",
        }
    else:
        guessed_type, _ = mimetypes.guess_type(local_path)
        if not guessed_type or not guessed_type.startswith("image/"):
            guessed_type = "image/jpeg"
        document = {
            "type": "image_url",
            "image_url": f"data:{guessed_type};base64,{base64_string}",
        }

    data = {
        "model": "mistral-ocr-latest",  # Replace with the appropriate model name
        "include_image_base64": True,  # serialised as a JSON boolean
        "document": document,
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": ("Bearer " + ocr_api_key),
    }
    body = json.dumps(data).encode("utf-8")
    # Supplying a body makes urllib issue a POST request.
    req = urllib.request.Request(url, body, headers)

    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req) as response:
            result = response.read().decode("utf-8")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
        print(error.info())
        print(error.read().decode("utf8", "ignore"))
    except json.JSONDecodeError:
        print("Failed to decode JSON from the response.")
    return None

OCR output

In [None]:
# Endpoint of your OCR deployment (copy and paste this from your deployment).
# If the OCR_ENDPOINT_URL environment variable is set it takes precedence,
# so the notebook does not need to be edited.
url = os.environ.get(
    "OCR_ENDPOINT_URL",
    "https://you-ocr-deployment-name.region.models.ai.azure.com/v1/ocr",
)
# Path to your image
image_path = "/path-to-your-image/image.png"
# Path to a PDF; pass it to ocr_analysis instead of image_path to OCR a document
pdf_path = "/path-too-your-pdf/sample.pdf"
# Primary/secondary key, AMLToken, or Microsoft Entra ID token for the endpoint.
# Prefer setting the OCR_API_KEY environment variable so the secret is never
# saved inside the notebook; the placeholder below is only a fallback.
ocr_api_key = os.environ.get("OCR_API_KEY", "api-key-for-ocr-deployment")
ocr_output = ocr_analysis(image_path, url, ocr_api_key)
ocr_output

In [None]:
# ocr_analysis returns None when the request fails, which would otherwise
# surface here as an opaque "'NoneType' object is not subscriptable" error.
if ocr_output is None:
    raise RuntimeError(
        "OCR request did not return a result; check the error printed by ocr_analysis."
    )
# Markdown text of the first page of the OCR result
ocr_markdown = ocr_output["pages"][0]["markdown"]
ocr_markdown

### Pairing this with a vision language model

You can extract the markdown from any page or image you have run OCR on and pass it, together with the image itself, to a vision language model for further analysis. The model can return its answer as structured output. For example, we pass the first page of the document above to a general-purpose vision language model as follows:

In [None]:
# Endpoint of the vision language model deployment, something like
# 'https://xxxxxxxx.services.ai.azure.com/models'. Read from the
# AZURE_INFERENCE_SDK_ENDPOINT environment variable when it is set.
vlm_endpoint = os.environ.get(
    "AZURE_INFERENCE_SDK_ENDPOINT", "AZURE_INFERENCE_SDK_ENDPOINT"
)
# API key of the vision language model deployment. Prefer the
# AZURE_INFERENCE_SDK_KEY environment variable so the secret is never saved
# inside the notebook; the placeholder below is only a fallback.
vlm_api_key = os.environ.get(
    "AZURE_INFERENCE_SDK_KEY", "API key for Vision Language Model deployment"
)
# Image that is sent to the model together with its OCR markdown
local_path = "/path-to-your-image/image.png"


def vlm_augmentation(
    local_path: str, vlm_endpoint: str, vlm_api_key: str, ocr_markdown: str
) -> Any:
    """Ask a vision language model to describe an image, using its OCR markdown as context.

    The image at ``local_path`` and the OCR markdown are sent in a single
    chat request. The model is instructed to answer with JSON only; an
    optional Markdown code fence around the reply is removed before parsing.

    Args:
        local_path: Path of the image file to send to the model.
        vlm_endpoint: Endpoint URL passed to ``ChatCompletionsClient``.
        vlm_api_key: API key wrapped in ``AzureKeyCredential``.
        ocr_markdown: Markdown text from OCR, embedded in the user prompt.

    Returns:
        The Python object obtained by parsing the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON once any code
            fence has been removed.
    """

    model_deployment = "mistral-small-2503"  # or other specific mistral VLM you prefer

    # Derive the image sub-type ("png", "jpeg", ...) from the file extension
    # so non-PNG files are not mislabelled; keep "png" when it is unknown.
    guessed_type, _ = mimetypes.guess_type(local_path)
    if guessed_type and guessed_type.startswith("image/"):
        image_format = guessed_type.split("/", 1)[1]
    else:
        image_format = "png"

    client = ChatCompletionsClient(
        endpoint=vlm_endpoint,
        credential=AzureKeyCredential(vlm_api_key),
        headers={"azureml-model-deployment": model_deployment},
    )

    response = client.complete(
        messages=[
            SystemMessage(
                "You are an AI assistant that describes images in details. You will be given the markdown representation of the text in the image to provide context for your analysis"
            ),
            UserMessage(
                [
                    TextContentItem(
                        text=f"This is image with OCR output markdown:\n\n{ocr_markdown}\n.\nAnalyze this image and provide the following in JSON format. For each image: 1) A concise description 2) Main objects/elements present 3) Any text visible in the image 4) Estimated image type (photo, diagram, chart, etc.) 5) If it's a plot, infographic or chart provide the data points and the type of plot, infographic or chart. The output should be strictly be json with no extra commentary"
                    ),
                    ImageContentItem(
                        image_url=ImageUrl.load(
                            image_file=local_path,
                            image_format=image_format,
                            detail=ImageDetailLevel.HIGH,
                        ),
                    ),
                ],
            ),
        ],
        model=model_deployment,
    )

    # Surrounding whitespace would otherwise hide the fence markers from the
    # startswith/endswith checks below.
    json_string = response.choices[0].message.content.strip()
    # Remove the markdown code block markers if they exist
    if json_string.startswith("```"):
        json_string = json_string[3:]  # Remove opening ```
        if json_string[:4].lower() == "json":
            json_string = json_string[4:]  # Remove the optional language tag
    if json_string.endswith("```"):
        json_string = json_string[:-3]  # Remove closing ```

    # Parse the JSON string into a Python object
    return json.loads(json_string)

In [None]:
# Describe the image with the vision language model, using the OCR markdown
# as context, then pretty-print the structured result.
output = vlm_augmentation(
    local_path=local_path,
    vlm_endpoint=vlm_endpoint,
    vlm_api_key=vlm_api_key,
    ocr_markdown=ocr_markdown,
)
print(json.dumps(output, indent=4))