## Setup

In [None]:
import sys
sys.path.append('../../modules/') # Import local modules

from IPython.display import display, Markdown
import os
import pandas as pd
from dotenv import dotenv_values
import base64
import io
import json
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentContentFormat
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from concurrent.futures import ThreadPoolExecutor
from pdf2image import convert_from_bytes

from samples.app_settings import AppSettings
from samples.utils.stopwatch import Stopwatch
from samples.utils.storage_utils import create_json_file
from samples.models.document_processing_result import DataExtractionResult

from invoice import Invoice
from classification import Classifications
from samples.confidence.confidence_utils import merge_confidence_values
from samples.confidence.openai_confidence import evaluate_confidence as evaluate_openai_confidence
from samples.confidence.document_intelligence_confidence import evaluate_confidence as evaluate_di_confidence
from samples.evaluation.accuracy_evaluator import AccuracyEvaluator
from samples.evaluation.comparison import get_extraction_comparison

In [None]:
# Resolve the repository root (four directory levels up) and load the app settings from its .env file.
working_dir = os.path.abspath('../../../../')
env_values = dotenv_values(f"{working_dir}/.env")
settings = AppSettings(env_values)

# Folder holding the sample invoices (kept with its trailing slash, later cells concatenate onto it)
# and the prefix used when naming the saved extraction results.
sample_path = f"{working_dir}/samples/python/demo/invoices/"
sample_name = "invoice-extraction"

# Credential sources switched off below, so that DefaultAzureCredential is left with the
# Azure CLI login as the intended way of authenticating against Azure services.
excluded_credential_sources = (
    "workload_identity",
    "developer_cli",
    "environment",
    "managed_identity",
    "powershell",
    "shared_token_cache",
    "interactive_browser",
)
credential = DefaultAzureCredential(
    **{f"exclude_{source}_credential": True for source in excluded_credential_sources}
)

# Document Intelligence accepts the credential object directly.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=settings.ai_services_endpoint,
    credential=credential
)

# Azure OpenAI is given a bearer-token provider scoped to Cognitive Services instead of an API key.
openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

openai_client = AzureOpenAI(
    azure_endpoint=settings.openai_endpoint,
    azure_ad_token_provider=openai_token_provider,
    api_version="2024-12-01-preview" # Requires the latest API version for structured outputs.
)

In [None]:
# Locate the metadata file that describes the sample document and its expected extraction results.
path = sample_path
metadata_fname = "invoice.json" # Change this to the file you want to evaluate
metadata_fpath = f"{path}{metadata_fname}"

# JSON files are UTF-8; without an explicit encoding open() falls back to the platform locale,
# which can mis-decode non-ASCII characters (e.g. on Windows).
with open(metadata_fpath, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Each top-level key ending in "_expected" holds the keyword arguments for one expected Invoice.
expected_outputs = [Invoice(**value) for key, value in metadata.items() if key.endswith('_expected')]

# The PDF to process is named by the metadata and lives in the same folder.
pdf_fname = metadata['fname']
pdf_fpath = f"{path}{pdf_fname}"

# When True, the extraction step also runs Document Intelligence and passes its Markdown to the model.
use_document_intelligence = True

# Evaluator used to score extracted invoices against the expected outputs.
invoice_evaluator = AccuracyEvaluator(match_keys=['product_code', 'description'])

In [None]:
# Collects one encoded image content part per page of the document.
page_images = []


def encode_page(page):
    """Render a page image as PNG and wrap it in an ``image_url`` content part.

    Args:
        page: Object exposing ``save(fp, format=...)``; it is asked to write itself
            as PNG into an in-memory buffer (presumably a PIL image from pdf2image).

    Returns:
        dict: ``{"type": "image_url", "image_url": {"url": <data URL>}}`` where the
        data URL carries the base64-encoded PNG bytes.
    """
    png_buffer = io.BytesIO()
    page.save(png_buffer, format='PNG')

    png_bytes = png_buffer.getvalue()
    encoded_png = base64.b64encode(png_bytes).decode('utf-8')

    image_url = {"url": f"data:image/png;base64,{encoded_png}"}
    return {"type": "image_url", "image_url": image_url}

# Time the whole image pre-processing step: read the PDF, rasterize it, encode every page.
with Stopwatch() as image_stopwatch:
    # Read the complete PDF into memory so it can be converted from bytes.
    with open(pdf_fpath, "rb") as f:
        document_bytes = f.read()

    # Convert the PDF bytes into page images.
    pages = convert_from_bytes(document_bytes)

    # Encode the pages concurrently on a thread pool; map() returns results in page order.
    with ThreadPoolExecutor() as executor:
        results = [*executor.map(encode_page, pages)]
        page_images += results

## Classify

In [None]:
# Document types the classifier can assign, each with a description of what it covers.
classifications = [
    dict(
        classification="Invoice/Credit Note",
        description="A document that serves as a bill for goods or services provided, often used for payment processing and record-keeping.",
    ),
]

In [None]:
# System prompt for the classification call: the instruction, a Markdown heading,
# and the available classifications serialized as JSON.
classify_system_prompt = (
    "You are an AI assistant that helps detect the boundaries of sub-section or sub-documents using the provided classifications.\n"
    "\n"
    "## Classifications\n"
    "\n"
    f"{json.dumps(classifications)}\n"
)

In [None]:
# Content parts (text and image blocks) that make up the user message of the classification request.
classify_user_content: list = []

In [None]:
# Instruction text placed at the start of the classification user message.
classify_user_prompt = """Classify documents that are an invoice/credit note in the provided page images.
- A single classification may span multiple page images.
- A single page image may contain multiple classifications.
- If a page image does not contain a classification, ignore it."""

# Start from a fresh list every time this cell runs. Appending to a list created in an earlier
# cell would duplicate the prompt and every page image in the request when the cell is re-run.
classify_user_content = [{
    "type": "text",
    "text": classify_user_prompt
}]

# For each page image, add a text block carrying its 1-based page number followed by the image
# block, so the model can refer to pages by number in its answer.
for page_number, page_image in enumerate(page_images, start=1):
    classify_user_content.append({
        "type": "text",
        "text": f"Page {page_number}:"
    })
    classify_user_content.append(page_image)

In [None]:
# Conversation for the classification request: the system prompt with the available
# classifications, then the user message with the instructions and numbered page images.
classify_messages = [
    {"role": "system", "content": classify_system_prompt},
    {"role": "user", "content": classify_user_content},
]

# Time the classification call; the response is parsed into the Classifications model.
with Stopwatch() as classify_stopwatch:
    classify_completion = openai_client.beta.chat.completions.parse(
        model=settings.gpt4o_model_deployment_name,
        messages=classify_messages,
        response_format=Classifications,
        max_tokens=4096,
        temperature=0.1,
        top_p=0.1,
        logprobs=True,  # Enabled to determine the confidence of the response.
    )

In [None]:
# Take the parsed Classifications object from the first choice of the classification response.
classify_message = classify_completion.choices[0].message
document_classifications = classify_message.parsed

# `parsed` is None when the model refused or produced no structured output; fail here with a
# clear message instead of an AttributeError in a later cell.
if document_classifications is None:
    raise ValueError(
        f"Classification returned no parsed result (refusal: {classify_message.refusal!r})."
    )

In [None]:
# DEBUG: Show the parsed classifications as indented JSON
classifications_json = document_classifications.model_dump_json(indent=2)
print(classifications_json)

In [None]:
# For each document classification, group its page images together.
# classified_images holds the encoded image content parts (sent to the model);
# classified_images_pil holds the matching page images (used to rebuild a PDF).
classified_images = []
classified_images_pil = []

for classification in document_classifications.classifications:
    # The range is 1-based and inclusive (pages are presented to the model as "Page 1", "Page 2", ...).
    # The values come from the model, so clamp them: a start below 1 or a negative end would
    # turn into a negative slice index and silently select pages counted from the end.
    image_range_start = max(classification.image_range_start, 1)
    image_range_end = max(classification.image_range_end, 0)

    # Convert to a 0-based, end-exclusive slice; an end past the last page is truncated by slicing.
    page_slice = slice(image_range_start - 1, image_range_end)

    classified_images.append(page_images[page_slice])
    classified_images_pil.append(pages[page_slice])

## Extract

In [None]:
# Extract an Invoice from every classified group of pages. For each group:
#   1. optionally analyze the pages with Document Intelligence to obtain Markdown text,
#   2. ask the OpenAI model for a structured Invoice from the prompt, Markdown and page images,
#   3. score accuracy (when an expected output exists) and confidence,
#   4. save the result as JSON and display a summary.
for i, images in enumerate(classified_images):
    print(f"Classification {i}")
    
    images_pil = classified_images_pil[i]

    # A classification whose page range selected no pages has nothing to extract from;
    # skip it rather than failing on images_pil[0] below.
    if not images_pil:
        print(f"Classification {i} contains no page images; skipping.")
        continue

    # Reset per iteration so that, with Document Intelligence disabled, these names are
    # neither undefined nor left over from a previous iteration or cell run.
    markdown = None
    result = None

    if use_document_intelligence:
        # Re-assemble only this classification's pages into an in-memory PDF for analysis.
        pdf_bytes = io.BytesIO()
        images_pil[0].save(pdf_bytes, format='PDF', resolution=100.0, save_all=True, append_images=images_pil[1:])
        pdf_bytes.seek(0)

    # The stopwatch is always created so the timing summary below works in both modes.
    with Stopwatch() as di_stopwatch:
        if use_document_intelligence:
            poller = document_intelligence_client.begin_analyze_document(
                model_id="prebuilt-layout",
                body=pdf_bytes,
                output_content_format=DocumentContentFormat.MARKDOWN,
                content_type="application/pdf"
            )
            
            # AnalyzeResult of the layout analysis; blocks until the operation completes.
            result = poller.result()

            markdown = result.content
            
    # DEBUG: Display the document intelligence markdown content
    if markdown is not None:
        display(Markdown(markdown))
            
    extract_system_prompt = """You are an AI assistant that extracts data from documents."""

    # Prepare the user content for the OpenAI API including any specific details for processing this type of document, text, and the document page images.
    extract_user_content = []

    extract_user_prompt = """Extract the data from this invoice. 
    - If a value is not present, provide null.
    - It is possible that there are multiple invoices in the same document across multiple pages.
    - Some values must be inferred based on the content defined in the invoice.
    - Dates should be in the format YYYY-MM-DD."""

    extract_user_content.append({
        "type": "text",
        "text": extract_user_prompt
    })

    # The Markdown text is only available when Document Intelligence ran for this group.
    if use_document_intelligence:
        extract_user_content.append({
            "type": "text",
            "text": markdown
        })
        
    extract_user_content.extend(images)

    with Stopwatch() as oai_stopwatch:
        completion = openai_client.beta.chat.completions.parse(
            model=settings.gpt4o_model_deployment_name,
            messages=[
                {
                    "role": "system",
                    "content": extract_system_prompt,
                },
                {
                    "role": "user",
                    "content": extract_user_content
                }
            ],
            response_format=Invoice,
            max_tokens=4096,
            temperature=0.1,
            top_p=0.1,
            logprobs=True # Enabled to determine the confidence of the response.
        )
        
    # Gets the parsed Invoice object from the completion response.
    extract_message = completion.choices[0].message
    invoice = extract_message.parsed

    # `parsed` is None when the model refused or produced no structured output; fail with a
    # clear message instead of an AttributeError on model_dump() below.
    if invoice is None:
        raise ValueError(
            f"Extraction for classification {i} returned no parsed result (refusal: {extract_message.refusal!r})."
        )

    # Expected outputs are matched to classifications by position; there may be fewer of them.
    expected_dict = expected_outputs[i].model_dump() if i < len(expected_outputs) else None
    invoice_dict = invoice.model_dump()
    
    if expected_dict is not None:
        accuracy = invoice_evaluator.evaluate(expected=expected_dict, actual=invoice_dict)
    else:
        accuracy = None

    # Determines the confidence of the extracted data.
    oai_confidence = evaluate_openai_confidence(invoice_dict, completion.choices[0])

    # Combine with the Document Intelligence confidence when that analysis was run.
    if use_document_intelligence:
        di_confidence = evaluate_di_confidence(invoice_dict, result)
        confidence = merge_confidence_values(di_confidence, oai_confidence)
    else:
        confidence = oai_confidence
        
    # DEBUG: Gets the total execution time of the data extraction process.
    # The image pre-processing time covers the whole document and is counted for every classification.
    total_elapsed = di_stopwatch.elapsed + image_stopwatch.elapsed + oai_stopwatch.elapsed

    # DEBUG: Gets the prompt tokens and completion tokens from the completion response.
    prompt_tokens = completion.usage.prompt_tokens
    completion_tokens = completion.usage.completion_tokens

    # DEBUG: Save the output of the data extraction result.
    extraction_result = DataExtractionResult(invoice_dict, confidence, accuracy, prompt_tokens, completion_tokens, total_elapsed)
    create_json_file(f"{sample_path}/{sample_name}.{pdf_fname}_{i}.json", extraction_result)
    
    display_accuracy = f"{accuracy['overall'] * 100:.2f}%" if accuracy is not None else "N/A"

    # DEBUG: Display the outputs of the data extraction process.
    df = pd.DataFrame([
        {
            "Document": pdf_fname,
            "Index": i,
            "Accuracy": display_accuracy,
            "Confidence": f"{confidence['_overall'] * 100:.2f}%",
            "Execution Time": f"{total_elapsed:.2f} seconds",
            "Document Intelligence Execution Time": f"{di_stopwatch.elapsed:.2f} seconds",
            "Image Pre-processing Execution Time": f"{image_stopwatch.elapsed:.2f} seconds",
            "OpenAI Execution Time": f"{oai_stopwatch.elapsed:.2f} seconds",
            "Prompt Tokens": prompt_tokens,
            "Completion Tokens": completion_tokens
        }
    ])

    display(df)
    
    # The field-by-field comparison needs an expected output, so it is shown only when one exists.
    if accuracy is not None:
        display(get_extraction_comparison(expected_dict, invoice_dict, confidence, accuracy['accuracy']))