## Setup

In [118]:
import sys
import os
from pathlib import Path
# Resolve the directory four levels above the current working directory
# (presumably the repository root - confirm if the notebook is moved).
wdir = os.path.abspath('../../../../')

# Make the local sample modules importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
modules_path = os.path.join(wdir, 'samples/python/modules/')
if modules_path not in sys.path:
    sys.path.append(modules_path) # Import local modules

from IPython.display import display, Markdown
import os
import pandas as pd
from dotenv import dotenv_values
import base64
import io
import json
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentContentFormat
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from concurrent.futures import ThreadPoolExecutor
from pdf2image import convert_from_bytes

from samples.app_settings import AppSettings
from samples.utils.stopwatch import Stopwatch
from samples.utils.storage_utils import create_json_file
from samples.models.document_processing_result import DataExtractionResult

from samples.models.invoice import Invoice
from samples.models.classification import Classifications
from samples.confidence.confidence_utils import merge_confidence_values
from samples.confidence.openai_confidence import evaluate_confidence as evaluate_openai_confidence
from samples.confidence.document_intelligence_confidence import evaluate_confidence as evaluate_di_confidence
from samples.evaluation.accuracy_evaluator import AccuracyEvaluator
from samples.evaluation.comparison import get_extraction_comparison

In [None]:
# Load the application settings from the .env file under `wdir`.
app_settings = AppSettings(dotenv_values(Path(os.path.join(wdir, '.env'))))

# Output location and file-name prefix used when saving extraction results.
scenario_path = Path(os.path.join(wdir, 'samples/python/scenarios/invoices/'))
scenario_name = "invoice-extraction"

# Configure the default credential for accessing Azure services using Azure CLI credentials.
# The other credential sources in the DefaultAzureCredential chain are explicitly excluded below.
credential = DefaultAzureCredential(
    exclude_workload_identity_credential=True,
    exclude_developer_cli_credential=True,
    exclude_environment_credential=True,
    exclude_managed_identity_credential=True,
    exclude_powershell_credential=True,
    exclude_shared_token_cache_credential=True,
    exclude_interactive_browser_credential=True
)

# Token provider that supplies Entra ID bearer tokens for the Cognitive Services scope.
openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

# Azure OpenAI client authenticated with the token provider (no API key).
# The API version is read from `app_settings`, like every other setting in this notebook;
# the previous reference to an undefined `settings` name raised a NameError.
openai_client = AzureOpenAI(
    azure_endpoint=app_settings.azure_openai_endpoint,
    azure_ad_token_provider=openai_token_provider,
    api_version=app_settings.azure_openai_api_version
)

# Document Intelligence client sharing the same credential.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=app_settings.azure_ai_services_endpoint,
    credential=credential
)

In [120]:
# Locate the metadata file that describes the invoice under evaluation.
inputs_path = Path(os.path.join(wdir, 'samples/assets/invoices/'))
metadata_fname = "invoice_6.json" # Change this to the file you want to evaluate
metadata_fpath = Path(os.path.join(inputs_path, metadata_fname))

# Read the JSON as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(metadata_fpath, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Every top-level key ending in '_expected' is turned into an expected Invoice.
# The extract loop later pairs these by position with the classified documents.
expected_outputs = [
    Invoice(**expected)
    for key, expected in metadata.items()
    if key.endswith('_expected')
]

# The metadata names the PDF to process, relative to the same inputs folder.
pdf_fname = metadata['fname']
pdf_fpath = Path(os.path.join(inputs_path, pdf_fname))

# When True, the Document Intelligence layout markdown is sent to the model alongside the page images.
use_document_intelligence = True

# Accuracy evaluator configured with the keys 'product_code' and 'description' as match keys
# (presumably used to pair up list items such as invoice lines - confirm in AccuracyEvaluator).
invoice_evaluator = AccuracyEvaluator(match_keys=['product_code', 'description'])

In [121]:
# One image content block per PDF page; filled once the PDF has been converted.
page_images = []

def encode_page(page):
    """Serialize a page image to PNG and wrap it as a base64 data-URL content block.

    Args:
        page: An image object exposing ``save(fp, format=...)``
            (presumably a PIL image produced by pdf2image - confirm).

    Returns:
        dict: ``{"type": "image_url", "image_url": {"url": "data:image/png;base64,<data>"}}``.
    """
    with io.BytesIO() as png_buffer:
        page.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()

    encoded_png = base64.b64encode(png_bytes).decode('utf-8')
    data_url = f"data:image/png;base64,{encoded_png}"
    return {"type": "image_url", "image_url": {"url": data_url}}

# Time the conversion of the PDF into base64-encoded page images.
with Stopwatch() as image_stopwatch:
    # Read the raw PDF and convert it into one image per page.
    document_bytes = Path(pdf_fpath).read_bytes()
    pages = convert_from_bytes(document_bytes)

    # Encode the pages concurrently on a thread pool; executor.map returns
    # the results in the same order as the input pages.
    with ThreadPoolExecutor() as executor:
        encoded_pages = list(executor.map(encode_page, pages))
        page_images.extend(encoded_pages)

## Classify

In [122]:
# Document types the classifier may assign to a range of page images.
classifications = [
    dict(
        classification="Invoice/Credit Note",
        description="A document that serves as a bill for goods or services provided, often used for payment processing and record-keeping.",
    ),
]

In [123]:
# System prompt for the classification call: the instruction followed by the
# available classifications serialized as JSON.
classify_system_prompt = (
    "You are an AI assistant that helps detect the boundaries of sub-section or sub-documents using the provided classifications.\n"
    "\n"
    "## Classifications\n"
    "\n"
    f"{json.dumps(classifications)}\n"
)

In [124]:
# Content blocks (text and images) for the classification request's user message.
classify_user_content = list()

In [125]:
# Instruction text for the classification request (plain string: it has no placeholders).
classify_user_prompt = """Classify documents that are an invoice/credit note in the provided page images.
- A single classification may span multiple page images.
- A single page image may contain multiple classifications.
- If a page image does not contain a classification, ignore it."""

# Rebuild the content list from scratch in this cell. Appending to a list created
# in an earlier cell meant that re-running this cell duplicated the prompt and
# every page image in the request.
classify_user_content = [
    {
        "type": "text",
        "text": classify_user_prompt
    }
]

# For each page image, add a text block with the 1-based page number followed by the image block,
# so the model can refer to pages by number in its answer.
for page_number, page_image in enumerate(page_images, start=1):
    classify_user_content.append({
        "type": "text",
        "text": f"Page {page_number}:"
    })
    classify_user_content.append(page_image)

In [None]:
# Time the classification request so it can be reported with the other stages.
with Stopwatch() as classify_stopwatch:
    # System prompt first, then the user message holding the instruction and page images.
    classify_messages = [
        {"role": "system", "content": classify_system_prompt},
        {"role": "user", "content": classify_user_content},
    ]

    # Request a response parsed into the Classifications model.
    classify_completion = openai_client.beta.chat.completions.parse(
        model=app_settings.azure_openai_chat_deployment,
        messages=classify_messages,
        response_format=Classifications,
        max_tokens=4096,
        temperature=0.1,
        top_p=0.1,
        logprobs=True,  # Enabled to determine the confidence of the response.
    )

In [127]:
# Take the parsed Classifications object from the first choice.
# `parsed` is None when the model refuses or returns nothing parseable; fail here
# with an explicit message instead of an AttributeError in a later cell.
classify_message = classify_completion.choices[0].message
if classify_message.parsed is None:
    raise ValueError(
        f"Classification returned no parsed result (refusal: {getattr(classify_message, 'refusal', None)!r})."
    )

document_classifications = classify_message.parsed

In [128]:
# DEBUG: Pretty-print the parsed document classifications as indented JSON
classifications_json = document_classifications.model_dump_json(indent=2)
print(classifications_json)

{
  "classifications": [
    {
      "classification": "Invoice/Credit Note",
      "image_range_start": 1,
      "image_range_end": 2
    }
  ]
}


In [129]:
# For each document classification, group the page images together.
# `classified_images` holds the encoded image blocks (sent to OpenAI) and
# `classified_images_pil` the matching page objects (re-assembled into a PDF later).
classified_images = []
classified_images_pil = []

for classification in document_classifications.classifications:
    # Ranges are 1-based and inclusive, matching the "Page N:" labels in the prompt.
    # The bounds come from the model, so clamp them to the pages that exist: a start
    # below 1 would otherwise turn into a negative slice index and select the wrong pages.
    image_range_start = max(classification.image_range_start, 1)
    image_range_end = min(classification.image_range_end, len(page_images))

    # An empty group would fail later when its first page is accessed, so skip it.
    if image_range_start > image_range_end:
        print(
            f"Skipping classification '{classification.classification}': "
            f"invalid page range {classification.image_range_start}-{classification.image_range_end}"
        )
        continue

    classified_images.append(page_images[image_range_start - 1:image_range_end])
    classified_images_pil.append(pages[image_range_start - 1:image_range_end])

## Extract

In [None]:
# The prompts are the same for every classification, so they are built once before the loop.
extract_system_prompt = "You are an AI assistant that extracts data from documents."

# NOTE: the trailing space on the first line and the four leading spaces on each bullet
# are part of the prompt text as originally sent to the model; they are written out
# explicitly here so the request content is unchanged.
extract_user_prompt = (
    "Extract the data from this invoice. \n"
    "    - If a value is not present, provide null.\n"
    "    - It is possible that there are multiple invoices in the same document across multiple pages.\n"
    "    - Some values must be inferred based on the content defined in the invoice.\n"
    "    - Dates should be in the format YYYY-MM-DD."
)

# Run extraction, evaluation, and reporting once per classified group of pages.
for i, images in enumerate(classified_images):
    print(f"Classification {i}")

    images_pil = classified_images_pil[i]

    # Re-assemble only this classification's pages into an in-memory PDF.
    pdf_bytes = io.BytesIO()
    images_pil[0].save(pdf_bytes, format='PDF', resolution=100.0, save_all=True, append_images=images_pil[1:])
    pdf_bytes.seek(0)

    # Reset per iteration: with Document Intelligence disabled these names were previously
    # undefined (NameError) or still held the values of an earlier document/run.
    result = None
    markdown = None

    with Stopwatch() as di_stopwatch:
        if use_document_intelligence:
            poller = document_intelligence_client.begin_analyze_document(
                model_id="prebuilt-layout",
                body=pdf_bytes,
                output_content_format=DocumentContentFormat.MARKDOWN,
                content_type="application/pdf"
            )

            result: AnalyzeResult = poller.result()

            markdown = result.content

    # DEBUG: Display the document intelligence markdown content
    if markdown is not None:
        display(Markdown(markdown))

    # Prepare the user content for the OpenAI API including any specific details for processing this type of document, text, and the document page images.
    extract_user_content = [
        {
            "type": "text",
            "text": extract_user_prompt
        }
    ]

    if use_document_intelligence:
        extract_user_content.append({
            "type": "text",
            "text": markdown
        })

    extract_user_content.extend(images)

    with Stopwatch() as oai_stopwatch:
        completion = openai_client.beta.chat.completions.parse(
            model=app_settings.azure_openai_chat_deployment,
            messages=[
                {
                    "role": "system",
                    "content": extract_system_prompt,
                },
                {
                    "role": "user",
                    "content": extract_user_content
                }
            ],
            response_format=Invoice,
            max_tokens=4096,
            temperature=0.1,
            top_p=0.1,
            logprobs=True # Enabled to determine the confidence of the response.
        )

    # Gets the parsed Invoice object from the completion response.
    # `parsed` is None when the model refuses or the output could not be parsed;
    # stop with an explicit message rather than failing on `None.model_dump()`.
    extract_message = completion.choices[0].message
    invoice = extract_message.parsed
    if invoice is None:
        raise ValueError(
            f"Extraction for classification {i} returned no parsed result "
            f"(refusal: {getattr(extract_message, 'refusal', None)!r})."
        )

    # Expected outputs are paired with classifications by position; there may be fewer of them.
    expected_dict = expected_outputs[i].model_dump() if i < len(expected_outputs) else None
    invoice_dict = invoice.model_dump()

    if expected_dict is not None:
        accuracy = invoice_evaluator.evaluate(expected=expected_dict, actual=invoice_dict)
    else:
        accuracy = None

    # Determines the confidence of the extracted data.
    oai_confidence = evaluate_openai_confidence(invoice_dict, completion.choices[0])

    if use_document_intelligence:
        di_confidence = evaluate_di_confidence(invoice_dict, result)
        confidence = merge_confidence_values(di_confidence, oai_confidence)
    else:
        confidence = oai_confidence

    # DEBUG: Gets the total execution time of the data extraction process.
    # The classify and image pre-processing times are shared by all classifications and are
    # included in each document's total.
    total_elapsed = classify_stopwatch.elapsed + di_stopwatch.elapsed + image_stopwatch.elapsed + oai_stopwatch.elapsed

    # DEBUG: Gets the prompt tokens and completion tokens from the completion response.
    prompt_tokens = completion.usage.prompt_tokens
    completion_tokens = completion.usage.completion_tokens

    # DEBUG: Save the output of the data extraction result.
    extraction_result = DataExtractionResult(invoice_dict, confidence, accuracy, prompt_tokens, completion_tokens, total_elapsed)
    create_json_file(f"{scenario_path}/{scenario_name}.{pdf_fname}_{i}.json", extraction_result)

    display_accuracy = f"{accuracy['overall'] * 100:.2f}%" if accuracy is not None else "N/A"

    # DEBUG: Display the outputs of the data extraction process.
    df = pd.DataFrame([
        {
            "Document": pdf_fname,
            "Index": i,
            "Accuracy": display_accuracy,
            "Confidence": f"{confidence['_overall'] * 100:.2f}%",
            "Execution Time": f"{total_elapsed:.2f} seconds",
            "Classify Execution Time": f"{classify_stopwatch.elapsed:.2f} seconds",
            "Document Intelligence Execution Time": f"{di_stopwatch.elapsed:.2f} seconds",
            "Image Pre-processing Execution Time": f"{image_stopwatch.elapsed:.2f} seconds",
            "OpenAI Execution Time": f"{oai_stopwatch.elapsed:.2f} seconds",
            "Prompt Tokens": prompt_tokens,
            "Completion Tokens": completion_tokens
        }
    ])

    display(df)

    # The field-by-field comparison is only available when an expected output exists.
    if accuracy is not None:
        display(get_extraction_comparison(expected_dict, invoice_dict, confidence, accuracy['accuracy']))

Classification 0


NEXGEN
Innovation Drives Progress

Unit 4, Innovation Drive

Milton Keynes

Buckinghamshire

MK9 1FH

Phone: +44 (0)1234 567890
info@nexgen.co.uk

BILL TO

Alpha Tech Solutions

22 Regent Street

London

SWIY 4PD

SHIP TO

AlphaTech Solutions

Warehouse 5, Met Business Park

Leicester

LE19 1WD


<table>
<tr>
<th>Invoice No.</th>
<th>Invoice Date</th>
<th>Ship Date</th>
<th>Purchase Order</th>
<th>Customer Ref.</th>
</tr>
<tr>
<td>NG-INV-2025-00123</td>
<td>2025-04-24</td>
<td>2025-04-19</td>
<td>PO-556423</td>
<td>ALTEC-SLN</td>
</tr>
</table>


<table>
<tr>
<th>Product Code</th>
<th>Description</th>
<th>Units</th>
<th>Excl.</th>
<th>VAT</th>
<th>Incl.</th>
</tr>
<tr>
<td>10323</td>
<td>MAILBOX 25PK</td>
<td>245</td>
<td>3787.70</td>
<td>568.16</td>
<td>☒ 4355.86</td>
</tr>
</table>


15%

COMMENTS
Expect payment by 6 May !

REMIT PAYMENT TO

NEXGEN

Unit 4, Innovation Drive

Milton Keynes

Buckinghamshire

MK9 1FH


<table>
<tr>
<td rowspan="2">Name:</td>
<td>Julia Patel</td>
</tr>
<tr>
<td></td>
</tr>
<tr>
<td>Signature:</td>
<td>TRã</td>
</tr>
<tr>
<td colspan="2">29/4/25 Date:</td>
</tr>
</table>


** Please sign and return to info@nexgen.co.uk **
Recipient has 30 days from date of issue of this invoice to render queries, thereafter all queries will be null and void
REGULATIONS REQUIRE PAYMENT WITHIN 45 DAYS

<!-- PageBreak -->


<table>
<tr>
<th>Product</th>
<th>Location</th>
<th>Per Unit</th>
<th>Units</th>
<th>Excl.</th>
</tr>
<tr>
<td>10323 - Mailbox 25pk</td>
<td>1011</td>
<td>15.46</td>
<td>11</td>
<td>170.06</td>
</tr>
<tr>
<td>245x 25 = 615 total.</td>
<td>1012</td>
<td>15.46</td>
<td>5</td>
<td>77.30</td>
</tr>
<tr>
<td></td>
<td>1025</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>1028</td>
<td>15.46</td>
<td>1</td>
<td>15.46</td>
</tr>
<tr>
<td></td>
<td>1029</td>
<td>15.46</td>
<td>9</td>
<td>139.14</td>
</tr>
<tr>
<td></td>
<td>1042</td>
<td>15.46</td>
<td>10</td>
<td>154.60</td>
</tr>
<tr>
<td></td>
<td>1044</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>1048</td>
<td>15.46</td>
<td>15</td>
<td>231.90</td>
</tr>
<tr>
<td></td>
<td>1050</td>
<td>15.46</td>
<td>3</td>
<td>46.38</td>
</tr>
<tr>
<td></td>
<td>1055</td>
<td>15.46</td>
<td>13</td>
<td>200.98</td>
</tr>
<tr>
<td></td>
<td>1059</td>
<td>15.46</td>
<td>11</td>
<td>170.06</td>
</tr>
<tr>
<td></td>
<td>1060</td>
<td>15.46</td>
<td>9</td>
<td>139.14</td>
</tr>
<tr>
<td></td>
<td>1073</td>
<td>15.46</td>
<td>16</td>
<td>247.36</td>
</tr>
<tr>
<td></td>
<td>1077</td>
<td>15.46</td>
<td>5</td>
<td>77.30</td>
</tr>
<tr>
<td></td>
<td>1085</td>
<td>15.46</td>
<td>3</td>
<td>46.38</td>
</tr>
<tr>
<td></td>
<td>2042</td>
<td>15.46</td>
<td>6</td>
<td>92.76</td>
</tr>
<tr>
<td></td>
<td>2051</td>
<td>15.46</td>
<td>10</td>
<td>154.60</td>
</tr>
<tr>
<td></td>
<td>2054</td>
<td>15.46</td>
<td>11</td>
<td>170.06</td>
</tr>
<tr>
<td></td>
<td>2059</td>
<td>15.46</td>
<td>9</td>
<td>139.14</td>
</tr>
<tr>
<td></td>
<td>3001</td>
<td>15.46</td>
<td>15</td>
<td>231.90</td>
</tr>
<tr>
<td></td>
<td>3009</td>
<td>15.46</td>
<td>3</td>
<td>46.38</td>
</tr>
<tr>
<td></td>
<td>3010</td>
<td>15.46</td>
<td>2</td>
<td>30.92</td>
</tr>
<tr>
<td></td>
<td>3046</td>
<td>15.46</td>
<td>14</td>
<td>216.44</td>
</tr>
<tr>
<td></td>
<td>3063</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>3071</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>3078</td>
<td>15.46</td>
<td>5</td>
<td>77.30</td>
</tr>
<tr>
<td></td>
<td>3100</td>
<td>15.46</td>
<td>3</td>
<td>46.38</td>
</tr>
<tr>
<td></td>
<td>5911</td>
<td>15.46</td>
<td>13</td>
<td>200.98</td>
</tr>
<tr>
<td></td>
<td>6420</td>
<td>15.46</td>
<td>6</td>
<td>92.76</td>
</tr>
<tr>
<td></td>
<td>6431</td>
<td>15.46</td>
<td>10</td>
<td>154.60</td>
</tr>
<tr>
<td></td>
<td>7101</td>
<td>15.46</td>
<td>2</td>
<td>30.92</td>
</tr>
<tr>
<td></td>
<td>7110</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>7337</td>
<td>15.46</td>
<td>4</td>
<td>61.84</td>
</tr>
<tr>
<td></td>
<td>9009</td>
<td>15.46</td>
<td>1</td>
<td>15.46</td>
</tr>
<tr>
<td>Total</td>
<td></td>
<td></td>
<td>245</td>
<td>£3787.70</td>
</tr>
</table>


Unnamed: 0,Document,Index,Accuracy,Confidence,Execution Time,Classify Execution Time,Document Intelligence Execution Time,Image Pre-processing Execution Time,OpenAI Execution Time,Prompt Tokens,Completion Tokens
0,invoice_6.pdf,0,91.23%,95.94%,32.58 seconds,10.57 seconds,6.11 seconds,2.65 seconds,13.23 seconds,5715,350


Unnamed: 0,Field,Expected,Extracted,Confidence,Accuracy
0,customer_address_city,London,London,99.60%,Match
1,customer_address_country,UK,,0.00%,Mismatch
2,customer_address_postal_code,SWIY 4PD,SWIY 4PD,52.30%,Match
3,customer_address_state,,,0.00%,Match
4,customer_address_street,22 Regent Street,22 Regent Street,99.60%,Match
5,customer_name,AlphaTech Solutions,Alpha Tech Solutions,98.57%,Mismatch
6,customer_signature_date,2025-04-29,2025-04-29,98.95%,Match
7,customer_signature_has_written_signature,True,True,0.00%,Match
8,customer_signature_signatory,Julia Patel,Julia Patel,86.50%,Match
9,customer_tax_id,,,0.00%,Match
