In [28]:
import sys
sys.path.append('../../modules/') # Import local modules

from IPython.display import display, Markdown
import os
import pandas as pd
from dotenv import dotenv_values
import json
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentContentFormat
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from concurrent.futures import ThreadPoolExecutor

from samples.app_settings import AppSettings
from samples.utils.stopwatch import Stopwatch
from samples.utils.storage_utils import create_json_file
from samples.models.document_processing_result import DataExtractionResult

from samples.models.invoice import Invoice
from samples.confidence.confidence_utils import merge_confidence_values
from samples.confidence.openai_confidence import evaluate_confidence as evaluate_openai_confidence
from samples.confidence.document_intelligence_confidence import evaluate_confidence as evaluate_di_confidence
from samples.evaluation.accuracy_evaluator import AccuracyEvaluator
from samples.evaluation.comparison import get_extraction_comparison
from samples.utils.custom_json_encoder import CustomJsonEncoder

In [29]:
# Resolve the repository root relative to the notebook's current directory and
# load the application settings from the .env file found there
working_dir = os.path.abspath('../../../../')
settings = AppSettings(dotenv_values(f"{working_dir}/.env"))
sample_name = "document-extraction-prebuilt-invoice"
sample_path = f"{working_dir}/samples/python/demo/gp"

# Build the default credential with each credential source listed below switched off.
# Per the original note, the intent is to authenticate with Azure CLI credentials.
credential = DefaultAzureCredential(**{
    f"exclude_{source}_credential": True
    for source in (
        "workload_identity",
        "developer_cli",
        "environment",
        "managed_identity",
        "powershell",
        "shared_token_cache",
        "interactive_browser",
    )
})

# Document Intelligence resource that the client below talks to
endpoint = "https://di-gp-di-weu.cognitiveservices.azure.com/"

document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=credential)

In [30]:
# Invoice to analyze: file name first, then the asset folder it lives in
pdf_fname = 'invoice_6.pdf'
path = f"{working_dir}/samples/assets/invoices/"

# `path` already ends with a slash, so plain concatenation yields the full file path
pdf_fpath = path + pdf_fname

In [31]:
# The stopwatch spans both starting the analysis and waiting for its result
with Stopwatch() as di_stopwatch:
    # The PDF handle is closed once the analyze request has been started;
    # waiting on the poller happens after the file is closed
    with open(pdf_fpath, mode="rb") as pdf_stream:
        poller = document_intelligence_client.begin_analyze_document(
            model_id="prebuilt-invoice",
            body=pdf_stream,
            content_type="application/pdf",
        )

    # Block until the long-running operation completes
    result: AnalyzeResult = poller.result()

In [32]:
# Serialize through the model's public `as_dict()` method instead of reaching into
# its private `_data` attribute, so this cell does not depend on SDK internals
result_json = json.dumps(result.as_dict(), cls=CustomJsonEncoder, indent=4)

# Heading with the elapsed time measured by the stopwatch around the analysis call
display(Markdown(f"### Document Intelligence Result\nTook {di_stopwatch.elapsed:.2f} seconds"))
# NOTE: this prints the complete analysis JSON, which can be very long
print(result_json)

### Document Intelligence Result
Took 14.99 seconds

{
    "apiVersion": "2024-11-30",
    "modelId": "prebuilt-invoice",
    "stringIndexType": "textElements",
    "content": "NEXGEN\nUnit 4, Innovation Drive\nPhone: +44 (0)1234 567890\nMilton Keynes\ninfo@nexgen.co.uk\nInnovation Drives Progress\nBuckinghamshire\nMK9 1FH\nBILL TO\nSHIP TO\nAlphaTech Solutions\nAlphaTech Solutions\n22 Regent Street\nWarehouse 5, Met Business Park\nLondon\nLeicester\nSWIY 4PD\nLE19 1WD\nInvoice No.\nInvoice Date\nShip Date\nPurchase Order\nCustomer Ref.\nNG-INV-2025-00123\n2025-04-24\n2025-04-19\nPO-556423\nALTEC-SLN\nProduct Code\nDescription\nUnits\nExcl.\nVAT\nIncl. :selected:\n10323\nMAILBOX 25PK\n245\n3787.70\n568.16\n4355.86\n15%\nCOMMENTS\nExpect payment by 6 May!\nREMIT PAYMENT TO\nName: Julia Patel\nNEXGEN\nUnit 4, Innovation Drive\nTR\u00e3o\nMilton Keynes\nBuckinghamshire\nSignature:\nMK9 1FH\n29/4/25\nDate:\n** Please sign and return to info@nexgen.co.uk **\nRecipient has 30 days from date of issue of this invoice to render queries, thereafte