## Setup

In [1]:
import sys
import os
from pathlib import Path
# Absolute path four directory levels above the current working directory
# (presumably the repository root -- confirm against where the notebook is run from).
wdir = os.path.abspath('../../../../')
# Extend the module search path before the `samples.*` imports further down in this cell.
sys.path.append(os.path.join(wdir, 'samples/python/modules/')) # Import local modules

from IPython.display import display, Markdown
import os
import pandas as pd
from dotenv import dotenv_values
import base64
import io
import json
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentContentFormat
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from concurrent.futures import ThreadPoolExecutor
from pdf2image import convert_from_bytes

from samples.app_settings import AppSettings
from samples.utils.stopwatch import Stopwatch
from samples.utils.storage_utils import create_json_file
from samples.models.document_processing_result import DataExtractionResult

from samples.models.classification import Classifications
from samples.confidence.confidence_utils import merge_confidence_values
from samples.confidence.openai_confidence import evaluate_confidence as evaluate_openai_confidence
from samples.confidence.document_intelligence_confidence import evaluate_confidence as evaluate_di_confidence
from samples.evaluation.accuracy_evaluator import AccuracyEvaluator
from samples.evaluation.comparison import get_extraction_comparison

In [2]:
# Settings are read from the `.env` file located directly under `wdir`.
app_settings = AppSettings(dotenv_values(Path(wdir) / '.env'))
scenario_path = Path(wdir) / 'samples/python/scenarios/us_tax/'
scenario_name = "us-tax-1040-extraction"

# Credential sources switched off in the DefaultAzureCredential chain. Anything not
# listed here keeps the library default (the stated intent for this notebook is to
# authenticate with Azure CLI credentials).
credential_exclusions = {
    "exclude_workload_identity_credential": True,
    "exclude_developer_cli_credential": True,
    "exclude_environment_credential": True,
    "exclude_managed_identity_credential": True,
    "exclude_powershell_credential": True,
    "exclude_shared_token_cache_credential": True,
    "exclude_interactive_browser_credential": True,
}
credential = DefaultAzureCredential(**credential_exclusions)

# Token provider bound to the Cognitive Services scope; handed to the Azure OpenAI client below.
openai_token_provider = get_bearer_token_provider(credential, 'https://cognitiveservices.azure.com/.default')

openai_client = AzureOpenAI(
    api_version="2024-12-01-preview", # API version chosen for structured outputs support.
    azure_endpoint=app_settings.openai_endpoint,
    azure_ad_token_provider=openai_token_provider,
)

# The Document Intelligence client shares the same credential object.
document_intelligence_client = DocumentIntelligenceClient(
    credential=credential,
    endpoint=app_settings.ai_services_endpoint,
)

In [None]:
# Locate the evaluation inputs: a JSON metadata file that names the PDF to process.
inputs_path = Path(wdir) / 'samples/assets/us_tax/'
metadata_fname = "us-tax-1040.json" # Change this to the file you want to evaluate
metadata_fpath = inputs_path / metadata_fname

# JSON text is UTF-8; state it explicitly so the read does not depend on the
# platform's locale encoding (e.g. cp1252 on Windows).
with open(metadata_fpath, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# The metadata's 'fname' entry is the PDF file name, resolved next to the metadata file.
pdf_fname = metadata['fname']
pdf_fpath = inputs_path / pdf_fname

us_tax_1040_evaluator = AccuracyEvaluator(match_keys=[])

In [4]:
# Encoded page images (the dicts returned by encode_page), filled in page order below.
# Reset here so that re-running this cell does not accumulate duplicates.
page_images = []

def encode_page(page):
    """Render a page as PNG and wrap it as a base64 data-URL image content part.

    Args:
        page: Any object exposing ``save(fp, format=...)``; in this notebook these
            are the page images returned by ``convert_from_bytes``.

    Returns:
        dict: ``{"type": "image_url", "image_url": {"url": "data:image/png;base64,<payload>"}}``
    """
    with io.BytesIO() as png_buffer:
        page.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()

    payload = base64.b64encode(png_bytes).decode('utf-8')
    data_url = "data:image/png;base64," + payload

    return {"type": "image_url", "image_url": {"url": data_url}}

# Time the whole PDF -> page images -> base64 content parts pipeline.
with Stopwatch() as image_stopwatch:
    with open(pdf_fpath, "rb") as f:
        document_bytes = f.read()

    # Rasterize the PDF into one image per page.
    pages = convert_from_bytes(document_bytes)

    # Encode the pages concurrently on a thread pool (threads, not processes).
    # executor.map yields results in input order, so page order is preserved.
    with ThreadPoolExecutor() as executor:
        page_images.extend(executor.map(encode_page, pages))

## Classify

In [5]:
# Candidate document types offered to the classifier. Each entry pairs the label
# with a short description of the form; the list is embedded as JSON in the system prompt.
classifications = [
    {"classification": label, "description": summary}
    for label, summary in (
        (
            "US Tax Form 1040",
            "A US individual income tax return form, used to report annual income tax return filed by citizens or residents of the United States.",
        ),
        (
            "US Tax Form 1040 - Schedule 1",
            "A form used to report additional income and adjustments to income.",
        ),
        (
            "US Tax Form 1040 - Schedule 2",
            "A form used to report additional taxes.",
        ),
        (
            "US Tax Form 1040 - Schedule 3",
            "A form used to report nonrefundable credits.",
        ),
        (
            "US Tax Form 1040 - Schedule A",
            "A form used to report itemized deductions for individual income tax returns.",
        ),
        (
            "US Tax Form 1040 - Schedule B",
            "A form used to report interest and ordinary dividends.",
        ),
        (
            "US Tax Form 1040 - Schedule C",
            "A form used to report income or loss from a business operated or a profession practiced as a sole proprietor.",
        ),
        (
            "US Tax Form 1040 - Schedule D",
            "A form used to report capital gains and losses.",
        ),
        (
            "US Tax Form 1040 - Schedule E",
            "A form used to report income or loss from rental real estate, royalties, partnerships, S corporations, estates, trusts, etc.",
        ),
        (
            "US Tax Form 1040 - Schedule EIC",
            "A form used to claim the Earned Income Credit (EIC).",
        ),
        (
            "US Tax Form 1040 - Schedule F",
            "A form used to report profit or loss from farming.",
        ),
        (
            "US Tax Form 1040 - Schedule H",
            "A form used to report household employment taxes.",
        ),
        (
            "US Tax Form 1040 - Schedule J",
            "A form used to figure the tax on a qualified farmer's or fisherman's income.",
        ),
        (
            "US Tax Form 1040 - Schedule R",
            "A form used to claim the Credit for the Elderly or the Disabled.",
        ),
        (
            "US Tax Form 1040 - Schedule SE",
            "A form used to calculate self-employment tax.",
        ),
        (
            "US Tax Form 1040 - Schedule 8812",
            "A form used to claim the Child Tax Credit and Credit for Other Dependents.",
        ),
    )
]

In [6]:
# Maps each classification label to the Document Intelligence model ID used to extract it.
# Every schedule follows the same naming pattern, so those entries are derived from the
# schedule suffix; the base 1040 form is listed explicitly.
classification_model_map = {"US Tax Form 1040": "prebuilt-tax.us.1040"}
classification_model_map.update(
    (f"US Tax Form 1040 - Schedule {suffix}", f"prebuilt-tax.us.1040Schedule{suffix}")
    for suffix in ("1", "2", "3", "A", "B", "C", "D", "E", "EIC", "F", "H", "J", "R", "SE", "8812")
)

In [7]:
# System message: states the boundary-detection task, then embeds the candidate
# classifications as JSON under a "## Classifications" heading. The trailing empty
# item keeps the final newline after the JSON payload.
classify_system_prompt = "\n".join([
    "You are an AI assistant that helps detect the boundaries of sub-section or sub-documents using the provided classifications.",
    "",
    "## Classifications",
    "",
    json.dumps(classifications),
    "",
])

In [8]:
# Content parts (text and page images) that make up the user message of the classification request.
classify_user_content = []

In [9]:
# Instructions for the classifier. Plain string: there are no placeholders to interpolate.
classify_user_prompt = """Classify documents that are a US Tax Form in the provided page images.
- A single classification may span multiple page images.
- A single page image may contain multiple classifications.
- If a page image does not contain a classification, ignore it."""

# Build the user content from scratch inside this cell so that re-running the cell
# does not append the prompt and every page image a second time.
classify_user_content = [{
    "type": "text",
    "text": classify_user_prompt
}]

# For each page image, add a text block with its 1-based page number, followed by the image itself.
# These page numbers are what the returned image ranges refer to.
for page_number, page_image in enumerate(page_images, start=1):
    classify_user_content.append({
        "type": "text",
        "text": f"Page {page_number}:"
    })
    classify_user_content.append(page_image)

In [10]:
# Conversation for the classifier: the system instructions, then the user message
# holding the prompt and the labelled page images.
classify_messages = [
    {"role": "system", "content": classify_system_prompt},
    {"role": "user", "content": classify_user_content},
]

# Time the structured-output request; the response is parsed into the Classifications model.
with Stopwatch() as classify_stopwatch:
    classify_completion = openai_client.beta.chat.completions.parse(
        model=app_settings.gpt4o_model_deployment_name,
        messages=classify_messages,
        response_format=Classifications,
        max_tokens=4096,
        temperature=0.1,
        top_p=0.1,
        logprobs=True, # Enabled to determine the confidence of the response.
    )

In [11]:
# Take the structured result of the first choice. `parsed` can be None when no structured
# output was produced (for example when the model refuses), so fail here with a clear
# message instead of an AttributeError in a later cell.
classify_message = classify_completion.choices[0].message
document_classifications = classify_message.parsed

if document_classifications is None:
    raise ValueError(
        "The classification request returned no parsed result "
        f"(refusal: {getattr(classify_message, 'refusal', None)!r})."
    )

In [12]:
# DEBUG: Print the parsed document classifications as indented JSON
print(document_classifications.model_dump_json(indent=2))

{
  "classifications": [
    {
      "classification": "US Tax Form 1040",
      "image_range_start": 1,
      "image_range_end": 2
    },
    {
      "classification": "US Tax Form 1040 - Schedule 1",
      "image_range_start": 3,
      "image_range_end": 4
    },
    {
      "classification": "US Tax Form 1040 - Schedule 2",
      "image_range_start": 5,
      "image_range_end": 6
    }
  ]
}


In [13]:
# For each document classification, group the page images together.
# Both lists stay index-aligned with document_classifications.classifications.
classified_images = []
classified_images_pil = []

for classification in document_classifications.classifications:
    # Ranges are 1-based and inclusive (pages are labelled "Page 1", "Page 2", ... in the prompt).
    # They come from the model, so clamp them to the pages that actually exist: a start below 1
    # would otherwise become a negative slice index and select the wrong pages.
    image_range_start = max(classification.image_range_start, 1)
    image_range_end = min(classification.image_range_end, len(pages))

    # Fail here rather than later: an empty group cannot be turned into a PDF, and dropping
    # it silently would break the index alignment with the classification list.
    if image_range_start > image_range_end:
        raise ValueError(
            f"Classification '{classification.classification}' has an invalid image range "
            f"{classification.image_range_start}-{classification.image_range_end} "
            f"for a document with {len(pages)} page(s)."
        )

    classified_images.append(page_images[image_range_start - 1:image_range_end])
    classified_images_pil.append(pages[image_range_start - 1:image_range_end])

## Extract

In [15]:
# Run each classified page group through its matching Document Intelligence model.
for i, images in enumerate(classified_images):
    classification = document_classifications.classifications[i]

    print(f"Classification {i} - {classification.classification}")

    images_pil = classified_images_pil[i]
    if not images_pil:
        # Nothing to build a PDF from (empty page range).
        print(f"Skipping classification {i}: no page images in its range.")
        continue

    # Use the classification of the document to determine the Azure AI Document Intelligence model to use.
    # The label is model output, so it may not be one of the mapped values.
    model_id = classification_model_map.get(classification.classification)
    if model_id is None:
        print(f"Skipping classification {i}: no model mapped for '{classification.classification}'.")
        continue

    # Re-assemble this group's pages into a single in-memory PDF.
    # NOTE(review): resolution=100.0 sets the PDF's nominal DPI and therefore the page
    # dimensions the service reports coordinates in -- confirm it matches the DPI the
    # pages were rasterized at.
    pdf_bytes = io.BytesIO()
    images_pil[0].save(pdf_bytes, format='PDF', resolution=100.0, save_all=True, append_images=images_pil[1:])
    pdf_bytes.seek(0)

    with Stopwatch() as di_stopwatch:
        poller = document_intelligence_client.begin_analyze_document(
            model_id=model_id,
            body=pdf_bytes,
            content_type="application/pdf"
        )

        result: AnalyzeResult = poller.result()

    # The analysis may contain no extracted documents; guard before indexing.
    if not result.documents:
        print(f"No documents were extracted for classification {i} using model '{model_id}'.")
        continue

    doc_result = result.documents[0].fields

    # DEBUG: Display the document intelligence content
    display(doc_result)

Classification 0 - US Tax Form 1040


{'TaxYear': {'type': 'string', 'valueString': '2024', 'content': '2024', 'boundingRegions': [{'pageNumber': 1, 'polygon': [8.2608, 1.0557, 9.7507, 1.0532, 9.7516, 1.5611, 8.2616, 1.5636]}], 'confidence': 0.971, 'spans': [{'offset': 96, 'length': 4}]},
 'Taxpayer': {'type': 'object', 'valueObject': {'SSN': {'type': 'string', 'valueString': '400-12-3456', 'content': '4 0 0 1 2 3 4 5 6', 'boundingRegions': [{'pageNumber': 1, 'polygon': [13.103, 2.458, 15.9378, 2.4605, 15.9375, 2.7561, 13.1027, 2.7536]}], 'confidence': 0.925, 'spans': [{'offset': 395, 'length': 17}]}, 'LastName': {'type': 'string', 'valueString': 'Sample', 'content': 'Sample', 'boundingRegions': [{'pageNumber': 1, 'polygon': [6.6737, 2.488, 7.4937, 2.491, 7.4928, 2.7458, 6.6728, 2.7427]}], 'confidence': 0.975, 'spans': [{'offset': 388, 'length': 6}]}, 'FirstNameAndInitials': {'type': 'string', 'valueString': 'John Q.', 'content': 'John Q.', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.034, 2.4754, 1.8797, 2.4776, 1.

Classification 1 - US Tax Form 1040 - Schedule 1


{'TaxYear': {'type': 'string', 'valueString': '2024', 'content': '2024', 'boundingRegions': [{'pageNumber': 1, 'polygon': [14.0631, 1.3724, 15.5614, 1.3729, 15.5612, 1.8601, 14.0629, 1.8596]}], 'confidence': 0.967, 'spans': [{'offset': 127, 'length': 4}]},
 'Taxpayer': {'type': 'object', 'valueObject': {'SSN': {'type': 'string', 'valueString': '400-12-3456', 'content': '400-123-456', 'boundingRegions': [{'pageNumber': 1, 'polygon': [13.5823, 2.6512, 14.8755, 2.6541, 14.875, 2.8898, 13.5818, 2.8869]}], 'confidence': 0.953, 'spans': [{'offset': 374, 'length': 11}]}, 'Name': {'type': 'string', 'valueString': 'John Q. Sample', 'content': 'John Q. Sample', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.0255, 2.6385, 2.7572, 2.6468, 2.7559, 2.9207, 1.0242, 2.9124]}], 'confidence': 0.884, 'spans': [{'offset': 331, 'length': 14}]}}},
 'Box1': {'type': 'number', 'valueNumber': 0, 'content': '0.00', 'boundingRegions': [{'pageNumber': 1, 'polygon': [15.5104, 4.8573, 15.9762, 4.8561, 15.9767,

Classification 2 - US Tax Form 1040 - Schedule 2


{'TaxYear': {'type': 'string', 'valueString': '2024', 'content': '2024', 'boundingRegions': [{'pageNumber': 1, 'polygon': [14.0595, 1.3629, 15.5615, 1.3639, 15.5612, 1.8601, 14.0592, 1.8591]}], 'confidence': 0.974, 'spans': [{'offset': 224, 'length': 4}]},
 'Taxpayer': {'type': 'object', 'valueObject': {'SSN': {'type': 'string', 'valueString': '400-12-3456', 'content': '400-123-456', 'boundingRegions': [{'pageNumber': 1, 'polygon': [13.5756, 2.6427, 14.8754, 2.6449, 14.875, 2.8898, 13.5752, 2.8876]}], 'confidence': 0.963, 'spans': [{'offset': 347, 'length': 11}]}, 'Name': {'type': 'string', 'valueString': 'John Q. Sample', 'content': 'John Q. Sample', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.0192, 2.6288, 2.7564, 2.6367, 2.7551, 2.922, 1.0179, 2.914]}], 'confidence': 0.918, 'spans': [{'offset': 304, 'length': 14}]}}},
 'Box1': {'type': 'string', 'confidence': 0.713},
 'Box2': {'type': 'number', 'valueNumber': 0, 'content': '0.00', 'boundingRegions': [{'pageNumber': 1, 'polyg