## Setup

**Before running this notebook, ensure you have selected the correct Python kernel. If running in the `devcontainer` environment, this is likely to be Python 3.12.11 at `/usr/local/python/current/bin/python`.**

![Example devcontainer notebook kernel](../../../../images/python-notebook-kernel.png)

In [1]:
import sys
import os
from pathlib import Path
# Absolute path four directories above the current working directory
# (presumably the repository root - the .env file and `samples/` are resolved from it).
wdir = os.path.abspath('../../../../')

# Import local modules. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
modules_dir = os.path.join(wdir, 'samples/python/modules/')
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

from IPython.display import display
import os
from dotenv import dotenv_values
import base64
import io
import json
import pandas as pd
from openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from concurrent.futures import ThreadPoolExecutor
from pdf2image import convert_from_bytes

from samples.app_settings import AppSettings
from samples.utils.stopwatch import Stopwatch

from samples.models.classification import Classifications
from samples.utils.document_intelligence_result_parser import parse_document_fields
from samples.utils.custom_json_encoder import CustomJsonEncoder
from samples.confidence.document_intelligence_confidence import evaluate_confidence as evaluate_di_confidence
from samples.evaluation.accuracy_evaluator import AccuracyEvaluator
from samples.evaluation.comparison import get_extraction_comparison

In [2]:
# Application settings are read from the .env file located directly under `wdir`.
env_file = Path(wdir) / '.env'
app_settings = AppSettings(dotenv_values(env_file))

# Scenario identity and the folder this scenario writes its debug artefacts to.
scenario_name = "us-tax-1040-extraction"
scenario_path = Path(wdir) / 'samples/python/scenarios/us_tax/'

# Configure the default credential for accessing Azure services. Every credential
# type listed below is switched off; the Azure CLI credential is not excluded,
# so a signed-in Azure CLI session can be used for authentication.
credential_exclusions = {
    "exclude_workload_identity_credential": True,
    "exclude_developer_cli_credential": True,
    "exclude_environment_credential": True,
    "exclude_managed_identity_credential": True,
    "exclude_powershell_credential": True,
    "exclude_shared_token_cache_credential": True,
    "exclude_interactive_browser_credential": True,
}
credential = DefaultAzureCredential(**credential_exclusions)

# Bearer-token provider for the Cognitive Services scope, used by the OpenAI client.
openai_token_provider = get_bearer_token_provider(
    credential,
    'https://cognitiveservices.azure.com/.default',
)

# Azure OpenAI client authenticated through the token provider above.
openai_client = AzureOpenAI(
    api_version=app_settings.azure_openai_api_version,
    azure_endpoint=app_settings.azure_openai_endpoint,
    azure_ad_token_provider=openai_token_provider,
)

# Document Intelligence client authenticated with the same credential.
document_intelligence_client = DocumentIntelligenceClient(
    credential=credential,
    endpoint=app_settings.azure_ai_services_endpoint,
)

In [3]:
# Folder holding the sample assets, and the metadata file describing the
# document to evaluate together with its expected results.
inputs_path = Path(os.path.join(wdir, 'samples/assets/us_tax/'))
metadata_fname = "us-tax-1040.json" # Change this to the file you want to evaluate
metadata_fpath = Path(os.path.join(inputs_path, metadata_fname))

# Pass the encoding explicitly so the JSON file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(metadata_fpath, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Expected results are the values of every top-level key ending in '_expected',
# kept in the order the keys appear in the metadata file.
expected_outputs = [
    value
    for key, value in metadata.items() if key.endswith('_expected')
]

# The metadata names the PDF to process; it lives alongside the metadata file.
pdf_fname = metadata['fname']
pdf_fpath = Path(os.path.join(inputs_path, pdf_fname))

evaluator = AccuracyEvaluator()

In [4]:
# Accumulates one `image_url` content block per PDF page (filled further down in this cell).
page_images: list[dict] = []

def encode_page(page) -> dict:
    """Turn a page image into an `image_url` content block.

    The page is written as PNG into an in-memory buffer, base64-encoded and
    embedded in a `data:` URL.

    Args:
        page: An object exposing `save(fp, format=...)`; in this notebook, the
            page images returned by `convert_from_bytes`.

    Returns:
        A dict shaped as
        `{"type": "image_url", "image_url": {"url": "data:image/png;base64,<data>"}}`.
    """
    buffer = io.BytesIO()
    page.save(buffer, format='PNG')

    png_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    data_url = f"data:image/png;base64,{png_b64}"

    return {"type": "image_url", "image_url": {"url": data_url}}

# Time the whole image pre-processing step: reading the PDF, converting it into
# page images, and encoding every page.
with Stopwatch() as image_stopwatch:
    with open(pdf_fpath, "rb") as f:
        document_bytes = f.read()

    pages = convert_from_bytes(document_bytes)

    # Encode the pages concurrently on a pool of worker threads; `map` returns
    # the encoded pages in the same order as `pages`.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(encode_page, pages))

    page_images.extend(results)

## Classify

In [5]:
# Catalogue of the document types the classifier may assign. Written as
# (classification, description) pairs and expanded into the
# {"classification": ..., "description": ...} records that are serialised into
# the classification system prompt.
classifications = [
    {"classification": name, "description": description}
    for name, description in (
        ("US Tax Form 1040", "A US individual income tax return form, used to report annual income tax return filed by citizens or residents of the United States."),
        ("US Tax Form 1040 - Schedule 1", "A form used to report additional income and adjustments to income."),
        ("US Tax Form 1040 - Schedule 2", "A form used to report additional taxes."),
        ("US Tax Form 1040 - Schedule 3", "A form used to report nonrefundable credits."),
        ("US Tax Form 1040 - Schedule A", "A form used to report itemized deductions for individual income tax returns."),
        ("US Tax Form 1040 - Schedule B", "A form used to report interest and ordinary dividends."),
        ("US Tax Form 1040 - Schedule C", "A form used to report income or loss from a business operated or a profession practiced as a sole proprietor."),
        ("US Tax Form 1040 - Schedule D", "A form used to report capital gains and losses."),
        ("US Tax Form 1040 - Schedule E", "A form used to report income or loss from rental real estate, royalties, partnerships, S corporations, estates, trusts, etc."),
        ("US Tax Form 1040 - Schedule EIC", "A form used to claim the Earned Income Credit (EIC)."),
        ("US Tax Form 1040 - Schedule F", "A form used to report profit or loss from farming."),
        ("US Tax Form 1040 - Schedule H", "A form used to report household employment taxes."),
        ("US Tax Form 1040 - Schedule J", "A form used to figure the tax on a qualified farmer's or fisherman's income."),
        ("US Tax Form 1040 - Schedule R", "A form used to claim the Credit for the Elderly or the Disabled."),
        ("US Tax Form 1040 - Schedule SE", "A form used to calculate self-employment tax."),
        ("US Tax Form 1040 - Schedule 8812", "A form used to claim the Child Tax Credit and Credit for Other Dependents."),
    )
]

In [6]:
# Lookup from classification name to the Azure AI Document Intelligence model ID
# used to analyse documents of that classification.
classification_model_map = dict([
    ("US Tax Form 1040", "prebuilt-tax.us.1040"),
    ("US Tax Form 1040 - Schedule 1", "prebuilt-tax.us.1040Schedule1"),
    ("US Tax Form 1040 - Schedule 2", "prebuilt-tax.us.1040Schedule2"),
    ("US Tax Form 1040 - Schedule 3", "prebuilt-tax.us.1040Schedule3"),
    ("US Tax Form 1040 - Schedule A", "prebuilt-tax.us.1040ScheduleA"),
    ("US Tax Form 1040 - Schedule B", "prebuilt-tax.us.1040ScheduleB"),
    ("US Tax Form 1040 - Schedule C", "prebuilt-tax.us.1040ScheduleC"),
    ("US Tax Form 1040 - Schedule D", "prebuilt-tax.us.1040ScheduleD"),
    ("US Tax Form 1040 - Schedule E", "prebuilt-tax.us.1040ScheduleE"),
    ("US Tax Form 1040 - Schedule EIC", "prebuilt-tax.us.1040ScheduleEIC"),
    ("US Tax Form 1040 - Schedule F", "prebuilt-tax.us.1040ScheduleF"),
    ("US Tax Form 1040 - Schedule H", "prebuilt-tax.us.1040ScheduleH"),
    ("US Tax Form 1040 - Schedule J", "prebuilt-tax.us.1040ScheduleJ"),
    ("US Tax Form 1040 - Schedule R", "prebuilt-tax.us.1040ScheduleR"),
    ("US Tax Form 1040 - Schedule SE", "prebuilt-tax.us.1040ScheduleSE"),
    ("US Tax Form 1040 - Schedule 8812", "prebuilt-tax.us.1040Schedule8812"),
])

In [7]:
# System prompt: the task statement followed by the classification catalogue
# serialised as JSON under a "## Classifications" heading. The trailing empty
# element makes the prompt end with a newline.
classify_system_prompt = "\n".join([
    "You are an AI assistant that helps detect the boundaries of sub-section or sub-documents using the provided classifications.",
    "",
    "## Classifications",
    "",
    json.dumps(classifications),
    "",
])

In [8]:
# Content blocks of the user message for the classification request; the next cell
# appends the prompt text and the labelled page images to it.
classify_user_content: list[dict] = []

In [9]:
# Instruction text that opens the user message.
classify_user_prompt = """Classify documents that are a US Tax Form in the provided page images.
- A single classification may span multiple page images.
- A single page image may contain multiple classifications.
- If a page image does not contain a classification, ignore it."""

# Build the user content from scratch in this cell. Appending to a list created
# in another cell meant that re-running this cell duplicated the prompt and
# every page image in the request.
classify_user_content = [{
    "type": "text",
    "text": classify_user_prompt
}]

# For each of the page images, add a text content block with the 1-based page
# number, and then another block with the image. The page ranges returned by
# the classifier are interpreted against these 1-based labels.
for i, page_image in enumerate(page_images):
    classify_user_content.append({
        "type": "text",
        "text": f"Page {i + 1}:"
    })
    classify_user_content.append(page_image)

In [10]:
# Assemble the chat messages up front: the system prompt carrying the
# classification catalogue, then the user content (prompt plus labelled page images).
classify_messages = [
    {"role": "system", "content": classify_system_prompt},
    {"role": "user", "content": classify_user_content},
]

# Time the classification request. `Classifications` is supplied as the response
# format so the reply is parsed into that model.
with Stopwatch() as classify_stopwatch:
    classify_completion = openai_client.beta.chat.completions.parse(
        model=app_settings.azure_openai_chat_deployment,
        messages=classify_messages,
        response_format=Classifications,
        max_tokens=4096,
        temperature=0.1,
        top_p=0.1,
        logprobs=True,  # Enabled to determine the confidence of the response.
    )

In [11]:
# Take the structured result parsed from the first choice of the completion.
classify_message = classify_completion.choices[0].message
document_classifications = classify_message.parsed

# `parsed` can be None (for example when the model returns a refusal). Fail here
# with a clear message instead of with an AttributeError in a later cell.
if document_classifications is None:
    classify_refusal = getattr(classify_message, "refusal", None)
    raise ValueError(
        f"The classification request returned no parsed result (refusal: {classify_refusal!r})."
    )

In [12]:
# DEBUG: Pretty-print the parsed classifications as indented JSON
classifications_json = document_classifications.model_dump_json(indent=2)
print(classifications_json)

{
  "classifications": [
    {
      "classification": "US Tax Form 1040",
      "image_range_start": 1,
      "image_range_end": 2
    },
    {
      "classification": "US Tax Form 1040 - Schedule 1",
      "image_range_start": 3,
      "image_range_end": 4
    },
    {
      "classification": "US Tax Form 1040 - Schedule 2",
      "image_range_start": 5,
      "image_range_end": 6
    }
  ]
}


In [13]:
# For each document classification, group the page images together
classified_images = []
classified_images_pil = []

for classification in document_classifications.classifications:
    image_range_start = classification.image_range_start
    image_range_end = classification.image_range_end

    classified_images.append(page_images[image_range_start - 1:image_range_end])
    classified_images_pil.append(pages[image_range_start - 1:image_range_end])

## Extract

In [14]:
# Extract the fields of every classified sub-document with the matching Azure AI
# Document Intelligence model, then report accuracy, confidence and timings.
for i, (classification, images_pil) in enumerate(
    zip(document_classifications.classifications, classified_images_pil)
):
    display(f"Classification {i} - {classification.classification}")

    # Re-assemble the pages of this classification into a single in-memory PDF.
    pdf_bytes = io.BytesIO()
    images_pil[0].save(pdf_bytes, format='PDF', resolution=100.0, save_all=True, append_images=images_pil[1:])
    pdf_bytes.seek(0)

    # DEBUG: Save the PDF into the scenario folder.
    # A dedicated name is used so the input `pdf_fpath` defined earlier in the
    # notebook is not overwritten; otherwise re-running the image pre-processing
    # cell would read this debug file instead of the source document.
    classified_pdf_fpath = Path(os.path.join(scenario_path, f"{pdf_fname}_{classification.classification}.pdf"))
    with open(classified_pdf_fpath, "wb") as f:
        # getvalue() does not move the stream position, so `pdf_bytes` stays
        # rewound for the upload below.
        f.write(pdf_bytes.getvalue())

    # Use the classification of the document to determine the Azure AI Document Intelligence model to use
    model_id = classification_model_map[classification.classification]

    with Stopwatch() as di_stopwatch:
        poller = document_intelligence_client.begin_analyze_document(
            model_id=model_id,
            body=pdf_bytes,
            content_type="application/pdf"
        )

        result: AnalyzeResult = poller.result()

    # Fail with a clear message when the analysis produced no documents, rather
    # than with an opaque indexing error.
    if not result.documents:
        raise ValueError(
            f"Model '{model_id}' returned no documents for '{classification.classification}'."
        )

    doc_result = result.documents[0].fields

    doc_result_dict = parse_document_fields(doc_result)

    # NOTE(review): expected outputs are paired with classifications by position,
    # which assumes the classifier returns documents in the same order as the
    # '*_expected' keys appear in the metadata file - confirm.
    expected_dict = expected_outputs[i] if i < len(expected_outputs) else None

    # DEBUG: Display the document intelligence content
    doc_result_json = json.dumps(doc_result_dict, indent=2)
    print(doc_result_json)

    if expected_dict is not None:
        accuracy = evaluator.evaluate(expected=expected_dict, actual=doc_result_dict)
    else:
        accuracy = None

    confidence = evaluate_di_confidence(doc_result_dict, result)

    # DEBUG: Gets the total execution time of the data extraction process.
    total_elapsed = classify_stopwatch.elapsed + di_stopwatch.elapsed + image_stopwatch.elapsed

    # DEBUG: Save both the original doc_result and the parsed doc_result to a file
    di_result_fpath = Path(os.path.join(scenario_path, f"{pdf_fname}_{classification.classification}_original.json"))
    with open(di_result_fpath, "w", encoding="utf-8") as f:
        json.dump(doc_result, f, indent=2, cls=CustomJsonEncoder)

    # The parsed fields were already serialised above with the same settings, so
    # write that text instead of parsing the result a second time.
    di_result_parsed_fpath = Path(os.path.join(scenario_path, f"{pdf_fname}_{classification.classification}_parsed.json"))
    with open(di_result_parsed_fpath, "w", encoding="utf-8") as f:
        f.write(doc_result_json)

    display_accuracy = f"{accuracy['overall'] * 100:.2f}%" if accuracy is not None else "N/A"

    # DEBUG: Display the outputs of the data extraction process.
    df = pd.DataFrame([
        {
            "Document": pdf_fname,
            "Index": i,
            "Accuracy": display_accuracy,
            "Confidence": f"{confidence['_overall'] * 100:.2f}%",
            "Execution Time": f"{total_elapsed:.2f} seconds",
            "Classify Execution Time": f"{classify_stopwatch.elapsed:.2f} seconds",
            "Document Intelligence Execution Time": f"{di_stopwatch.elapsed:.2f} seconds",
            "Image Pre-processing Execution Time": f"{image_stopwatch.elapsed:.2f} seconds"
        }
    ])

    display(df)

    # The field-by-field comparison is only available when expected values exist.
    if accuracy is not None:
        display(get_extraction_comparison(expected_dict, doc_result_dict, confidence, accuracy['accuracy']))

'Classification 0 - US Tax Form 1040'

{
  "TaxYear": "2024",
  "Taxpayer": {
    "SSN": "400-12-3456",
    "LastName": "Sample",
    "FirstNameAndInitials": "John Q.",
    "Address": {
      "houseNumber": "123",
      "road": "Maple Drive",
      "postalCode": "62704",
      "city": "Springfield",
      "state": "IL",
      "streetAddress": "123 Maple Drive"
    },
    "ForeignCountryName": "",
    "ForeignProvinceStateOrCounty": "",
    "ForeignPostalCode": ""
  },
  "Spouse": {
    "SSN": "400-65-4321",
    "LastName": "Sample",
    "FirstNameAndInitials": "Jane Q."
  },
  "Dependents": [
    {
      "Name": "Jack Sample",
      "SSN": "400-00-0001",
      "RelationshipToFiler": "Son",
      "CreditType": [
        "ChildTaxCredit"
      ]
    },
    {
      "Name": "Jill Sample",
      "SSN": "400-00-0002",
      "RelationshipToFiler": "Daughter",
      "CreditType": [
        "ChildTaxCredit"
      ]
    },
    {
      "Name": "",
      "SSN": "",
      "RelationshipToFiler": "",
      "CreditType": []
    },
    {
  

Unnamed: 0,Document,Index,Accuracy,Confidence,Execution Time,Classify Execution Time,Document Intelligence Execution Time,Image Pre-processing Execution Time
0,us-tax-1040.pdf,0,100.00%,92.36%,18.15 seconds,6.34 seconds,10.98 seconds,0.83 seconds


Unnamed: 0,Field,Expected,Extracted,Confidence,Accuracy
0,Box10,10232,10232,,Match
1,Box11,156708,156708,,Match
2,Box12,29200,29200,,Match
3,Box13,,,,Match
4,Box14,29200,29200,,Match
5,Box15,127508,127508,,Match
6,Box16,18667,18667,,Match
7,Box16OtherFormNumber,,,,Match
8,Box17,0,0,,Match
9,Box18,18667,18667,,Match


'Classification 1 - US Tax Form 1040 - Schedule 1'

{
  "TaxYear": "2024",
  "Taxpayer": {
    "SSN": "400-12-3456",
    "Name": "John Q. Sample"
  },
  "Box1": 0,
  "Box2a": 0,
  "Box2b": "",
  "Box3": 5400,
  "Box4": 0,
  "Box5": 0,
  "Box6": 0,
  "Box7": 0,
  "Box8a": 0,
  "Box8b": 0,
  "Box8c": 0,
  "Box8d": 0,
  "Box8e": 0,
  "Box8f": 0,
  "Box8g": 0,
  "Box8h": 0,
  "Box8i": 0,
  "Box8j": 0,
  "Box8k": 0,
  "Box8l": 0,
  "Box8m": 0,
  "Box8n": 0,
  "Box8o": 0,
  "Box8p": 0,
  "Box8q": 0,
  "Box8r": 0,
  "Box8s": 0,
  "Box8t": 0,
  "Box8u": 0,
  "Box8z": 0,
  "Box9": 0,
  "Box10": 5400,
  "Box11": 0,
  "Box12": 0,
  "Box13": 3850,
  "Box14": 0,
  "Box15": 382,
  "Box16": 0,
  "Box17": 0,
  "Box18": 0,
  "Box19a": 0,
  "Box19b": "",
  "Box19c": "",
  "Box20": 6000,
  "Box21": 0,
  "Box23": 0,
  "Box24a": 0,
  "Box24b": 0,
  "Box24d": 0,
  "Box24e": 0,
  "Box24f": 0,
  "Box24g": 0,
  "Box24h": 0,
  "Box24i": 0,
  "Box24j": 0,
  "Box24k": 0,
  "Box24z": 0,
  "Box25": 10232,
  "Box26": 0
}


Unnamed: 0,Document,Index,Accuracy,Confidence,Execution Time,Classify Execution Time,Document Intelligence Execution Time,Image Pre-processing Execution Time
0,us-tax-1040.pdf,1,98.97%,98.20%,13.84 seconds,6.34 seconds,6.67 seconds,0.83 seconds


Unnamed: 0,Field,Expected,Extracted,Confidence,Accuracy
0,Box1,0,0,,Match
1,Box10,5400,5400,,Match
2,Box11,0,0,,Match
3,Box12,0,0,,Match
4,Box13,3850,3850,,Match
5,Box14,0,0,,Match
6,Box15,382,382,99.80%,Match
7,Box16,0,0,,Match
8,Box17,0,0,,Match
9,Box18,0,0,,Match


'Classification 2 - US Tax Form 1040 - Schedule 2'

{
  "TaxYear": "2024",
  "Taxpayer": {
    "SSN": "400-12-3456",
    "Name": "John Q. Sample"
  },
  "Box1": "",
  "Box2": 0,
  "Box3": 0,
  "Box4": 763,
  "Box5": 0,
  "Box6": 0,
  "Box7": 0,
  "Box8Checkbox": false,
  "Box8": 0,
  "Box9": 0,
  "Box10": 0,
  "Box11": 0,
  "Box12": 0,
  "Box13": 0,
  "Box14": 0,
  "Box15": 0,
  "Box16": 0,
  "Box17aExtraInfo": "",
  "Box17a": 0,
  "Box17b": 0,
  "Box17c": 0,
  "Box17d": 0,
  "Box17e": 0,
  "Box17f": 0,
  "Box17g": 0,
  "Box17h": 0,
  "Box17i": 0,
  "Box17j": 0,
  "Box17k": 0,
  "Box17l": 0,
  "Box17m": 0,
  "Box17n": 0,
  "Box17o": 0,
  "Box17p": 0,
  "Box17q": 0,
  "Box17z": 0,
  "Box18": 0,
  "Box20": 0,
  "Box21": 763
}


Unnamed: 0,Document,Index,Accuracy,Confidence,Execution Time,Classify Execution Time,Document Intelligence Execution Time,Image Pre-processing Execution Time
0,us-tax-1040.pdf,2,99.15%,98.95%,17.40 seconds,6.34 seconds,10.23 seconds,0.83 seconds


Unnamed: 0,Field,Expected,Extracted,Confidence,Accuracy
0,Box1,,,,Match
1,Box10,0,0,,Match
2,Box11,0,0,,Match
3,Box12,0,0,,Match
4,Box13,0,0,,Match
5,Box14,0,0,,Match
6,Box15,0,0,,Match
7,Box16,0,0,,Match
8,Box17a,0,0,,Match
9,Box17aExtraInfo,,,,Match
