# Compare content extraction and field extraction

## Create Azure AI Content Understanding Client

> The [AzureContentUnderstandingClient](python/content_understanding_client.py) is a utility class containing functions to interact with the Content Understanding API. 


Until the official Content Understanding SDK is released, this class can be used as a lightweight SDK.


In [1]:
import logging
import json
import os
import sys
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

In [2]:
# Load settings from the .env file located by find_dotenv(); override=True is passed so
# values from that file replace variables already set in the environment.
load_dotenv(override=True, dotenv_path=find_dotenv())

True

In [3]:
# Configure root logging at INFO level; the azure.identity and content_understanding_client
# log lines shown in the cell outputs of this notebook are emitted at INFO.
logging.basicConfig(level=logging.INFO)

In [4]:
# Read the Content Understanding settings from the environment.
AZURE_AI_ENDPOINT = os.getenv("AZURE_CU_ENDPOINT")
# `or` instead of a getenv default: an empty `AZURE_CU_API_VERSION=` entry also falls back
# to the default version rather than producing an empty string.
AZURE_AI_API_VERSION = os.getenv("AZURE_CU_API_VERSION") or "2024-12-01-preview"

# Flag a missing endpoint here, where the cause is obvious, instead of letting a None
# endpoint travel on to the client.
if not AZURE_AI_ENDPOINT:
    logging.warning(
        "AZURE_CU_ENDPOINT is not set; define it in your .env file or environment before creating the client."
    )

print(f"Current Azure Content Understanding endpoint: {AZURE_AI_ENDPOINT}")
print(f"Current Azure Content Understanding API version: {AZURE_AI_API_VERSION}")

Current Azure Content Understanding endpoint: https://ep-ai-services.services.ai.azure.com/
Current Azure Content Understanding API version: 2024-12-01-preview


In [5]:
# only if necessary, add the parent directory to the path to use shared modules
# parent_dir = Path(Path.cwd()).parent
# sys.path.append(str(parent_dir))

# import the utility class AzureContentUnderstandingClient, which is a wrapper around the Azure Content Understanding REST API client
from python.content_understanding_client import AzureContentUnderstandingClient

In [7]:
# consider running az login
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")

INFO:azure.identity._credentials.environment:No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS


In [None]:
# As an alternative to DefaultAzureCredential, you can register an app in Entra ID and authenticate with its client secret
# from azure.identity import ClientSecretCredential

# credential = ClientSecretCredential(
#     tenant_id=os.getenv("TENANT_ID"),
#     client_id=os.getenv("CLIENT_ID"),
#     client_secret=os.getenv("CLIENT_SECRET")
# )

# # Compatible token provider
# def token_provider(scopes=None, **kwargs):
#     if scopes is None:
#         scopes = ["https://cognitiveservices.azure.com/.default"] # original value
#     token = credential.get_token(*scopes)
#     return token.token

# # scopes = ["https://cognitiveservices.azure.com/.default"] # original value
# # token = credential.get_token(*scopes)
# # print(token.token)

In [8]:
# Collect the constructor arguments first, then build the client from them.
client_options = dict(
    endpoint=AZURE_AI_ENDPOINT,
    api_version=AZURE_AI_API_VERSION,
    token_provider=token_provider,
    # This header is used for sample usage telemetry; comment out this entry if you want to opt out.
    x_ms_useragent="azure-ai-content-understanding-python/analyzer_management",
)
client = AzureContentUnderstandingClient(**client_options)

# Running this cell prints INFO-level logs about the token acquisition process
# (e.g. Azure CLI vs. ClientSecretCredential), such as:
# INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential
# INFO:azure.identity._internal.get_token_mixin:ClientSecretCredential.get_token succeeded

INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.21.0 Python/3.10.16 (Windows-10-10.0.26100-SP0)'
No body was attached to the request
INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential


## Content extraction

In [9]:
# Identifier and template file for the content-extraction analyzer.
CONTENT_EXTRACTION_ANALYZER_ID = "content-extraction-analyzer"
CONTENT_EXTRACTION_TEMPLATE = "analyzer_templates/my_content_extraction.json"

# Submit the create request, then hand the response to poll_result to obtain the outcome.
response = client.begin_create_analyzer(
    CONTENT_EXTRACTION_ANALYZER_ID,
    analyzer_template_path=CONTENT_EXTRACTION_TEMPLATE,
)
result = client.poll_result(response)

# Pretty-print the returned payload.
print(json.dumps(result, indent=2))

INFO:python.content_understanding_client:Analyzer content-extraction-analyzer create request accepted.
INFO:python.content_understanding_client:Request result is ready after 0.00 seconds.


{
  "id": "086be6c1-ca10-4ac2-ae76-c174233d3c11",
  "status": "Succeeded",
  "result": {
    "analyzerId": "content-extraction-analyzer",
    "description": "Sample invoice analyzer",
    "createdAt": "2025-04-15T14:39:47Z",
    "lastModifiedAt": "2025-04-15T14:39:47Z",
    "config": {
      "returnDetails": true,
      "enableOcr": true,
      "enableLayout": true,
      "enableBarcode": false,
      "enableFormula": false,
      "disableContentFiltering": false
    },
    "fieldSchema": {},
    "status": "ready",
    "scenario": "document"
  }
}


In [10]:
# Run the content-extraction analyzer on the sample invoice PDF.
ce_response = client.begin_analyze(
    CONTENT_EXTRACTION_ANALYZER_ID,
    "assets/docs/invoice-logic-apps-tutorial.pdf",
)
# Poll with a 1-second interval and a one-hour (60 * 60 seconds) timeout.
ce_result = client.poll_result(
    ce_response,
    timeout_seconds=60 * 60,
    polling_interval_seconds=1,
)
# Write the full JSON result to the cell output (no trailing newline).
print(json.dumps(ce_result, indent=2), end="")

INFO:python.content_understanding_client:Analyzing file assets/docs/invoice-logic-apps-tutorial.pdf with analyzer: content-extraction-analyzer
INFO:python.content_understanding_client:Request c77e751a-831d-4c42-b85d-93ab71f57b54 in progress ...
INFO:python.content_understanding_client:Request c77e751a-831d-4c42-b85d-93ab71f57b54 in progress ...
INFO:python.content_understanding_client:Request c77e751a-831d-4c42-b85d-93ab71f57b54 in progress ...
INFO:python.content_understanding_client:Request result is ready after 3.84 seconds.


{
  "id": "c77e751a-831d-4c42-b85d-93ab71f57b54",
  "status": "Succeeded",
  "result": {
    "analyzerId": "content-extraction-analyzer",
    "apiVersion": "2024-12-01-preview",
    "createdAt": "2025-04-15T14:39:51Z",
    "contents": [
      {
        "markdown": "CONTOSO LTD.\n\n\n# INVOICE\n\nContoso Headquarters\n123 456th St\nNew York, NY, 10001\n\nINVOICE: INV-100\n\nINVOICE DATE: 11/15/2019\n\nDUE DATE: 12/15/2019\n\nCUSTOMER NAME: MICROSOFT CORPORATION\n\nSERVICE PERIOD: 10/14/2019 - 11/14/2019\n\nCUSTOMER ID: CID-12345\n\nMicrosoft Corp\n123 Other St,\nRedmond WA, 98052\n\nBILL TO:\n\nMicrosoft Finance\n\n123 Bill St,\n\nRedmond WA, 98052\n\nSHIP TO:\n\nMicrosoft Delivery\n\n123 Ship St,\n\nRedmond WA, 98052\n\nSERVICE ADDRESS:\nMicrosoft Services\n123 Service St,\nRedmond WA, 98052\n\n\n<table>\n<tr>\n<th>SALESPERSON</th>\n<th>P.O. NUMBER</th>\n<th>REQUISITIONER</th>\n<th>SHIPPED VIA</th>\n<th>F.O.B. POINT</th>\n<th>TERMS</th>\n</tr>\n<tr>\n<td></td>\n<td>PO-3333</td>\n<td></

In [11]:
# Destination file for the content-extraction result.
ce_output_path = Path("results/docs/content_extraction_analyzer_invoice.json")

# Create the target folder if it does not exist; it is derived from the file path so the
# two can never drift apart.
ce_output_path.parent.mkdir(parents=True, exist_ok=True)

# Dump the result to the file, with explicit UTF-8 so the output does not depend on the
# platform's default encoding.
with ce_output_path.open("w", encoding="utf-8") as f:
    json.dump(ce_result, f, indent=2)

## Field extraction

In [12]:
# Identifier and template file for the field-extraction analyzer.
FIELD_EXTRACTION_ANALYZER_ID = "field-extraction-analyzer"
FIELD_EXTRACTION_TEMPLATE = "analyzer_templates/my_invoice_simple_analyzer.json"

# Submit the create request, then hand the response to poll_result to obtain the outcome.
response = client.begin_create_analyzer(
    FIELD_EXTRACTION_ANALYZER_ID,
    analyzer_template_path=FIELD_EXTRACTION_TEMPLATE,
)
result = client.poll_result(response)

# Pretty-print the returned payload.
print(json.dumps(result, indent=2))

INFO:python.content_understanding_client:Analyzer field-extraction-analyzer create request accepted.
INFO:python.content_understanding_client:Request 1386850c-5d0c-4ae7-ab2b-cd23a6ddb15d in progress ...
INFO:python.content_understanding_client:Request 1386850c-5d0c-4ae7-ab2b-cd23a6ddb15d in progress ...
INFO:python.content_understanding_client:Request 1386850c-5d0c-4ae7-ab2b-cd23a6ddb15d in progress ...
INFO:python.content_understanding_client:Request result is ready after 6.97 seconds.


{
  "id": "1386850c-5d0c-4ae7-ab2b-cd23a6ddb15d",
  "status": "Succeeded",
  "result": {
    "analyzerId": "field-extraction-analyzer",
    "description": "Sample invoice analyzer",
    "createdAt": "2025-04-15T14:40:36Z",
    "lastModifiedAt": "2025-04-15T14:40:42Z",
    "config": {
      "returnDetails": true,
      "enableOcr": true,
      "enableLayout": true,
      "enableBarcode": false,
      "enableFormula": false,
      "disableContentFiltering": false
    },
    "fieldSchema": {
      "fields": {
        "VendorName": {
          "type": "string",
          "method": "extract",
          "description": "Vendor issuing the invoice"
        },
        "Items": {
          "type": "array",
          "method": "extract",
          "items": {
            "type": "object",
            "properties": {
              "Description": {
                "type": "string",
                "method": "extract",
                "description": "Description of the item"
              },
        

In [13]:
# Run the field-extraction analyzer on the same sample invoice PDF.
fe_response = client.begin_analyze(
    FIELD_EXTRACTION_ANALYZER_ID,
    "assets/docs/invoice-logic-apps-tutorial.pdf",
)
# Poll with a 1-second interval and a one-hour (60 * 60 seconds) timeout.
fe_result = client.poll_result(
    fe_response,
    timeout_seconds=60 * 60,
    polling_interval_seconds=1,
)
# Write the full JSON result to the cell output (no trailing newline).
print(json.dumps(fe_result, indent=2), end="")

INFO:python.content_understanding_client:Analyzing file assets/docs/invoice-logic-apps-tutorial.pdf with analyzer: field-extraction-analyzer
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Request e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7 in progress ...
INFO:python.content_understanding_client:Re

{
  "id": "e26c8d2d-25cc-4cf0-af85-a8f94b1a16b7",
  "status": "Succeeded",
  "result": {
    "analyzerId": "field-extraction-analyzer",
    "apiVersion": "2024-12-01-preview",
    "createdAt": "2025-04-15T14:40:45Z",
    "contents": [
      {
        "markdown": "CONTOSO LTD.\n\n\n# INVOICE\n\nContoso Headquarters\n123 456th St\nNew York, NY, 10001\n\nINVOICE: INV-100\n\nINVOICE DATE: 11/15/2019\n\nDUE DATE: 12/15/2019\n\nCUSTOMER NAME: MICROSOFT CORPORATION\n\nSERVICE PERIOD: 10/14/2019 - 11/14/2019\n\nCUSTOMER ID: CID-12345\n\nMicrosoft Corp\n123 Other St,\nRedmond WA, 98052\n\nBILL TO:\n\nMicrosoft Finance\n\n123 Bill St,\n\nRedmond WA, 98052\n\nSHIP TO:\n\nMicrosoft Delivery\n\n123 Ship St,\n\nRedmond WA, 98052\n\nSERVICE ADDRESS:\nMicrosoft Services\n123 Service St,\nRedmond WA, 98052\n\n\n<table>\n<tr>\n<th>SALESPERSON</th>\n<th>P.O. NUMBER</th>\n<th>REQUISITIONER</th>\n<th>SHIPPED VIA</th>\n<th>F.O.B. POINT</th>\n<th>TERMS</th>\n</tr>\n<tr>\n<td></td>\n<td>PO-3333</td>\n<td></td

In [15]:
# Destination file for the field-extraction result.
fe_output_path = Path("results/docs/field_extraction_analyzer_invoice.json")

# Create the target folder if it does not exist; it is derived from the file path so the
# two can never drift apart.
fe_output_path.parent.mkdir(parents=True, exist_ok=True)

# Dump the result to the file, with explicit UTF-8 so the output does not depend on the
# platform's default encoding.
with fe_output_path.open("w", encoding="utf-8") as f:
    json.dump(fe_result, f, indent=2)

## Clean-up analyzers

In [None]:
# Clean-up: delete the content-extraction analyzer that this notebook created.
client.delete_analyzer(CONTENT_EXTRACTION_ANALYZER_ID)

In [None]:
# Clean-up: delete the field-extraction analyzer that this notebook created.
client.delete_analyzer(FIELD_EXTRACTION_ANALYZER_ID)