In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
endpoint = os.environ["FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["FORM_RECOGNIZER_KEY"]

In [3]:
model_id = "20231109_v1"
path_to_sample_documents = "./2311.05490_TEST.pdf"

In [4]:
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [5]:
with open(path_to_sample_documents, "rb") as f:
    poller = document_analysis_client.begin_analyze_document(
        model_id, document=f
    )
result = poller.result()

In [6]:
for idx, document in enumerate(result.documents):
    print("--------Analyzing document #{}--------".format(idx + 1))
    print("Document has type {}".format(document.doc_type))
    print("Document has confidence {}".format(document.confidence))
    print("Document was analyzed by model with ID {}".format(result.model_id))
    for name, field in document.fields.items():
        field_value = field.value if field.value else field.content
        print("......found field of type '{}' with value '{}' and with confidence {}".format(field.value_type, field_value, field.confidence))

--------Analyzing document #1--------
Document has type 20231109_v1
Document has confidence 0.021
Document was analyzed by model with ID 20231109_v1
......found field of type 'string' with value 'General Policies, Subgoal Structure, and Planning Width' and with confidence 0.412
......found field of type 'string' with value 'Blai Bonet BONETBLAI@GMAIL.COM Universitat Pompeu Fabra, Spain Hector Geffner HECTOR.GEFFNER@ML.RWTH-AACHEN.DE RWTH Aachen University, Germany Linköping University, Sweden' and with confidence 0.379
......found field of type 'string' with value 'Abstract It has been observed that many classical planning domains with atomic goals can be solved by means of a simple polynomial exploration procedure, called IW, that runs in time exponential in the problem width, which in these cases is bounded and small. Yet, while the notion of width has become part of state-of-the-art planning algorithms such as BFWS, there is no good explanation for why so many benchmark domains have

In [7]:
for page in result.pages:
    print("\nLines found on page {}".format(page.page_number))
    for line in page.lines:
        print("...Line '{}'".format(line.content.encode('utf-8')))
    for word in page.words:
        print(
            "...Word '{}' has a confidence of {}".format(
                word.content.encode('utf-8'), word.confidence
            )
        )
    for selection_mark in page.selection_marks:
        print(
            "...Selection mark is '{}' and has a confidence of {}".format(
                selection_mark.state, selection_mark.confidence
            )
        )


Lines found on page 1
...Line 'b'General Policies, Subgoal Structure, and Planning Width''
...Line 'b'Blai Bonet''
...Line 'b'Universitat Pompeu Fabra, Spain''
...Line 'b'Hector Geffner''
...Line 'b'RWTH Aachen University, Germany''
...Line 'b'Link\xc3\xb6ping University, Sweden''
...Line 'b'BONETBLAI@GMAIL.COM''
...Line 'b'HECTOR.GEFFNER@ML.RWTH-AACHEN.DE''
...Line 'b'Abstract''
...Line 'b'It has been observed that many classical planning domains with atomic goals can be''
...Line 'b'solved by means of a simple polynomial exploration procedure, called IW, that runs in''
...Line 'b'time exponential in the problem width, which in these cases is bounded and small. Yet,''
...Line 'b'while the notion of width has become part of state-of-the-art planning algorithms such as''
...Line 'b'BFWS, there is no good explanation for why so many benchmark domains have bounded''
...Line 'b'width when atomic goals are considered. In this work, we address this question by relating''
...Line 'b'bounded 

In [8]:
for i, table in enumerate(result.tables):
    print("\nTable {} can be found on page:".format(i + 1))
    for region in table.bounding_regions:
        print("...{}".format(i + 1, region.page_number))
    for cell in table.cells:
        print(
            "...Cell[{}][{}] has content '{}'".format(
                cell.row_index, cell.column_index, cell.content.encode('utf-8')
            )
        )
print("-----------------------------------")

-----------------------------------
