/
sample_classify_document.py
128 lines (109 loc) · 5.04 KB
/
sample_classify_document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# coding: utf-8
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""
FILE: sample_classify_document.py
DESCRIPTION:
This sample demonstrates how to classify a document using a trained document classifier.
To learn how to build your custom classifier, see sample_build_classifier.py.
More details on building a classifier and labeling your data can be found here:
https://aka.ms/azsdk/formrecognizer/buildclassifiermodel
USAGE:
python sample_classify_document.py
Set the environment variables with your own values before running the sample:
1) AZURE_FORM_RECOGNIZER_ENDPOINT - the endpoint to your Form Recognizer resource
2) AZURE_FORM_RECOGNIZER_KEY - your Form Recognizer API key
3) CLASSIFIER_ID - the ID of your trained document classifier
-OR-
CLASSIFIER_CONTAINER_SAS_URL - the shared access signature (SAS) URL of your Azure Blob Storage container
with your training files. A document classifier will be built and used to run the sample.
"""
import os
def classify_document(classifier_id):
    """Classify the bundled IRS-1040 sample PDF and print what was found.

    The service endpoint and API key are read from the
    AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY environment
    variables. If the CLASSIFIER_ID environment variable is set, it takes
    precedence over the ``classifier_id`` argument.

    :param classifier_id: ID of the document classifier to use when the
        CLASSIFIER_ID environment variable is not set.
    """
    # The sample document is resolved relative to this file's location so the
    # script works regardless of the current working directory.
    this_file = os.path.abspath(__file__)
    path_to_sample_documents = os.path.abspath(
        os.path.join(this_file, "..", "..", "./sample_forms/forms/IRS-1040.pdf")
    )

    # [START classify_document]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    classifier_id = os.getenv("CLASSIFIER_ID", classifier_id)

    credential = AzureKeyCredential(key)
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=credential
    )

    # The file only needs to stay open while the request is being started.
    with open(path_to_sample_documents, "rb") as sample_file:
        poller = document_analysis_client.begin_classify_document(
            classifier_id, document=sample_file
        )
    result = poller.result()

    print("----Classified documents----")
    for doc in result.documents:
        doc_type = doc.doc_type or 'N/A'
        confidence = doc.confidence
        pages = [region.page_number for region in doc.bounding_regions]
        print(
            f"Found document of type '{doc_type}' with a confidence of {confidence} contained on "
            f"the following pages: {pages}"
        )
    # [END classify_document]
if __name__ == "__main__":
    from azure.core.exceptions import HttpResponseError

    try:
        # Stays None unless a classifier is built below; classify_document()
        # lets the CLASSIFIER_ID environment variable override this value.
        classifier_id = None

        has_training_container = bool(os.getenv("CLASSIFIER_CONTAINER_SAS_URL"))
        has_classifier_id = bool(os.getenv("CLASSIFIER_ID"))

        # Only build a classifier when training data is available and no
        # existing classifier ID was supplied.
        if has_training_container and not has_classifier_id:
            from azure.core.credentials import AzureKeyCredential
            from azure.ai.formrecognizer import (
                DocumentModelAdministrationClient,
                ClassifierDocumentTypeDetails,
                BlobSource,
            )

            endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
            key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
            blob_container_sas_url = os.environ["CLASSIFIER_CONTAINER_SAS_URL"]

            document_model_admin_client = DocumentModelAdministrationClient(
                endpoint=endpoint, credential=AzureKeyCredential(key)
            )

            # One entry per document type, each pointing at its own prefix
            # inside the same training container.
            training_doc_types = {
                "IRS-1040-A": ClassifierDocumentTypeDetails(
                    source=BlobSource(
                        container_url=blob_container_sas_url,
                        prefix="IRS-1040-A/train",
                    )
                ),
                "IRS-1040-D": ClassifierDocumentTypeDetails(
                    source=BlobSource(
                        container_url=blob_container_sas_url,
                        prefix="IRS-1040-D/train",
                    )
                ),
            }
            poller = document_model_admin_client.begin_build_document_classifier(
                doc_types=training_doc_types,
            )
            classifier_id = poller.result().classifier_id

        classify_document(classifier_id)
    except HttpResponseError as error:
        print(
            "For more information about troubleshooting errors, see the following guide: "
            "https://aka.ms/azsdk/python/formrecognizer/troubleshooting"
        )
        # Examples of how to check an HttpResponseError
        # Check by error code:
        inner_error = error.error
        if inner_error is not None:
            error_code = inner_error.code
            if error_code == "InvalidImage":
                print(f"Received an invalid image error: {inner_error}")
            if error_code == "InvalidRequest":
                print(f"Received an invalid request error: {inner_error}")
            # Re-raise the error after printing it
            raise
        # When there is no inner error, the message can be inspected for more information:
        if "Invalid request".casefold() in error.message.casefold():
            print(f"Uh-oh! Seems there was an invalid request: {error}")
        # Re-raise the error
        raise