# 1. Imports

In [None]:
import os
from typing import Iterator, MutableSequence, Optional, Sequence, Tuple
import google.cloud.documentai_v1 as docai
from tabulate import tabulate

In [2]:
from dotenv import load_dotenv

# 2. Setting Project ID

**Note:** You need to authenticate and make sure the same PROJECT ID is used throughout. See the [Google Cloud Docs](https://codelabs.developers.google.com/codelabs/cloud-documentai-manage-processors-python#1).

In [3]:
# Load variables from a local .env file (if one exists) before reading them.
# load_dotenv was imported above but never invoked, so a .env file was ignored.
load_dotenv()

# Google Cloud project and Document AI location used by every API call below.
# Both fall back to an empty string when the variable is not set.
PROJECT_ID = os.getenv("PROJECT_ID", "")
API_LOCATION = os.getenv("API_LOCATION", "")

In [4]:
# Display names of the two processors this notebook creates for testing.
document_ocr_display_name = "document-ocr"
form_parser_display_name = "form-parser"

# (display name, processor type) pairs, in creation order.
test_processor_display_names_and_types = tuple(
    zip(
        (document_ocr_display_name, form_parser_display_name),
        ("OCR_PROCESSOR", "FORM_PARSER_PROCESSOR"),
    )
)


In [5]:
def get_client() -> docai.DocumentProcessorServiceClient:
    """Build a Document AI client pointed at the endpoint for API_LOCATION."""
    endpoint = f"{API_LOCATION}-documentai.googleapis.com"
    return docai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": endpoint}
    )

def get_parent(client: docai.DocumentProcessorServiceClient) -> str:
    """Return the location path built from PROJECT_ID and API_LOCATION."""
    location_path = client.common_location_path(PROJECT_ID, API_LOCATION)
    return location_path

def get_client_and_parent() -> Tuple[docai.DocumentProcessorServiceClient, str]:
    """Create a client and pair it with the parent location path derived from it."""
    docai_client = get_client()
    return docai_client, get_parent(docai_client)


In [6]:
def fetch_processor_types() -> MutableSequence[docai.ProcessorType]:
    """Fetch the processor types reported for the configured project location."""
    docai_client, location_path = get_client_and_parent()
    return docai_client.fetch_processor_types(parent=location_path).processor_types


In [7]:
def print_processor_types(processor_types: Sequence[docai.ProcessorType]):
    """Print processor types as a table, followed by a count line.

    Types that allow creation are listed first; remaining ties are ordered by
    category and then by type name.
    """
    ordered = sorted(
        processor_types,
        key=lambda pt: (not pt.allow_creation, pt.category, pt.type_),
    )
    rows = processor_type_tabular_data(ordered)
    # The first two items from the generator are the headers and the column
    # alignment; everything after that is table data.
    header_row = next(rows)
    alignment = next(rows)

    print(tabulate(rows, header_row, tablefmt="pretty", colalign=alignment))
    print(f"→ Processor types: {len(ordered)}")

def processor_type_tabular_data(
    processor_types: Sequence[docai.ProcessorType],
) -> Iterator[Tuple[str, str, str, str]]:
    """Yield table rows for processor types.

    The first item is the header row, the second the column alignment, and the
    rest are one row per processor type. When ``processor_types`` is empty, a
    single placeholder row of dashes is yielded instead of data rows.
    """
    yield ("type", "category", "allow_creation", "locations")
    yield ("left", "left", "left", "left")

    if processor_types:
        for pt in processor_types:
            yield (
                pt.type_,
                pt.category,
                f"{pt.allow_creation}",
                # Location ids are sorted so the cell text is stable.
                ", ".join(sorted(loc.location_id for loc in pt.available_locations)),
            )
    else:
        yield ("-", "-", "-", "-")


In [8]:
# Fetch the processor types for the configured location and print them as a table.
processor_types = fetch_processor_types()
print_processor_types(processor_types)


+--------------------------------------+-------------+----------------+---------------------------------------------------------------------------------------------------------------------------+
| type                                 | category    | allow_creation | locations                                                                                                                 |
+--------------------------------------+-------------+----------------+---------------------------------------------------------------------------------------------------------------------------+
| CUSTOM_CLASSIFICATION_PROCESSOR      | CUSTOM      | True           | asia-south1, asia-southeast1, australia-southeast1, eu, europe-west2, europe-west3, northamerica-northeast1, us           |
| CUSTOM_EXTRACTION_PROCESSOR          | CUSTOM      | True           | asia-south1, asia-southeast1, australia-southeast1, eu, europe-west2, europe-west3, northamerica-northeast1, us, us-east7 |
| CUSTOM_SPLITTING_P

In [16]:
def create_processor(display_name: str, type: str) -> docai.Processor:
    """Create a processor of the given type under the configured location.

    Args:
        display_name: Display name for the new processor.
        type: Processor type identifier, e.g. "OCR_PROCESSOR".

    Returns:
        Whatever the client's ``create_processor`` call returns.
    """
    docai_client, location_path = get_client_and_parent()
    return docai_client.create_processor(
        parent=location_path,
        processor=docai.Processor(display_name=display_name, type_=type),
    )

In [17]:
# Create each test processor. A failure is printed rather than raised so that
# one failing creation does not stop the remaining ones.
separator = "=" * 80
# The loop variable is `processor_type`, not `type`: at notebook top level a
# loop variable named `type` stays bound after the cell finishes and shadows
# the builtin `type` for every later cell.
for display_name, processor_type in test_processor_display_names_and_types:
    print(separator)
    print(f"Creating {display_name} ({processor_type})...")
    try:
        create_processor(display_name, processor_type)
    except Exception as err:
        print(err)
print(separator)
print("Done")


Creating document-ocr (OCR_PROCESSOR)...
Creating form-parser (FORM_PARSER_PROCESSOR)...
Done


In [9]:
def list_processors() -> MutableSequence[docai.Processor]:
    """List the processors under the configured project location.

    Returns:
        A list of every processor yielded by the ``list_processors`` call.
    """
    client, parent = get_client_and_parent()
    # list_processors() returns a pager. Reading `.processors` off the pager
    # only exposes the page that is currently loaded, so iterate the pager
    # itself to follow pagination (same approach as list_processor_versions).
    pager = client.list_processors(parent=parent)

    return list(pager)

def print_processors(processors: Optional[Sequence[docai.Processor]] = None):
    """Print processors as a table sorted by display name, then a count line.

    Args:
        processors: Processors to show. When None, they are fetched with
            ``list_processors()``.
    """
    to_show = list_processors() if processors is None else processors
    ordered = sorted(to_show, key=lambda p: p.display_name)
    rows = processor_tabular_data(ordered)
    # Headers first, then column alignment; the remaining items are data rows.
    header_row = next(rows)
    alignment = next(rows)

    print(tabulate(rows, header_row, tablefmt="pretty", colalign=alignment))
    print(f"→ Processors: {len(ordered)}")

def processor_tabular_data(
    processors: Sequence[docai.Processor],
) -> Iterator[Tuple[str, str, str]]:
    """Yield table rows for processors.

    The first item is the header row, the second the column alignment, and the
    rest are one row per processor. When ``processors`` is empty, a single
    placeholder row of dashes is yielded instead of data rows.
    """
    yield ("display_name", "type", "state")
    yield ("left", "left", "left")

    if processors:
        for item in processors:
            yield (item.display_name, item.type_, item.state.name)
    else:
        yield ("-", "-", "-")
        


In [10]:
# List the existing processors once and keep them in `processors`;
# the get_processor(...) call in a later cell reuses this list.
processors = list_processors()
print_processors(processors)


+----------------------+-----------------------+---------+
| display_name         | type                  | state   |
+----------------------+-----------------------+---------+
| document-ocr         | OCR_PROCESSOR         | ENABLED |
| form-parser          | FORM_PARSER_PROCESSOR | ENABLED |
| indic_test_processor | OCR_PROCESSOR         | ENABLED |
+----------------------+-----------------------+---------+
→ Processors: 3


In [11]:
def get_processor(
    display_name: str,
    processors: Optional[Sequence[docai.Processor]] = None,
) -> Optional[docai.Processor]:
    """Return the first processor whose display name matches, or None.

    Args:
        display_name: Display name to look for (exact match).
        processors: Processors to search. When None, they are fetched with
            ``list_processors()``.
    """
    candidates = list_processors() if processors is None else processors
    matches = (p for p in candidates if p.display_name == display_name)
    return next(matches, None)
    


In [12]:
# Look the processor up by display name in the list fetched earlier.
processor = get_processor("indic_test_processor", processors)

# get_processor returns None when no display name matches, so stop here in that case.
assert processor is not None
print(processor)

name: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7"
type_: "OCR_PROCESSOR"
display_name: "indic_test_processor"
state: ENABLED
process_endpoint: "https://us-documentai.googleapis.com/v1/projects/781576483146/locations/us/processors/5945bfe7932ca5b7:process"
create_time {
  seconds: 1751212231
  nanos: 698775000
}
default_processor_version: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7/processorVersions/pretrained-ocr-v2.1-2024-08-07"
processor_version_aliases {
  alias: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7/processorVersions/pretrained"
  processor_version: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7/processorVersions/pretrained-ocr-v1.0-2020-09-23"
}
processor_version_aliases {
  alias: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7/processorVersions/pretrained-next"
  processor_version: "projects/781576483146/locations/us/processors/5945bfe7932ca5b7/processorVersions/pretrained-ocr-v1.

In [13]:
def process_file(
    processor: docai.Processor,
    file_path: str,
    mime_type: str,
) -> docai.Document:
    """Send a local file to a processor and return the resulting document.

    Args:
        processor: Processor whose ``name`` is used as the request target.
        file_path: Path of the file to read in binary mode.
        mime_type: MIME type passed along with the file content.

    Returns:
        The ``document`` attribute of the process response.
    """
    docai_client = get_client()
    with open(file_path, "rb") as fh:
        raw_bytes = fh.read()

    process_request = docai.ProcessRequest(
        raw_document=docai.RawDocument(content=raw_bytes, mime_type=mime_type),
        name=processor.name,
    )
    return docai_client.process_document(process_request).document

In [14]:
# Run a sample PDF through the "indic_test_processor" processor.
# NOTE(review): get_processor returns None when no display name matches; there is
# no check here, so process_file would then fail on `processor.name`.
processor_obj = get_processor("indic_test_processor")
# Path is relative to the directory the notebook is run from.
fp = "../../pdf_samples/Winnie_the_Pooh_3_Pages.pdf"
mime_type = "application/pdf"
parsed_doc = process_file(processor_obj,fp, mime_type)

In [15]:
# Show the extracted text on a single line by turning every newline into a space.
print(parsed_doc.text.replace("\n", " "))

CHAPTER I IN WHICH We Are Introduced to Winnie-the-Pooh and Some Bees, and the Stories Begin HERE LERE is Edward Bear, coming downstairs now, bump, bump, bump, on the back of his head, behind Christopher Robin. It is, as far as he knows, the only way of coming downstairs, but sometimes he feels that there really is another way, if only he could stop bumping for a moment and think of it. And then he feels that perhaps there isn't. Anyhow, here he is at the bottom, and ready to be introduced to you. Winnie-the-Pooh. When I first heard his name, I said, just as you are going to say, "But I thought he was a boy?" "So did I," said Christopher Robin. "Then you can't call him Winnie?" 6. "I don't." "But you said--" "He's Winnje-ther-Pooh. Don't you know what 'ther' means?" I Digitized by Google 2 WINNIE-THE-POOH "Ah, yes, now I do," I said quickly; and I hope you do too, because it is all the explanation you are going to get. Sometimes Winnie-the-Pooh likes a game of some sort when he comes d

In [16]:
# NOTE(review): per the recorded output, this evaluates to the nested
# `Document.DocumentLayout` class itself, not layout data for this document —
# confirm this is the intended inspection.
parsed_doc.DocumentLayout

google.cloud.documentai_v1.types.document.Document.DocumentLayout

In [17]:
def list_processor_versions(
    processor: docai.Processor,
) -> MutableSequence[docai.ProcessorVersion]:
    """Return the versions of ``processor`` as a list.

    The response of ``list_processor_versions`` is iterated to completion and
    its items collected.
    """
    docai_client = get_client()
    version_iter = docai_client.list_processor_versions(parent=processor.name)
    return [version for version in version_iter]


def get_sorted_processor_versions(
    processor: docai.Processor,
) -> MutableSequence[docai.ProcessorVersion]:
    """Return the versions of ``processor`` sorted by their ``name`` attribute."""
    return sorted(list_processor_versions(processor), key=lambda v: v.name)


def print_processor_versions(processor: docai.Processor):
    """Print the versions of ``processor`` as a table, then a count line.

    The row whose name equals ``processor.default_processor_version`` is
    marked as the default.
    """
    ordered_versions = get_sorted_processor_versions(processor)
    rows = processor_versions_tabular_data(
        ordered_versions,
        processor.default_processor_version,
    )
    # Headers first, then column alignment; the remaining items are data rows.
    header_row = next(rows)
    alignment = next(rows)

    print(tabulate(rows, header_row, tablefmt="pretty", colalign=alignment))
    print(f"→ Processor versions: {len(ordered_versions)}")


def processor_versions_tabular_data(
    versions: Sequence[docai.ProcessorVersion],
    default_version_name: str,
) -> Iterator[Tuple[str, str, str]]:
    """Yield table rows for processor versions.

    The first item is the header row, the second the column alignment, and the
    rest are one row per version. A version whose ``name`` equals
    ``default_version_name`` gets "Y" in the last column. When ``versions`` is
    empty, a single placeholder row of dashes is yielded instead of data rows.
    """
    yield ("version", "display name", "default")
    yield ("left", "left", "left")

    if versions:
        parse_path = docai.DocumentProcessorServiceClient.parse_processor_version_path
        for version in versions:
            # Show only the "processor_version" segment of the full resource name.
            version_id = parse_path(version.name)["processor_version"]
            default_marker = "Y" if version.name == default_version_name else ""
            yield (version_id, version.display_name, default_marker)
    else:
        yield ("-", "-", "-")
        


In [18]:
# Fetch the processor again and show its versions, with the current default marked.
processor = get_processor("indic_test_processor")
print_processor_versions(processor)


+--------------------------------+--------------------------+---------+
| version                        | display name             | default |
+--------------------------------+--------------------------+---------+
| pretrained-ocr-v1.0-2020-09-23 | Google Stable            |         |
| pretrained-ocr-v1.1-2022-09-12 | Google Release Candidate |         |
| pretrained-ocr-v1.2-2022-11-10 | Google Release Candidate |         |
| pretrained-ocr-v2.0-2023-06-02 | Google Stable            |         |
| pretrained-ocr-v2.1-2024-08-07 | Google Release Candidate | Y       |
+--------------------------------+--------------------------+---------+
→ Processor versions: 5


In [19]:
def set_default_processor_version(processor: docai.Processor, version_name: str):
    """Make ``version_name`` the default version of ``processor``.

    Args:
        processor: Processor whose ``name`` identifies what to update.
        version_name: Name of the version to set as default.
    """
    docai_client = get_client()
    default_request = docai.SetDefaultProcessorVersionRequest(
        default_processor_version=version_name,
        processor=processor.name,
    )

    # Wait on the returned operation before handing control back to the caller.
    pending = docai_client.set_default_processor_version(default_request)
    pending.result()

In [20]:
# Set the processor's default to the last version in name-sorted order.
processor = get_processor("indic_test_processor")
versions = get_sorted_processor_versions(processor)

# NOTE(review): "latest" here means last when sorted by `name`, and
# versions[-1] raises IndexError if the list is empty — confirm both are acceptable.
new_version = versions[-1]  # Latest version
set_default_processor_version(processor, new_version.name)


In [21]:
# Re-fetch the processor so `default_processor_version` reflects the change, then show the table again.
processor = get_processor("indic_test_processor")
print_processor_versions(processor)

+--------------------------------+--------------------------+---------+
| version                        | display name             | default |
+--------------------------------+--------------------------+---------+
| pretrained-ocr-v1.0-2020-09-23 | Google Stable            |         |
| pretrained-ocr-v1.1-2022-09-12 | Google Release Candidate |         |
| pretrained-ocr-v1.2-2022-11-10 | Google Release Candidate |         |
| pretrained-ocr-v2.0-2023-06-02 | Google Stable            |         |
| pretrained-ocr-v2.1-2024-08-07 | Google Release Candidate | Y       |
+--------------------------------+--------------------------+---------+
→ Processor versions: 5


In [22]:
# NOTE(review): scratch arithmetic with no stated meaning or link to the cells above — label it or remove it.
600 * 82

49200