# Notebook to experiment with Document AI service on GCP

In [14]:
import os
import dotenv
import yaml
import pathlib

from typing import Sequence
from PyPDF2 import PdfReader, PdfWriter

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

In [14]:
# ! gcloud auth login
# ! gcloud auth application-default login

## Get Env variables and load config

In [3]:
dotenv.load_dotenv(dotenv.find_dotenv(".env_dev"))
root = os.environ.get("ROOT")

In [4]:
with open("../ai_assistant/config.yaml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
cfg

{'project': 'deft-weaver-396616',
 'llm': {'model': 'gemini-1.0-pro-002', 'location': 'us-central1'},
 'documentAI': {'processor': 'a39ef57dc4bf59c2',
  'region': 'eu',
  'sync_max_pages': 15}}

In [5]:
project_id = cfg.get("project")
processor_id = cfg.get("documentAI").get("processor") # Format is "us" or "eu"
location = cfg.get("documentAI").get("region") # Create processor before running sample
max_pages = cfg.get("documentAI").get("sync_max_pages")
file_path = f"{root}/downloads/2404.19756v2.pdf"
mime_type = "application/pdf"

## Split Pdf file into sub-pdfs of max 15 pages

In [6]:
def split_pdfs(path, pageLimit=15):
    folder_path = pathlib.Path(
        '/'.join(
            path.split("/")[:-1]
        )
    )
    file_name = '.'.join(
        path.split("/")[-1]\
            .split(".")[:-1]
    )
    all_file_paths = []
    pdf = PdfReader(path)
    all_pages = pdf.pages
    
    page_breaks = [(pageLimit*i)+1 for i in range(1, (len(all_pages)//pageLimit)+1)]
    if len(pdf.pages) % pageLimit != 0:
        page_breaks.append(len(all_pages)+1)
    page_counter = 1
    page_start = 1

    for i, pg_start in enumerate(page_breaks, 1):
        fname = f"{file_name}_{i}.pdf"
        writer = PdfWriter()
        for j in range(page_start, pg_start):
            writer.add_page(all_pages[j-1])
            page_counter += 1
        print(f"writing from {page_start} to {page_counter-1}")
        with open(folder_path/fname, "wb") as outfile:
            writer.write(outfile)
        all_file_paths.append(folder_path/fname)
        page_start = pg_start

    return all_file_paths
    

# all_files = split_pdfs(file_path)
# print(all_files)

## Parse PDF with Document AI OCR

In [7]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> None:
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    name = client.processor_path(project_id, location, processor_id)

    if len(PdfReader(file_path).pages) > max_pages:
        all_files = split_pdfs(file_path, pageLimit=max_pages)
    else:
        all_files = [pathlib.Path(file_path)]

    for f in all_files:
        fname = pathlib.PurePath(f).parts[-1]
        print(f"Processing {fname}....")
        # Read the file into memory
        with open(f, "rb") as image:
            image_content = image.read()

        # Load binary data
        raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)

        # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
        # Optional: Additional configurations for processing.
        # process_options = documentai.ProcessOptions(
        #     # Process only specific pages
        #     individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
        #         pages=[1]
        #     )
        # )

        # Configure the process request
        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document
        )

        result = client.process_document(request=request)

        # For a full list of `Document` object attributes, reference this page:
        # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
        document = result.document

        # Read the text recognition output from the processor
        # print("The document contains the following text:")
        # print(document.text)
        # break
        return document


In [8]:
doc = process_document_sample(project_id, location, processor_id, file_path, mime_type)

writing from 1 to 15
writing from 16 to 30
writing from 31 to 45
writing from 46 to 48
Processing 2404.19756v2_1.pdf....


## Post-Process payload from Document AI

In [15]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )

In [16]:
def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f"    {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f"        First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f"        Last text block: {repr(last_block_text)}")

In [17]:
def print_paragraphs(
    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f"    {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f"        First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f"        Last paragraph text: {repr(last_paragraph_text)}")


In [19]:
for page in doc.pages:
    print_blocks(page.blocks, doc.text)
    print_paragraphs(page.paragraphs, doc.text)
    break

    60 blocks detected:
        First text block: 'arXiv:2404.19756v2 [cs.LG] 2 May 2024\n'
        Last text block: 'Preprint. Under review.\n'
    64 paragraphs detected:
        First paragraph text: 'arXiv:2404.19756v2 [cs.LG] 2 May 2024\n'
        Last paragraph text: 'Preprint. Under review.\n'


In [23]:
page1_para = iter([paragraph.layout for paragraph in doc.pages[0].paragraphs])

In [88]:
layout_to_text(next(page1_para), doc.text)

StopIteration: 