In [None]:
import os
import yaml
import dotenv
import pathlib

from PyPDF2 import PdfReader, PdfWriter

from google.oauth2 import service_account
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

In [None]:
# Load the assistant's YAML configuration (path is relative to the notebook's working directory).
# NOTE(review): FullLoader accepts more than plain YAML scalars/lists/mappings; if the
# config never needs that, `yaml.safe_load` would be the stricter choice.
with pathlib.Path("../ai_assistant/config.yaml").open() as config_stream:
    cfg = yaml.load(config_stream, Loader=yaml.FullLoader)
cfg

In [None]:
# Locate a ".env_dev" file and load its variables into the process environment.
dotenv.load_dotenv(dotenv.find_dotenv(".env_dev"))

# Despite its name, GOOGLE_API_KEY is handed to `from_service_account_file`
# below, i.e. it is used as the path to a service-account JSON key file.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    # Fail fast with an actionable message instead of an opaque error raised
    # deep inside the credentials loader when it receives None.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; it must hold the path to a service-account "
        "JSON key file (expected to be defined in .env_dev)."
    )
credentials = service_account.Credentials.from_service_account_file(api_key)

In [None]:
# Document AI settings are read from the YAML config loaded into `cfg`.
# Indexing (rather than .get) makes a missing "documentAI" section fail with a
# KeyError naming the section instead of an AttributeError on None.
document_ai_cfg = cfg["documentAI"]

project_id = cfg.get("project")
processor_id = document_ai_cfg.get("processor")  # Create processor before running sample
location = document_ai_cfg.get("region")  # Format is "us" or "eu"
# Page threshold above which a PDF is split; read as a global by process_document_sample.
max_pages = document_ai_cfg.get("sync_max_pages")

# NOTE(review): machine-specific absolute path — point this at a local PDF before running.
file_path = "/home/anand/Developer/Tutorials/terraform_tutorial/downloads/2404.19756v2.pdf"
mime_type = "application/pdf"

In [None]:
def split_pdfs(path, pageLimit=15):
    """Split a PDF into consecutive chunks of at most ``pageLimit`` pages.

    Each chunk is written next to the source file as ``<stem>_<n>.pdf``
    (``n`` starts at 1), where ``<stem>`` is the source file name without its
    final extension. Existing files with those names are overwritten.

    Args:
        path: Location of the source PDF, as a ``str`` or ``os.PathLike``.
        pageLimit: Maximum number of pages per chunk; must be at least 1.

    Returns:
        A list of ``pathlib.Path`` objects, one per chunk written, in page
        order. The list is empty when the PDF has no pages.

    Raises:
        ValueError: If ``pageLimit`` is less than 1.
    """
    if pageLimit < 1:
        raise ValueError(f"pageLimit must be a positive integer, got {pageLimit!r}")

    # pathlib handles str and Path inputs alike, as well as names without an
    # extension and files that live directly under the filesystem root.
    source = pathlib.Path(path)
    all_pages = PdfReader(str(source)).pages
    total_pages = len(all_pages)

    all_file_paths = []
    for chunk_no, first in enumerate(range(0, total_pages, pageLimit), 1):
        last = min(first + pageLimit, total_pages)  # exclusive, 0-based

        writer = PdfWriter()
        for page_idx in range(first, last):
            writer.add_page(all_pages[page_idx])
        print(f"writing from {first + 1} to {last}")  # reported as 1-based page numbers

        chunk_path = source.parent / f"{source.stem}_{chunk_no}.pdf"
        with open(chunk_path, "wb") as outfile:
            writer.write(outfile)
        all_file_paths.append(chunk_path)

    return all_file_paths
    

# Demo run: split the configured PDF using the default page limit and show the chunk paths.
all_files = split_pdfs(path=file_path)
print(all_files)

In [None]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    credentials: "service_account.Credentials | None" = None,
) -> None:
    """Run a local file through a Document AI processor and print its text.

    PDFs with more pages than the notebook-level ``max_pages`` are first split
    with ``split_pdfs``; every resulting chunk is then sent as its own
    synchronous request. Any other input is sent as a single request.

    Args:
        project_id: Project component of the processor resource name.
        location: Processor region; also used to build the regional endpoint.
        processor_id: Processor component of the processor resource name.
        file_path: Path of the local file to process.
        mime_type: MIME type of ``file_path`` (e.g. ``"application/pdf"``).
        credentials: Optional credentials object passed to the client. When
            ``None`` the client library resolves credentials on its own.

    Returns:
        None. The extracted text of each processed file is printed.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(
        client_options=opts, credentials=credentials
    )
    name = client.processor_path(project_id, location, processor_id)

    # Only PDFs can be page-counted and split; `max_pages` is the module-level
    # limit defined alongside the other Document AI settings.
    needs_split = (
        mime_type == "application/pdf"
        and len(PdfReader(file_path).pages) > max_pages
    )
    if needs_split:
        chunk_paths = split_pdfs(file_path, pageLimit=max_pages)
    else:
        chunk_paths = [file_path]

    # Process every chunk (not just the first) so long PDFs are fully covered.
    for chunk in chunk_paths:
        chunk_path = pathlib.Path(chunk)
        print(f"Processing {chunk_path.name}....")

        # Load the file's binary content into the request payload.
        raw_document = documentai.RawDocument(
            content=chunk_path.read_bytes(), mime_type=mime_type
        )

        # Optional: a `process_options=documentai.ProcessOptions(...)` argument can
        # restrict processing to specific pages. For more information:
        # https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
        request = documentai.ProcessRequest(name=name, raw_document=raw_document)
        result = client.process_document(request=request)

        # For a full list of `Document` object attributes, reference this page:
        # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
        document = result.document

        # Read the text recognition output from the processor
        print("The document contains the following text:")
        print(document.text)


In [None]:
# Send the configured file to the Document AI processor and print the recognised text.
process_document_sample(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
    mime_type=mime_type,
)