In [None]:
# Google Cloud project and Document AI region used by every call in this notebook
project_id = "kmpg-case"
location = "eu"

# Processor settings: display name and type are used when creating the processor,
# id and version when sending documents to it
processor_display_name = "PDF_PROCESSOR_EU"
processor_type = "OCR_PROCESSOR"
processor_id = "fde971a6ca78aafa"
processor_version = "rc"

# MIME type sent along with every uploaded file
mime_type = "application/pdf"

In [None]:
# Create a Document AI processor on Google Cloud for OCR scanning of documents.
# Run this only once.

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition
from google.cloud import documentai

def create_processor(
    project_id: str, location: str, processor_display_name: str, processor_type: str
):
    """Create a Document AI processor and print its identifying fields.

    Args:
        project_id: Google Cloud project in which the processor is created.
        location: Region used for both the API endpoint and the parent path.
        processor_display_name: Display name given to the new processor.
        processor_type: Processor type, passed to the API as ``type_``.

    Returns:
        None. The processor's name, display name and type are printed.
    """
    # The endpoint is derived from the location (required for anything other than 'us').
    endpoint = f"{location}-documentai.googleapis.com"
    service = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Description of the processor to be created.
    new_processor = documentai.Processor(
        display_name=processor_display_name, type_=processor_type
    )

    # The parent is the location's full resource name,
    # e.g.: projects/project_id/locations/location
    created = service.create_processor(
        parent=service.common_location_path(project_id, location),
        processor=new_processor,
    )

    # Report what the service created.
    print(f"Processor Name: {created.name}")
    print(f"Processor Display Name: {created.display_name}")
    print(f"Processor Type: {created.type_}")

# Enable the processor in the cloud.
# Run this only once.

def enable_processor(project_id: str, location: str, processor_id: str):
    """Enable an existing Document AI processor and wait for the operation to finish.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Region used for both the API endpoint and the processor path.
        processor_id: Identifier of the processor to enable.

    Returns:
        None. The operation name is printed; if the service answers with
        ``FailedPrecondition`` its message is printed instead of raising.
    """
    # The endpoint is derived from the location (required for anything other than 'us').
    endpoint = f"{location}-documentai.googleapis.com"
    service = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor,
    # e.g.: projects/project_id/locations/location/processors/processor_id
    enable_request = documentai.EnableProcessorRequest(
        name=service.processor_path(project_id, location, processor_id)
    )

    try:
        pending = service.enable_processor(request=enable_request)
        # Show which operation was started, then block until it completes.
        print(pending.operation.name)
        pending.result()
    except FailedPrecondition as err:
        # Cannot enable a processor that is already enabled
        print(err.message)

#create_processor( project_id, location, processor_display_name, processor_type)
#enable_processor(project_id,location,processor_id)


In [None]:
# FASTTEXT LANGUAGE detection 
import fasttext as ft

# Load the pretrained fastText language-detection model from the working directory.
# NOTE(review): "lid.176.ftz" must already exist locally; nothing visible here downloads it.
ft_model = ft.load_model("lid.176.ftz")

def fasttext_language_predict(text, model = ft_model):
    """Run the language model on ``text`` after turning newlines into spaces.

    Args:
        text: String to classify.
        model: Object exposing ``predict``; defaults to the module-level ``ft_model``.

    Returns:
        Whatever ``model.predict`` returns for the one-element list holding
        the newline-free text.
    """
    # Newlines are replaced by single spaces before prediction
    # (presumably because predict expects single-line input; confirm against fastText docs).
    single_line = " ".join(text.split('\n'))
    return model.predict([single_line])


In [None]:
# PROCESS DOCUMENT CORE

def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """Send one local file to a Document AI processor version and return the result.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Region used for both the API endpoint and the resource path.
        processor_id: Identifier of the processor.
        processor_version: Processor version to address.
        file_path: Path of the file to upload; it is read fully into memory.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the service's process response.
    """
    # The endpoint is derived from the location (required for anything other than 'us').
    endpoint = f"{location}-documentai.googleapis.com"
    service = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor version, e.g.
    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    # The processor must already exist.
    version_name = service.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the whole file as bytes.
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    # Wrap the bytes in a RawDocument and address the request to the processor version.
    request = documentai.ProcessRequest(
        name=version_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    return service.process_document(request=request).document

In [None]:
# PROCESS DOCUMENT with OCR
 
from typing import Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

import pandas as pd

def process_document_ocr(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    document_id: str,
    mime_type: str,
) -> pd.DataFrame:
    """OCR one document, save its text to disk, and return a one-row DataFrame.

    The row holds ``document_id``, the detected ``language`` and one column
    ``p1`` ... ``pN`` per paragraph, numbered continuously across all pages.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Region of the processor.
        processor_id: Identifier of the processor.
        processor_version: Processor version to address.
        file_path: Path of the file to process.
        document_id: Identifier stored in the row and used for the text file
            name ``processed_data/{document_id}.txt``.
        mime_type: MIME type of the file.

    Returns:
        A DataFrame with exactly one row as described above.
    """
    # Local import so this function does not depend on another cell having imported os.
    import os

    # Online processing request to Document AI
    document = process_document(project_id, location, processor_id, processor_version, file_path, mime_type)
    text = document.text

    # Write document as text file; create the folder first so a fresh checkout does not fail.
    os.makedirs('processed_data', exist_ok=True)
    with open(f'processed_data/{document_id}.txt', 'w', encoding="utf-8") as f:
        f.write(text)

    # Detect language of text. The helper predicts on a one-element list, so the
    # top label of that single text sits at [0][0][0] of the result.
    top_label = fasttext_language_predict(text, model=ft_model)[0][0][0]

    # Strip the '__label__' prefix instead of keeping the last two characters,
    # so three-letter language codes are not truncated.
    label_prefix = '__label__'
    if top_label.startswith(label_prefix):
        language = top_label[len(label_prefix):]
    else:
        language = top_label

    # Start the row with the fixed columns
    new_row = [document_id, language]
    cols = ['document_id', 'language']

    # Counter for paragraphs already added, so numbering continues across pages
    base = 0

    # Get all paragraphs from all pages
    for page in document.pages:
        new_row, cols, x = print_paragraphs(page.paragraphs, text, new_row, cols, base)
        base += x

    # Return new_row as a one-row dataframe
    return pd.DataFrame([new_row], columns=cols)


def print_paragraphs(paragraphs: "Sequence[documentai.Document.Page.Paragraph]", text: str, new_row: list, cols: list, base: int) -> tuple:
    """Append each paragraph's text to ``new_row`` and a matching name to ``cols``.

    Args:
        paragraphs: Paragraphs of one page; each must expose ``layout``.
        text: Full document text that the layouts' offsets refer to.
        new_row: Row values collected so far; extended in place.
        cols: Column names collected so far; extended in place.
        base: Number of paragraphs already added, so names continue as
            ``p{base+1}``, ``p{base+2}``, ...

    Returns:
        ``(new_row, cols, count)`` where ``count`` is the number of paragraphs
        added by this call (0 for a page without paragraphs).
    """
    for offset, paragraph in enumerate(paragraphs, start=1):
        new_row.append(layout_to_text(paragraph.layout, text))
        cols.append(f'p{base+offset}')
    # Use len() rather than the loop variable: the loop variable is never
    # bound when the page has no paragraphs.
    return new_row, cols, len(paragraphs)

def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Resolve a layout's text anchor into the piece(s) of ``text`` it points at.

    Document AI locates text through offsets into the document's full text.
    Each segment of the anchor contributes the slice between its start and
    end index; the slices are concatenated in segment order.
    """
    # A text span that covers several lines is stored as several segments.
    return "".join(
        text[int(piece.start_index):int(piece.end_index)]
        for piece in layout.text_anchor.text_segments
    )

In [None]:
# Function to count pages in .pdf
import PyPDF2

def get_nr_of_pages(file):
    """Return the number of pages in a PDF.

    Args:
        file: Path or file object accepted by the PyPDF2 reader.

    Returns:
        The page count as an int.
    """
    # PdfFileReader / numPages are deprecated and raise in PyPDF2 3.x.
    # Prefer PdfReader when the installed version has it, and fall back to
    # the old class only on releases that predate it.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    return len(reader_cls(file).pages)

In [None]:
import alive_progress
import os
from os import listdir
import shutil

# Create new dataframe with paragraphs from all documents
df=pd.DataFrame()

path='Bilingual/Nederlands/'

# Max page size on Google Cloud = 10
max_pages=10

# Oversized documents are copied here. Create the folder up front so
# shutil.copy cannot fail on a missing directory.
error_path=os.path.join('processed_data','error')
os.makedirs(error_path, exist_ok=True)

counter=int(0)
files_in_path=listdir(path)

# Match the extension case-insensitively so files such as 'SCAN.PDF' are not
# silently skipped, and count only PDFs so the progress total is accurate.
pdf_files=[file for file in files_in_path if file.lower().endswith('.pdf')]

for file in pdf_files:
    counter+=1
    document_id=file
    file_path=os.path.join(path,document_id)

    if get_nr_of_pages(file_path)<=max_pages:
        print(f"[{counter}/{len(pdf_files)}] Processing {document_id} ...   ",end='\r')
        new_df=process_document_ocr(project_id,location,processor_id,processor_version,file_path,document_id,mime_type)
        df = pd.concat([df, new_df], ignore_index = True)
        print(f"Processing {document_id} : Done",end='\r')
        # The CSV is rewritten after every document, so results gathered so far
        # are on disk if a later document fails.
        # NOTE(review): the output name says 'FR' while the input folder is 'Nederlands' — confirm.
        df.to_csv('FR_doc_paragraphs.csv', index=True)

    else:
        # Too many pages for processing: copy the file aside instead
        dst_file=os.path.join(error_path,document_id)
        shutil.copy(file_path, dst_file)
        print(f'{document_id}: Too Large', end="\r")

