In [None]:
# --- Document AI configuration ------------------------------------------------
# Google Cloud project and the region the processor lives in.
project_id = "kmpg-case"
location = "eu"

# Processor identity: display name and type are used when creating the
# processor, the id addresses it in every later call.
processor_display_name = "PDF_PROCESSOR_EU"
processor_type = "OCR_PROCESSOR"
processor_id = "fde971a6ca78aafa"

# Processor version to call and the MIME type of the files sent to it.
processor_version = "rc"
mime_type = "application/pdf"

In [None]:
# Dependencies of this notebook. The string below is only a note and is not
# executed: run these commands once in a shell (or as %pip cells) before the
# first use of the notebook.
"""
pip install --upgrade google-cloud-documentai
pip install fasttext
pip install pyPDF2
"""

In [None]:
# CREATE A Document AI PROCESSOR on Google Cloud FOR OCR SCANNING DOCUMENTS 
# CREATE THIS ONLY 1 TIME 

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition
from google.cloud import documentai

def create_processor(
    project_id: str, location: str, processor_display_name: str, processor_type: str
):
    """Create a Document AI processor and print its identifying fields.

    Args:
        project_id: Google Cloud project that will own the processor.
        location: Region of the processor; also selects the API endpoint.
        processor_display_name: Display name given to the new processor.
        processor_type: Processor type passed to the API as ``type_``.

    Returns:
        None. The processor name, display name and type are printed.
    """
    # A regional endpoint is required whenever the location is not 'us'.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Parent resource of the new processor:
    # projects/{project_id}/locations/{location}
    location_path = client.common_location_path(project_id, location)

    requested = documentai.Processor(
        display_name=processor_display_name, type_=processor_type
    )
    processor = client.create_processor(parent=location_path, processor=requested)

    # Report what the service created.
    print(f"Processor Name: {processor.name}")
    print(f"Processor Display Name: {processor.display_name}")
    print(f"Processor Type: {processor.type_}")

# ENABLE THE PROCESSOR in the cloud
# Used only once

def enable_processor(project_id: str, location: str, processor_id: str):
    """Enable an existing Document AI processor and wait for the operation.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Region of the processor; also selects the API endpoint.
        processor_id: Identifier of the processor to enable.

    Returns:
        None. The operation name is printed; if the service answers with
        ``FailedPrecondition`` (a processor that is already enabled cannot be
        enabled again) the error message is printed instead.
    """
    # A regional endpoint is required whenever the location is not 'us'.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full processor resource name:
    # projects/{project_id}/locations/{location}/processors/{processor_id}
    enable_request = documentai.EnableProcessorRequest(
        name=client.processor_path(project_id, location, processor_id)
    )

    try:
        pending = client.enable_processor(request=enable_request)
        print(pending.operation.name)
        # Block until the long-running operation has finished.
        pending.result()
    except FailedPrecondition as precondition_error:
        print(precondition_error.message)

#create_processor( project_id, location, processor_display_name, processor_type)
#enable_processor(project_id,location,processor_id)


In [None]:
# FASTTEXT LANGUAGE detection 
import fasttext as ft

# Load the pretrained fastText model used for language detection.
# The path is relative, so "lid.176.ftz" must be present in the kernel's
# working directory before this cell runs.
ft_model = ft.load_model("lid.176.ftz")

def fasttext_language_predict(text, model = ft_model):
    """Run the fastText model on ``text`` and return its raw prediction.

    Every newline in ``text`` is replaced by a single space, the resulting
    one-line string is handed to ``model.predict`` as a one-element list, and
    whatever ``predict`` returns is passed back unchanged.

    Args:
        text: The text to classify.
        model: Object exposing ``predict``; defaults to the module-level
            ``ft_model`` as it was bound when this function was defined.
    """
    single_line = " ".join(text.split('\n'))
    return model.predict([single_line])


In [None]:
# PROCESS DOCUMENT AI CORE
# This function links Document AI from Google to the Google processor we just made

def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """Send one local file to a Document AI processor version.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Region of the processor; also selects the API endpoint.
        processor_id: Identifier of the processor (it must already exist).
        processor_version: Version of the processor to call.
        file_path: Path of the file to upload; it is read fully into memory.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the service response.
    """
    # A regional endpoint is required whenever the location is not 'us'.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # projects/{project_id}/locations/{location}/processors/{processor_id}
    #     /processorVersions/{processor_version}
    version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    # Wrap the raw bytes and build the online processing request.
    request = documentai.ProcessRequest(
        name=version_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    response = client.process_document(request=request)
    return response.document

In [None]:
# PROCESS DOCUMENT with OCR
 
from typing import Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

import pandas as pd
import re

def process_document_ocr(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    document_id: str,
    mime_type: str,
    output_path,
) -> pd.DataFrame:
    """OCR one PDF, store its text and return a one-row DataFrame describing it.

    The file is sent to Document AI through ``process_document``; the full text
    is written to ``{output_path}/{document_id without .pdf}.txt`` and the
    language is detected with ``fasttext_language_predict`` using the
    module-level ``ft_model``.

    Args:
        project_id, location, processor_id, processor_version, mime_type:
            Forwarded unchanged to ``process_document``.
        file_path: Path of the PDF to process.
        document_id: File name of the PDF; used as the row identifier and,
            without its ``.pdf`` extension, as the name of the text file.
        output_path: Existing directory that receives the ``.txt`` file.

    Returns:
        A DataFrame with exactly one row and the columns ``document_id``,
        ``sub_part``, ``language`` followed by one ``p<N>`` column per
        paragraph, numbered continuously over all pages.
    """
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )
    text = document.text

    # Remove only a trailing ".pdf" extension. The previous pattern '.pdf' was
    # unescaped and unanchored, so it also removed e.g. "_pdf" inside a name.
    document_id_filename = re.sub(r'\.pdf$', '', document_id)

    # Write document as text file
    with open(f'{output_path}/{document_id_filename}.txt', 'w', encoding="utf-8") as f:
        f.write(text)

    # A '__' marks a PDF that is one part of a split document (original had
    # > 10 pages). The sub part is everything from the last '__' onwards, so it
    # still includes the file extension.
    position = document_id.rfind('__')
    sub_part = document_id[position:] if position != -1 else ''

    # Detect language of text. The prediction is indexed as
    # [labels][first text][best label]; fastText labels look like
    # '__label__nl'. Strip the prefix instead of keeping the last two
    # characters so that three-letter codes (e.g. 'ceb') are not truncated.
    prediction = fasttext_language_predict(text, model=ft_model)
    predicted_labels = prediction[0][0]
    if len(predicted_labels) > 0:
        language = predicted_labels[0].replace('__label__', '', 1)
    else:
        # Defensive: no label was returned (e.g. for text without any words).
        language = ''

    cols = ['document_id', 'sub_part', 'language']
    new_row = [document_id, sub_part, language]

    # Running paragraph counter so that column names continue across pages
    base = 0
    for page in document.pages:
        new_row, cols, added = print_paragraphs(page.paragraphs, text, new_row, cols, base)
        base += added

    # One row per document
    return pd.DataFrame([new_row], columns=cols)


def print_paragraphs(paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str, new_row: list, cols: list, base: int) -> tuple:
    """Append the text of every paragraph to a row and name its column.

    Despite its name this function prints nothing: it extends ``new_row`` and
    ``cols`` in place.

    Args:
        paragraphs: Paragraphs of one page; each needs a ``layout`` attribute.
        text: Full document text the paragraph layouts point into.
        new_row: Row values collected so far; one value is appended per paragraph.
        cols: Column names collected so far; ``p<N>`` is appended per paragraph.
        base: Number of paragraphs already added by earlier pages, so that
            numbering continues at ``base + 1``.

    Returns:
        The tuple ``(new_row, cols, count)`` where ``count`` is the number of
        paragraphs handled by this call (0 for an empty page). The original
        annotation claimed ``None``, which did not match the returned tuple.
    """
    count = 0
    for count, paragraph in enumerate(paragraphs, start=1):
        # Text of this paragraph, resolved through its layout offsets
        new_row.append(layout_to_text(paragraph.layout, text))
        # Matching column name: p<base+1>, p<base+2>, ...
        cols.append(f'p{base + count}')

    return new_row, cols, count

def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Resolve a layout to the text it covers.

    A layout does not carry its own text; it references the document text
    through the segments of its text anchor, each given by a start and an end
    offset. The slices for all segments are concatenated in order, since one
    piece of text may be spread over several segments.
    """
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )

In [None]:
# Function to count pages in .pdf
import PyPDF2

def get_nr_of_pages(file):
    """Return the number of pages of a PDF.

    Args:
        file: Path or file object accepted by the PyPDF2 reader.

    Returns:
        The page count as an int.

    PyPDF2 3.0 removed ``PdfFileReader`` and ``numPages`` in favour of
    ``PdfReader`` and ``len(reader.pages)``. The new API is used when the
    installed PyPDF2 provides it; older releases fall back to the legacy one.
    """
    reader_cls = getattr(PyPDF2, "PdfReader", None)
    if reader_cls is not None:
        return len(reader_cls(file).pages)

    # Legacy PyPDF2 without PdfReader
    return PyPDF2.PdfFileReader(file).numPages

In [None]:
# This function checks if files are still to be processed
# These documents have already been processed into a .txt file
# Make a list of these documents to not feed them to Google Document AI
 
from os import listdir 
import os

def check_if_processed(file_to_check, processed_dir='../new_processed_data/too_large/fixed/txt'):
    """Tell whether the document a PDF belongs to already has OCR output.

    Both the names found in ``processed_dir`` and ``file_to_check`` are reduced
    to a document key: the first three characters are dropped (presumably a
    fixed file-name prefix -- TODO confirm) and everything from the first
    ``__`` onwards is cut off; for ``file_to_check`` any ``.pdf`` is removed
    as well.

    Args:
        file_to_check: File name of the PDF that is about to be processed.
        processed_dir: Folder holding the already written output files.
            Defaults to the folder that used to be hard-coded here.

    Returns:
        True if the document key of ``file_to_check`` matches the key of any
        non-directory entry in ``processed_dir`` whose name contains ``__``.

    NOTE(review): the comparison is per document, not per part. As soon as one
    part (``<doc>__<n>``) has output, every other part of that document is
    reported as processed, and entries without ``__`` are never counted.
    Confirm this is intended, especially when output is written to
    ``processed_dir`` while other parts are still being checked.
    """
    processed_documents = set()

    for entry in listdir(processed_dir):
        # Only split parts are considered. Directories are skipped; the check
        # is made relative to processed_dir (it used to be relative to the
        # current working directory and therefore never matched).
        if '__' not in entry or os.path.isdir(os.path.join(processed_dir, entry)):
            continue
        processed_documents.add(re.sub(r'__(.*)', '', entry[3:]))

    # Reduce the candidate to the same document key
    document_nr = file_to_check[3:]
    document_nr = re.sub(r'\.pdf', '', document_nr)
    document_nr = re.sub(r'__(.*)', '', document_nr)

    return document_nr in processed_documents
 

In [None]:
# This function processes all .pdf files in the input directory

from joblib import delayed, Parallel
import os
from os import listdir
import shutil

# Parallel function
def process_pdfs(output_path,files_in_path,n_jobs,n,input_path=None):
    """Process every ``n_jobs``-th file of ``files_in_path``, starting at index ``n``.

    For each selected name ending in ``.pdf`` that ``check_if_processed`` does
    not report as done:
      * PDFs with at most 10 pages are sent through ``process_document_ocr``
        and their one-row DataFrames are collected;
      * larger PDFs are copied to ``{output_path}/too_large``.

    The Document AI settings (``project_id``, ``location``, ``processor_id``,
    ``processor_version``, ``mime_type``) are read from module-level variables.

    Args:
        output_path: Directory for the OCR output; must contain ``too_large``.
        files_in_path: File names (not paths) found in the input directory.
        n_jobs: Number of workers, used as stride through ``files_in_path``.
        n: Index of this worker, i.e. the first position it handles.
        input_path: Directory the file names are relative to. When omitted the
            module-level ``path`` variable is used, as before.

    Returns:
        A DataFrame with one row per processed PDF (empty if none was processed).
    """
    source_dir = path if input_path is None else input_path
    total = len(files_in_path)

    # Collect the per-document frames and concatenate once at the end instead
    # of growing a DataFrame inside the loop.
    frames = []

    for x in range(int(n), total, n_jobs):
        document_id = files_in_path[x]

        # Check if file is .pdf
        if document_id[-4:] != '.pdf':
            continue
        if check_if_processed(document_id):
            continue

        file_path = os.path.join(source_dir, document_id)

        # Max page size on Google Cloud = 10
        if get_nr_of_pages(file_path) <= 10:
            print(f"[{x}/{len(files_in_path)}] Processing {document_id} ...   ",end='\r')

            # Run the process on the cloud
            frames.append(
                process_document_ocr(project_id,location,processor_id,processor_version,file_path,document_id,mime_type,output_path)
            )

            print(f"Processing {document_id} : Done                                             ",end='\r')
        else:
            # Copy pdf file with > 10 pages to error folder
            dst_file = os.path.join(output_path, 'too_large', document_id)
            shutil.copy(file_path, dst_file)
            print(f'{document_id}: Too Large', end="\r")

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Execute this function to process files in input folder
# This function will save the processed .txt files in /processed_data
# And save a .csv file with the paragraphs and detected language

# NOTICE: afterwards, execute split_max_page_10.ipynb to split the too large .pdfs
# and run these through this script again

input_path='../new_processed_data/too_large/fixed'
output_path='../new_processed_data/too_large/fixed/txt'
output_csv='../NL_doc_paragraphs_new_processed_too_large_fixed_extra.csv'

# Make sure the output directories are present. mkdir (not makedirs) is kept
# on purpose: a missing input folder still fails here instead of silently
# producing an empty result.
for directory in (output_path, os.path.join(output_path, 'too_large')):
    if not os.path.exists(directory):
        os.mkdir(directory)

# Number of parallel workers; worker n handles files n, n + n_jobs, ...
n_jobs=10

# Work in this folder (process_pdfs reads the module-level `path`)
path=input_path
files_in_path=listdir(path)

# One delayed call per worker, run in a shared-memory pool so the workers can
# use the module-level settings and model.
delayed_funcs = [delayed(process_pdfs)(output_path,files_in_path,n_jobs,n) for n in range(n_jobs)]
parallel_pool = Parallel(n_jobs=n_jobs, require='sharedmem')
df_list=parallel_pool(delayed_funcs)

# Combine the per-worker dataframes in a single concat instead of growing the
# result frame-by-frame. Worker-local row numbers are kept as the index.
df=pd.concat(df_list)

# Write the result dataframe
df.to_csv(output_csv, index=True)
