In [1]:
from google.cloud import documentai_v1 as documentai
from google.oauth2 import service_account
from database import Database

# Shared handle from the project-local `database` module;
# ocr_engine.process_documents() calls db.storage_get_images() on it.
db = Database()

# Base processor data
# These three values are combined by ocr_engine.__init__ into the processor
# path; `location` is also used to build the regional API endpoint.
project_id = "genealogy-ocr-index"
location = "us"
processor_id = "4cb2ae40b145c48"

class ocr_engine:
    """Client wrapper that sends one stored image to a Document AI processor.

    The processor is identified by the module-level ``project_id``,
    ``location`` and ``processor_id`` values, and the list of stored images is
    obtained from the module-level ``db`` object.
    """

    # Lower-case file extension -> MIME type sent with the request.
    _MIME_TYPES = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "tif": "image/tiff",
        "tiff": "image/tiff",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
        "pdf": "application/pdf",
    }

    def __init__(self) -> None:
        """Create the API client from the service-account key file and
        resolve the full processor path."""

        # Create a client
        cred = service_account.Credentials.from_service_account_file("secrets/docai_credentials.json")
        self.__client = documentai.DocumentProcessorServiceClient(
            credentials=cred,
            client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
        )

        # Get the processor
        self.__processor = self.__client.processor_path(project=project_id, location=location, processor=processor_id)

    @classmethod
    def _mime_type_for(cls, uri):
        """Return the MIME type for ``uri`` based on its file extension.

        The extension is compared case-insensitively and may have any length
        (``.jpeg`` and ``.JPG`` both give ``image/jpeg``). Extensions that are
        not in ``_MIME_TYPES`` fall back to ``image/<extension>``.
        """
        file_name = uri.rsplit("/", 1)[-1]
        extension = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""
        return cls._MIME_TYPES.get(extension, f"image/{extension}")

    def process_documents(self, index=2):
        """Run OCR on one stored image and return the processed document.

        Args:
            index: Position of the image in the list returned by
                ``db.storage_get_images()``. Defaults to ``2``, the position
                that was previously hard-coded.

        Returns:
            The ``document`` attribute of the ``process_document`` response.

        Raises:
            IndexError: If there is no stored image at ``index``.
        """

        bucket_data = db.storage_get_images()

        # Get image URIs (first element of every entry)
        uris = [group[0] for group in bucket_data]
        uri = uris[index]

        # Create a raw document object with the MIME type matching the file
        document = documentai.GcsDocument(gcs_uri=uri, mime_type=self._mime_type_for(uri))

        # Create API request. `language_hints` is a repeated field, so it has
        # to be a list of language codes rather than a bare string.
        request = documentai.ProcessRequest(
            name=self.__processor,
            gcs_document=document,
            process_options={"ocr_config": {"hints": {"language_hints": ["es"]}}},
        )

        # Get result
        result = self.__client.process_document(request=request)
        return result.document

App initialized


In [2]:
# Send API call and get result
# Constructing ocr_engine loads the service-account key file and builds the
# client; process_documents() returns the processed document. `text` is the
# string that the block/token start and end indexes used in later cells
# slice into.
engine = ocr_engine()
result = engine.process_documents()
text = result.text

In [22]:
import re
def extract_year(original_text):
    """Return the first standalone four-digit number found in the text.

    "Standalone" means the four digits are not directly preceded or followed
    by another digit, so longer numbers are skipped while years followed by
    punctuation (``1858-``, ``1858.``) are still found.

    Args:
        original_text: Text to search.

    Returns:
        The four digits as a string, or ``None`` when the text contains no
        standalone four-digit number.
    """
    # Lookarounds keep the neighbouring characters out of the result.
    match = re.search(r"(?<!\d)\d{4}(?!\d)", original_text)
    return match.group(0) if match else None

In [24]:
# Year taken from the OCR text; not referenced again in the cells visible here.
doc_year = extract_year(text)

In [3]:
# Method to create an indexed map of tokens
def get_map(doc):
    """Index every token of ``doc`` by the start index of its first text segment.

    Args:
        doc: Object exposing ``pages``; each page exposes ``tokens`` whose
            ``layout`` carries ``text_anchor.text_segments`` and
            ``bounding_poly.normalized_vertices``.

    Returns:
        dict mapping ``start_index`` to ``(start_index, end_index, vertices)``,
        where ``vertices`` is a list copy of the token's normalized vertices.
        A later token with the same start index replaces the earlier one.
    """
    token_map = {}
    every_token = (token for page in doc.pages for token in page.tokens)
    for token in every_token:
        layout = token.layout
        segment = layout.text_anchor.text_segments[0]
        # The vertices go clockwise, starting on the upper left corner
        corners = list(layout.bounding_poly.normalized_vertices)
        token_map[segment.start_index] = (segment.start_index, segment.end_index, corners)
    return token_map

In [4]:
# Build the token index for the processed document and dump it for inspection.
# NOTE(review): the name `map` shadows the builtin. It is passed to
# get_bounding_box in a later cell, so renaming it means updating that call too.
map = get_map(result)
print(map)

{0: (0, 4, [x: 0.754034042
y: 0.0381727144
, x: 0.77714783
y: 0.0375469327
, x: 0.77714783
y: 0.0506883599
, x: 0.754034042
y: 0.0513141416
]), 4: (4, 9, [x: 0.881378114
y: 0.0500625782
, x: 0.901003063
y: 0.0500625782
, x: 0.901003063
y: 0.0581977479
, x: 0.881378114
y: 0.0581977479
]), 9: (9, 12, [x: 0.602267742
y: 0.0732165202
, x: 0.61229831
y: 0.0732165202
, x: 0.61229831
y: 0.0851063803
, x: 0.602267742
y: 0.0851063803
]), 12: (12, 15, [x: 0.616659403
y: 0.0732165202
, x: 0.626689911
y: 0.0732165202
, x: 0.626689911
y: 0.0851063803
, x: 0.616659403
y: 0.0851063803
]), 15: (15, 21, [x: 0.631923258
y: 0.0732165202
, x: 0.656345427
y: 0.0732165202
, x: 0.656345427
y: 0.0851063803
, x: 0.631923258
y: 0.0851063803
]), 21: (21, 24, [x: 0.662014842
y: 0.0732165202
, x: 0.67204535
y: 0.0732165202
, x: 0.67204535
y: 0.0851063803
, x: 0.662014842
y: 0.0851063803
]), 24: (24, 27, [x: 0.677278697
y: 0.0732165202
, x: 0.687309206
y: 0.0732165202
, x: 0.687309206
y: 0.0851063803
, x: 0.6772786

In [5]:
# Load pre-trained and custom NER models for extraction
import csv

import spacy
from spacy.matcher import Matcher

nlp_es = spacy.load("es_core_news_lg")
nlp_custom = spacy.load("ner_model/model/model-best")

# Create a matcher to improve accuracy
matcher = Matcher(nlp_es.vocab)


def _read_first_column(path):
    """Return the first cell of every non-empty row of the CSV file at ``path``.

    Blank lines (which ``csv.reader`` yields as empty rows) are skipped
    instead of raising ``IndexError``.
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before. Confirm the encoding of the CSV files and pass it explicitly.
    with open(path, 'r', newline='') as file:
        return [row[0] for row in csv.reader(file, delimiter=",") if row]


def _single_token_patterns(values):
    """Build one single-token, case-insensitive (``LOWER``) pattern per value."""
    return [[{"LOWER": value.lower()}] for value in values]


def _ruler_patterns(label, values):
    """Build entity-ruler entries that tag each value with ``label``."""
    return [{"label": label, "pattern": pattern} for pattern in _single_token_patterns(values)]


# Load list of names
names = _read_first_column('ner_model/data/processed/first_names_processed.csv')
last_names = _read_first_column('ner_model/data/processed/last_names_processed.csv')

# Add names and last names to entity ruler
ruler = nlp_custom.add_pipe("entity_ruler", after="ner", config={"overwrite_ents":True})

# Add patterns to matcher
matcher.add("FIRST_NAME", _single_token_patterns(names))

# `match_patterns` is kept as a module-level name holding the last-name
# patterns, which is the value it had at the end of this cell before.
match_patterns = _single_token_patterns(last_names)
matcher.add("LAST_NAME", match_patterns)

# Add patterns to pipeline (first names first, then last names)
patterns = _ruler_patterns("FIRST_NAME", names) + _ruler_patterns("LAST_NAME", last_names)
ruler.add_patterns(patterns)

In [6]:
# Iterate over blocks to gather starting and ending indexes
import unicodedata
def get_indexes(doc):
    """Collect the character span of every block in ``doc``.

    Args:
        doc: Object exposing ``pages``; each page exposes ``blocks`` whose
            ``layout.text_anchor.text_segments`` is a non-empty sequence.

    Returns:
        list of ``(start_index, end_index)`` tuples taken from the first text
        segment of each block, in page order and then block order.
    """
    first_segments = (
        block.layout.text_anchor.text_segments[0]
        for page in doc.pages
        for block in page.blocks
    )
    return [(segment.start_index, segment.end_index) for segment in first_segments]

# Get blocks of text for processing
def get_text(idx, text):
    """Return the slice of ``text`` delimited by ``idx``.

    Args:
        idx: Indexable whose first two items are the start and end offsets.
        text: String to slice.

    Returns:
        ``text[idx[0]:idx[1]]``.
    """
    return text[idx[0]:idx[1]]

# Process text to remove new line characters
def formatting(text_block):
    """Flatten a block of text onto a single line.

    A hyphen immediately followed by a newline is treated as a word split
    across lines and both characters are dropped (``"Ju-\\nlian"`` becomes
    ``"Julian"``). Every remaining newline becomes a single space. A hyphen
    at the very end of the block is kept as-is.

    Args:
        text_block: Text to flatten.

    Returns:
        The flattened string.
    """
    # Order matters: line-break hyphens must go before the remaining newlines
    # are turned into spaces.
    return text_block.replace("-\n", "").replace("\n", " ")

def remove_accent(text):
    """Strip combining accents from ``text`` while keeping ñ / Ñ.

    The text is composed (NFC) first, so an ñ written as ``n`` followed by a
    combining tilde is recognised and preserved just like the single-character
    form. Every other non-ASCII character is decomposed (NFD) and its
    combining marks (Unicode category ``Mn``) are dropped.

    Args:
        text: String to clean.

    Returns:
        The string without combining accents.
    """
    kept = []
    for char in unicodedata.normalize("NFC", text):
        if char in ("ñ", "Ñ") or char.isascii():
            kept.append(char)
            continue
        # Keep the base character(s) and drop only the combining marks.
        kept.extend(
            part
            for part in unicodedata.normalize("NFD", char)
            if unicodedata.category(part) != "Mn"
        )

    return "".join(kept)

In [7]:
# Character span of every block in the processed document
block_idx = get_indexes(result)

# Slice each span out of the full text and flatten it onto one line
text_blocks = [formatting(get_text(span, text)) for span in block_idx]

print("Blocks: ", text_blocks)

Blocks:  ['648 ', '-62en el citio de la Loma donde se divide para Chiscote, la que hubo en el reparto del resguardo de Guacamayas.', '334)--Esc .292-- El Cocuy 25 de julio de 1858- Eleuterio y Julian Patricio venden a Joaquín 1 aría Espinel el derecho y acción que a cada uno le corresponde en las tierras del Alizal, ycortade ra vereda do. laa Tapias jurisdicción de Chiscas, las que hubieron en el reparto del resguardo de índíjenas de dicho Chiscas.------355)--Esc #293-- El Cocuy 25 de julio de 1858 Juan Crisostomo Ro dríguez, vendor a Gavino Largo un pedazo de tierra en el sitio de Bocachica jurisdicción de la Capilla; el que hubo por compra ', 'Π ', 'Javier Solano y Gregorio Murillo ', 'o a su pr ', '536)--Eso . 294-- ElCocuy 25 de julio de 1858-- Con éste Número y en la fecha queda protocolado el documento que Antonio José de Herrera otorgó a Mateo Marón y Francisco Javier Leal vendiendoles una estancia en el páramo del Contento jurisdicción de Chiscas y Guicon ', '357)--Folio 512-- 

In [8]:
# Find names in text and get bounding boxes
def get_bounding_box(name, original_text, map):
    """Locate ``name`` in ``original_text`` and print the matched span.

    The search is case-insensitive and tolerant of line breaks: each character
    of the name may be followed by any run of newlines or hyphens, and each
    space in the name matches any run of whitespace (including none). Only the
    first occurrence in ``original_text`` is reported.

    Args:
        name: Name to look for; converted with ``str()``.
        original_text: Text to search in.
        map: Token index; currently unused by this function.

    Returns:
        None. The result is printed: either the start/end indexes with the
        matched substring, or ``"No matches"``.
    """
    # Ensure the name is a string
    name = str(name)

    # An empty name would otherwise "match" at the first word boundary.
    if not name.strip():
        print("No matches")
        return

    # Get initial and last index of text. Every character is escaped so that
    # regex metacharacters in the name ("(", ".", "+", ...) are matched
    # literally instead of breaking the pattern.
    pieces = []
    for char in name:
        if char == " ":
            pieces.append(r"[\s]*")
        else:
            pieces.append(re.escape(char) + r"[\n-]*")
    pattern = r"\b" + "".join(pieces) + r"\b"

    match = re.search(pattern, original_text, re.IGNORECASE)
    if match:
        idx, end = match.span()
        end += 1 # Add one because Document AI considers whitespaces
        print("Indexes: ", idx, end, "String: ", repr(original_text[idx:end]))
    else:
        print("No matches")
    

In [9]:
## TODO: Need to find a way to be a bit more flexible with the choice 
## TODO: Create a function to push names and bounding box data to database
def _classify_entity_tokens(entity_text):
    """Split a person entity into first names and last names.

    Every space-separated token is run through the custom NER pipeline and the
    matcher; a token is accepted only when both agree on the label. Adjacent
    token pairs are then joined and checked the same way, to catch names that
    were split by a stray space.

    The result lists are local to this function, so the module-level
    ``last_names`` list loaded from CSV is no longer overwritten.

    Args:
        entity_text: Text of the entity to classify.

    Returns:
        ``(first_names, last_names)``: two lists of strings in the order found.
    """
    tokens = entity_text.split(" ")
    found_first = []
    found_last = []

    def _store(label, value):
        # Add to corresponding name list
        if label == "FIRST_NAME":
            found_first.append(value)
        elif label == "LAST_NAME":
            found_last.append(value)

    # Identify names in the entity
    for token in tokens:
        token_doc = nlp_custom(remove_accent(token))
        matches = matcher(token_doc)
        if not matches:
            # Nothing to cross-check the custom model against
            continue

        match_id, match_start, match_end = matches[0]
        matcher_label = nlp_es.vocab[match_id].text
        matcher_span = token_doc[match_start:match_end]

        for name_ent in token_doc.ents:
            if name_ent.label_ == matcher_label:
                _store(name_ent.label_, name_ent.text)

                # Print recognition data
                print("     -- MATCHES MATCHER. Custom model results: ",
                      name_ent,
                      name_ent.label_,
                      "/ Matcher results: ",
                      matcher_span,
                      matcher_label)
            else:
                print("     -- DISCREPANCY WITH MATCHER. Custom model results: ",
                      name_ent, name_ent.label_,
                      matcher_span,
                      matcher_label)

    # Process names separated by a space
    for left, right in zip(tokens, tokens[1:]):
        # Accents are removed here too, consistent with the single-token pass.
        combined = nlp_custom(remove_accent(left + right))
        matches = matcher(combined)
        # Guard against a matcher hit with no entity from the custom pipeline.
        if not matches or not combined.ents:
            continue

        first_ent = combined.ents[0]
        if first_ent.label_ == nlp_es.vocab[matches[0][0]].text:
            _store(first_ent.label_, first_ent.text)

    return found_first, found_last


for block in text_blocks:
    # Extract names
    print("---------- New Block ----------")
    doc = nlp_es(block)
    for ent in doc.ents:
        if ent.label_ != "PER":
            continue

        print("## Default model results: ", ent, ent.label_)

        # Get bounding boxes for the entire entity (result not used yet)
        bounds = get_bounding_box(ent.text, text, map)

        # Split entity to classify in first name and last name
        ent_first_names, ent_last_names = _classify_entity_tokens(ent.text)

        # Report the names found for this entity
        if ent_first_names:
            print("     -- First Names: ", " ".join(ent_first_names))
        if ent_last_names:
            print("     -- Last Names: ", " ".join(ent_last_names))


---------- New Block ----------
---------- New Block ----------
## Default model results:  Guacamayas PER
Indexes:  106 117 String:  'Guacamayas.'
---------- New Block ----------
## Default model results:  Eleuterio PER
Indexes:  166 176 String:  'Eleuterio '
     -- MATCHES MATCHER. Custom model results:  Eleuterio FIRST_NAME / Matcher results:  Eleuterio FIRST_NAME
     -- First Names:  Eleuterio
## Default model results:  Julian Patricio PER
Indexes:  178 196 String:  'Ju-\nlian Patricio '
     -- MATCHES MATCHER. Custom model results:  Julian FIRST_NAME / Matcher results:  Julian FIRST_NAME
     -- MATCHES MATCHER. Custom model results:  Patricio FIRST_NAME / Matcher results:  Patricio FIRST_NAME
     -- First Names:  Julian Patricio
## Default model results:  Joaquín PER
Indexes:  205 213 String:  'Joaquín '
     -- MATCHES MATCHER. Custom model results:  Joaquin FIRST_NAME / Matcher results:  Joaquin FIRST_NAME
     -- First Names:  Joaquin
## Default model results:  Alizal PER
I