In [1]:
from google.cloud import documentai_v1 as documentai
from google.oauth2 import service_account
from database import Database

db = Database()

# Base processor data
project_id = "genealogy-ocr-index"
location = "us"
processor_id = "4cb2ae40b145c48"

class ocr_engine:
    def __init__(self) -> None:

        # Create a client
        cred = service_account.Credentials.from_service_account_file("secrets/docai_credentials.json")
        self.__client = documentai.DocumentProcessorServiceClient(
            credentials=cred,
            client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
        )

        # Get the processor
        self.__processor = self.__client.processor_path(project=project_id, location=location, processor=processor_id)

    def process_documents(self):

        # Get image from URL
        uris = db.storage_get_images()

        # Get extension of images to set correct mime type
        extension = uris[2][-3:]
        if "jpg" in extension:
            extension = "jpeg"
        elif "png" in extension:
            extension = "png"

        # Create a raw document object
        document = documentai.GcsDocument(gcs_uri=uris[2], mime_type=f"image/{extension}")

        # Create API request
        request = documentai.ProcessRequest(name=self.__processor, 
                                            gcs_document=document,
                                            process_options={"ocr_config": {
                                                "hints": {"language_hints": "es"}
                                                }
                                            })

        # Get result
        result = self.__client.process_document(request=request)
        return result.document

App initialized


In [94]:
# Send API call and get result
engine = ocr_engine()
result = engine.process_documents()
text = result.text

In [97]:
# Iterate over blocks to gather starting and ending indexes
def get_indexes(doc):
    block_idx = []
    
    for page in doc.pages:
        for block in page.blocks:
            start = block.layout.text_anchor.text_segments[0].start_index
            end = block.layout.text_anchor.text_segments[0].end_index
            block_idx.append((start, end))

    return block_idx

# Get blocks of text for processing
def get_text(idx, text):
    start = idx[0]
    end = idx[1]

    return text[start:end]

# Process text to remove new line charachters
def remove_newline(text_block):
    document_text = ""
    i = 0
    while i < len(text_block):
        if text_block[i] == "-" and text_block[i+1] == "\n":
            document_text += ""
            i += 2
        elif text_block[i] == "\n" and text_block[i-1] != "":
            document_text += " "
            i += 1
        else:
            document_text += text_block[i]
            i += 1

    return document_text

In [98]:
# Get blocks
block_idx = get_indexes(result)

# Get processed blocks of text
text_blocks = []
for idx in block_idx:
    text_block = get_text(idx, text)
    print(f"Block: {text_block}")
    text_blocks.append(remove_newline(text_block))

print(text_blocks)

Block: 648

Block: -62-
en el citio de la Loma donde se divide para Chiscote, la que hubo
en el reparto del resguardo de Guacamayas.-

Block: 334)--Esc .292-- El Cocuy 25 de julio de 1858- Eleuterio y Ju-
lian Patricio venden a Joaquín 1 aría Espinel el derecho y acción
que a cada uno le corresponde en las tierras del Alizal, ycortade
ra vereda do. laa Tapias jurisdicción de Chiscas, las que hubieron
en el reparto del resguardo de índíjenas de dicho Chiscas.-------
355)--Esc #293-- El Cocuy 25 de julio de 1858 Juan Crisostomo Ro
dríguez, vendor a Gavino Largo un pedazo de tierra en el sitio de
Bocachica jurisdicción de la Capilla; el que hubo por compra

Block: Π

Block: Javier Solano y Gregorio Murillo

Block: o a su pr

Block: 536)--Eso . 294-- ElCocuy 25 de julio de 1858-- Con éste Número
y en la fecha queda protocolado el documento que Antonio José de
Herrera otorgó a Mateo Marón y Francisco Javier Leal vendiendoles
una estancia en el páramo del Contento jurisdicción de Chiscas y
G

In [126]:
# Load pre-trained and custom NER models for extraction
import spacy
nlp_es = spacy.load("es_core_news_lg")
nlp_custom = spacy.load("ner_model/model/model-best")

# Load list of names
import csv
names = []
last_names = []

with open('ner_model/data/processed/first_names_processed.csv', 'r', newline="") as file:
    reader = csv.reader(file, delimiter=",")
    for row in reader:
        names.append(row[0])

with open('ner_model/data/processed/last_names_processed.csv', 'r', newline='') as file:
    reader = csv.reader(file, delimiter=",")
    for row in reader:
        last_names.append(row[0])

# Add names and last names to entity ruler
ruler = nlp_es.add_pipe("entity_ruler", after="ner", config={"overwrite_ents":True})

# Create patterns
patterns = []
for name in names:
    patterns.append({
        "label":"FIRST_NAME",
        "pattern": [{"LOWER":name.lower()}]
    })

for name in last_names:
    patterns.append({
        "label":"LAST_NAME",
        "pattern": [{"LOWER":name.lower()}]
    })

# Add patterns to pipeline
ruler.add_patterns(patterns)

In [129]:
# Extract names
names = []

for block in text_blocks:
    doc = nlp_custom(block)
    for ent in doc.ents:
        if ent.label_ == "LAST_NAME" or ent.label_ == "FIRST_NAME":
            print("## Default model results: ", ent, ent.label_)
            tokens = ent.text.split(" ")
            for token in tokens:
                name = nlp_es(token)
                for name_ent in name.ents:
                    if name_ent.label_ == "LAST_NAME" or name_ent.label_ == "FIRST_NAME":
                        print("     -- Default model results: ", name_ent, name_ent.label_)
                


## Default model results:  648 FIRST_NAME
## Default model results:  -62en FIRST_NAME
## Default model results:  Chiscote LAST_NAME
## Default model results:  334)--Esc FIRST_NAME
## Default model results:  .292-- El Cocuy 25 LAST_NAME
## Default model results:  1858- FIRST_NAME
## Default model results:  Eleuterio LAST_NAME
     -- Default model results:  Eleuterio LAST_NAME
## Default model results:  Julian FIRST_NAME
     -- Default model results:  Julian LAST_NAME
## Default model results:  Patricio LAST_NAME
     -- Default model results:  Patricio LAST_NAME
## Default model results:  Joaquín FIRST_NAME
## Default model results:  Espinel FIRST_NAME
     -- Default model results:  Espinel LAST_NAME
## Default model results:  acción FIRST_NAME
## Default model results:  Alizal FIRST_NAME
## Default model results:  ycortade ra FIRST_NAME
## Default model results:  vereda do LAST_NAME
     -- Default model results:  vereda LAST_NAME
## Default model results:  laa FIRST_NAME
## Default

In [156]:
for row in first_names[0]:
    for i in range(len(last_names[0])):
        if row in last_names[0][i]:
            new_table = last_names.drop(i)

KeyboardInterrupt: 

In [None]:
new_table

In [16]:
text = document_text.split()

In [17]:
i = 0

In [18]:
modified_text = []
while i < len(text):
    print(f"i: {i}")
    if i+1 < len(text):
        combined = text[i] + text[i+1]
        #print("Combined element: " + combined)
        doc = nlp(combined)
        if len(doc.ents) == 0:
            modified_text.append(text[i])
            i += 1
        else:
            for ent in doc.ents:
                print(doc.ents)
                print(ent.label_)
                if ent.label_ == "PER":
                    print("Person!")
                    modified_text.append(combined)
                    i += 2
                    break
                else:
                    print("Not a person.")
                    modified_text.append(text[i])
                    i += 1

i: 0
i: 1
i: 2
i: 3
i: 4
i: 5
(laLoma,)
LOC
Not a person.
i: 6
i: 7
i: 8
i: 9
i: 10
(paraChiscote,)
MISC
Not a person.
i: 11
(Chiscote,)
MISC
Not a person.
i: 12
(laque,)
PER
Person!
i: 14
(huboen,)
MISC
Not a person.
i: 15
(enel,)
PER
Person!
i: 17
(repartodel,)
PER
Person!
i: 19
(resguardode,)
LOC
Not a person.
i: 20
i: 21
i: 22
i: 23
(ElCocuy,)
LOC
Not a person.
i: 24
i: 25
i: 26
i: 27
(juliode,)
PER
Person!
i: 29
(1858-Eleuterio,)
MISC
Not a person.
i: 30
i: 31
(yJulian,)
MISC
Not a person.
i: 32
(JulianPatricio,)
LOC
Not a person.
i: 33
(Patriciovenden,)
PER
Person!
i: 35
(aJoaquín,)
MISC
Not a person.
i: 36
(Joaquín1,)
ORG
Not a person.
i: 37
i: 38
(aríaEspinel,)
LOC
Not a person.
i: 39
i: 40
i: 41
(derechoy,)
PER
Person!
i: 43
(acciónque,)
LOC
Not a person.
i: 44
(quea,)
PER
Person!
i: 46
(cadauno,)
PER
Person!
i: 48
(lecorresponde,)
LOC
Not a person.
i: 49
(correspondeen,)
PER
Person!
i: 51
(lastierras,)
PER
Person!
i: 53
(delAlizal,)
MISC
Not a person.
i: 54
(Alizal,)
PER
Pers

In [319]:
" ".join(modified_text)

"648 -62en el citio de la Loma donde se divide para Chiscote, laque hubo enel repartodel resguardo de Guacamayas.334)--Esc .292-- El Cocuy 25 de juliode 1858- Eleuterio y Julian Patriciovenden a Joaquín 1 aría Espinel el derechoy acción quea cadauno le correspondeen lastierras del Alizal,ycortade ra vereda do. laa Tapias jurisdicción de Chiscas, las que hubieronen el repartodel resguardo de índíjenas de dicho Chiscas.------355)--Esc #293-- El Cocuy 25 de juliode 1858 Juan Crisostomo Rodríguez, vendor a Gavino Largo un pedazo detierra enel sitio de Bocachica jurisdicción de la Capilla; el que hubo por compra Π Javier Solano y Gregorio Murillo o asu pr 536)--Eso . 294-- ElCocuy 25 de juliode 1858-- Con éste Número y enla fechaqueda protocolado el documento que Antonio José de Herreraotorgó a Mateo Marón y Francisco Javier Leal vendiendoles una estanciaen el páramodel Contento jurisdicción de Chiscasy Guicon 357)--Folio 512-- El Cocuy 1° de setiembre de 1807-- Antonio José de Herrera, ven

In [320]:
document_text = " ".join(modified_text)
print(document_text)

648 -62en el citio de la Loma donde se divide para Chiscote, laque hubo enel repartodel resguardo de Guacamayas.334)--Esc .292-- El Cocuy 25 de juliode 1858- Eleuterio y Julian Patriciovenden a Joaquín 1 aría Espinel el derechoy acción quea cadauno le correspondeen lastierras del Alizal,ycortade ra vereda do. laa Tapias jurisdicción de Chiscas, las que hubieronen el repartodel resguardo de índíjenas de dicho Chiscas.------355)--Esc #293-- El Cocuy 25 de juliode 1858 Juan Crisostomo Rodríguez, vendor a Gavino Largo un pedazo detierra enel sitio de Bocachica jurisdicción de la Capilla; el que hubo por compra Π Javier Solano y Gregorio Murillo o asu pr 536)--Eso . 294-- ElCocuy 25 de juliode 1858-- Con éste Número y enla fechaqueda protocolado el documento que Antonio José de Herreraotorgó a Mateo Marón y Francisco Javier Leal vendiendoles una estanciaen el páramodel Contento jurisdicción de Chiscasy Guicon 357)--Folio 512-- El Cocuy 1° de setiembre de 1807-- Antonio José de Herrera, vend

In [326]:
text = "Maria Juana Vega y Patricio Lopez se casaron hace 10 años"
doc = nlp(text)
for ent in doc.ents:
    print(ent)
    print(ent.label_)
    print(ent.ent_id_)

Maria
LAST_NAME

Juana
LAST_NAME

Vega
LAST_NAME

Patricio
FIRST_NAME

Lopez
LAST_NAME

