In [None]:
# !sudo pip install -e /workspace/src/aymurai

## Load model

In [None]:
import flair, torch

from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SpaceTokenizer


# pin flair to the CPU, whatever hardware is present
flair.device = torch.device("cpu")
# cell output: whether torch can see a CUDA device (informational only; the
# device was already fixed to CPU on the line above)
torch.cuda.is_available()

In [None]:
# folder that contains the serialized tagger (note the trailing slash: the
# file name is appended to it verbatim)
path = "/resources/ner/flair/no-finetune-221123-no-decision/"

# restore the sequence tagger from the checkpoint stored in that folder
tagger = SequenceTagger.load(f"{path}model.pt")

## Prediction formatting

In [None]:
import re
import random

from glob import glob
from spacy import displacy
from aymurai.text.extraction import extract_document

In [None]:
# Collect every .docx below the dataset folder. `**` only descends through
# nested directories when `recursive=True` is passed; without it the pattern
# behaves like `*` and matches exactly one directory level.
doc_paths = glob(
    "/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO/**/**.docx",
    recursive=True,
)
len(doc_paths)

In [None]:
# pick one of the collected documents at random; no seed is set in the cells
# above, so a different file may be selected on every run
doc_path = random.choice(doc_paths)
# cell output: the selected path
doc_path

In [None]:
# extract the content of the selected file with aymurai's extractor; the cells
# below treat the result as a single string (regex substitutions, splitlines)
doc = extract_document(doc_path)

In [None]:
# distinct whitespace characters present in the extracted text
set(re.findall(r"\s", doc))

In [None]:
# Normalise every line-break style to a plain "\n" first: `str.splitlines()`
# (used later to feed the tagger line by line) also breaks on "\r" and "\r\n",
# while the character-offset arithmetic further down assumes that each line
# break is exactly one character long.
doc = re.sub(r"\r\n?", "\n", doc)

# collapse any run of horizontal whitespace (spaces, '\t', '\xa0', ...) into a
# single space; new lines are left untouched
doc = re.sub(r"[^\S\n]+", " ", doc)

# replace multiple new lines with just one break
doc = re.sub(r"\n+", "\n", doc)

In [None]:
predicted = []
dicts_ents = []

# the tokenizer is built once and shared by every line instead of being
# re-instantiated on each iteration
tokenizer = SpaceTokenizer()

# displacy - line by line
doc_lines = doc.splitlines()
for line in doc_lines:
    sentence = Sentence(line, use_tokenizer=tokenizer)
    tagger.predict(sentence)

    # fetch the predicted NER spans once and reuse them below
    ner_spans = sentence.get_spans("ner")

    # payload for displacy's manual mode: raw text plus start/end/label triples
    dic_ents = {
        "text": line,
        "ents": [
            {
                "start": ent.start_position,
                "end": ent.end_position,
                "label": ent.labels[0].value,
            }
            for ent in ner_spans
        ],
        "title": None,
    }

    # render only when the Sentence object is truthy
    if sentence:
        displacy.render(dic_ents, manual=True, style="ent")

    # appended for every line (rendered or not) so that both lists stay
    # index-aligned with doc_lines
    predicted.append(ner_spans)
    dicts_ents.append(dic_ents)

### AymurAI format

In [None]:
from numpy import cumsum
from more_itertools import flatten

In [None]:
def extract_spans(sentences: list[flair.data.Span]):
    """Turn predicted spans into a list of entity dictionaries.

    Despite the parameter name, each item is handled as a single span: its
    label is read with ``get_label()`` and the ``Span[<start>:<end>]`` fragment
    of the label's ``labeled_identifier`` supplies the two token indices.

    Args:
        sentences: spans to convert; a falsy value (``None`` or an empty list)
            yields an empty result.

    Returns:
        One dictionary per span with the keys ``start_token``, ``end_token``,
        ``label``, ``text``, ``start``, ``end`` and ``attrs`` (holding the
        label score and the fixed ``aymurai_method`` value ``"ner"``).

    Raises:
        IndexError: if a ``labeled_identifier`` has no ``Span[a:b]`` fragment.
    """
    if not sentences:
        return []

    token_range = re.compile(r"Span\[(\d+):(\d+)\]")

    def to_entity(span):
        # everything is read through the label: the span itself is reachable
        # as `label.data_point`
        label = span.get_label()
        data_point = label.data_point
        surface = data_point.text
        tag = label.value
        confidence = label.score
        first_token, last_token = token_range.findall(label.labeled_identifier)[0]

        return {
            "start_token": int(first_token),
            "end_token": int(last_token),
            "label": tag,
            "text": surface,
            "start": data_point.start_position,
            "end": data_point.end_position,
            "attrs": {
                "score": confidence,
                "aymurai_method": "ner",
            },
        }

    return [to_entity(span) for span in sentences]

In [None]:
# number of tokens and characters per line
# per-line sizes: characters, and whitespace-separated tokens
n_chars = list(map(len, doc_lines))
n_tokens = [len(words) for words in map(str.split, doc_lines)]

In [None]:
# AymurAI entities for every line; positions are still relative to their line
spans = [extract_spans(pred) for pred in predicted]

# running totals of tokens / characters up to and including each line
accumulated_tokens = cumsum(n_tokens)
accumulated_chars = cumsum(n_chars)

# Shift the positions of every line after the first by the size of all the
# preceding lines. The extra `i` adds one position per preceding line: for
# characters that is the "\n" separating the lines; for tokens it presumably
# means the line break is counted as a token in the target format (confirm).
# The offsets are cast to plain `int`: adding a numpy scalar would silently
# turn the stored positions into numpy.int64 for every line but the first,
# which mixes types and breaks e.g. JSON serialization.
for i, line_spans in enumerate(spans):
    if i == 0:
        continue

    token_offset = int(accumulated_tokens[i - 1]) + i
    char_offset = int(accumulated_chars[i - 1]) + i

    for span in line_spans:
        span["start_token"] += token_offset
        span["end_token"] += token_offset
        span["start"] += char_offset
        span["end"] += char_offset

# single flat list with the entities of the whole document
spans_lists = list(flatten(spans))

In [None]:
# displacy - whole document
dictsplacy = {
    "text": doc,
    "ents": spans_lists
}

displacy.render(dictsplacy, manual=True, style="ent")