In [None]:
%%capture
!sudo pip install names-dataset

In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Set every locale category to es_AR.UTF-8. `locale.setlocale` raises
# `locale.Error` when that locale is not available on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Renderer instance; in this notebook it is only referenced by a
# commented-out `render(doc)` call further down.
render = DocRender()

In [None]:
# Load the dataset (constructed with 'private' and caching enabled).
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
# Two-stage split with a fixed seed: 20% of everything is held out as test,
# then 20% of the remaining 80% becomes validation (64/16/20 overall).
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)
# Report the size of each partition.
print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

In [None]:
from typing import Iterable

from names_dataset import NameDataset
from more_itertools import flatten, unique_everseen


class NamesDatabase(NameDataset):
    """`NameDataset` with optional country filtering and per-token lookup tables.

    Attributes set by the constructor:
        country_codes: set of country codes used for filtering.
        first_names / last_names: the tables loaded by `NameDataset`, reduced
            to entries whose ``"country"`` mapping has at least one key in
            ``country_codes`` (left untouched when ``country_codes`` is empty).
        unique_first_names / unique_last_names: ``{token: info}`` dictionaries
            produced by `build_unique_names` from the (filtered) tables.
    """

    def __init__(self, country_codes: Iterable[str] = ()):
        """Load first and last names and keep only those of `country_codes`.

        Args:
            country_codes: country codes matched against the keys of each
                entry's ``"country"`` mapping. A single string is treated as
                one code. Empty (the default) disables filtering.
        """
        # A bare string would otherwise be split into single characters by set().
        if isinstance(country_codes, str):
            country_codes = [country_codes]
        self.country_codes = set(country_codes)
        super().__init__(load_first_names=True, load_last_names=True)
        if self.country_codes:
            self.first_names = self._filter_by_country(self.first_names)
            # The filtered last names must be stored in `last_names`; writing
            # them to `first_names` would clobber the first-name table and
            # leave the last names unfiltered.
            self.last_names = self._filter_by_country(self.last_names)

        self.unique_first_names = self.build_unique_names(self.first_names)
        self.unique_last_names = self.build_unique_names(self.last_names)

    def _filter_by_country(self, names: dict) -> dict:
        """Return the entries of `names` that list any of `self.country_codes`."""
        return {
            name: info
            for name, info in names.items()
            if set(info["country"].keys()) & self.country_codes
        }

    @staticmethod
    def build_unique_names(names: dict) -> dict:
        """Index a name table by individual space-separated token.

        Each key of `names` is split on single spaces; tokens of three
        characters or fewer are dropped. When the same token comes from
        several entries, the info record of the entry with the smallest
        ``max(info["rank"].values())`` is kept (entries with an empty
        ``"rank"`` mapping sort last).

        Args:
            names: mapping ``full name -> info``, where ``info["rank"]`` is a
                mapping whose values are comparable ranks.

        Returns:
            ``{token: info}`` ordered by the ranking described above.
        """
        unranked = 9999999  # sort key for entries without any rank

        # Pair every sufficiently long token with the info of the entry it
        # came from. The length check must look at the token itself: checking
        # the (token, info) pair (always length 2) would reject every name.
        tokens = [
            (token, info)
            for full_name, info in names.items()
            for token in full_name.split(" ")
            if len(token) > 3
        ]
        # Stable sort, so ties keep the insertion order of `names`.
        tokens.sort(key=lambda pair: max(pair[1]["rank"].values(), default=unranked))
        # Keep only the first (best ranked) record for each token.
        return dict(unique_everseen(tokens, key=lambda pair: pair[0]))

In [None]:
from spacy.language import Language
from spacy.pipeline import EntityRuler


@Language.factory(name="name_lookup_ruler")
def name_lookup(nlp, name, country_codes=["AR"], overwrite_ents: bool = True):
    """spaCy factory building an `EntityRuler` that labels known names as ``PER``.

    Args:
        nlp: the pipeline the component is being added to.
        name: instance name of the component, forwarded to `EntityRuler`.
        country_codes: forwarded to `NamesDatabase` to restrict the names used.
        overwrite_ents: forwarded to `EntityRuler`.

    Returns:
        The `EntityRuler` with one single-token pattern per known name.
    """
    # NOTE(review): the previous comment here spoke of *first* names, but the
    # patterns are built from `unique_last_names` -- confirm which is intended.
    surnames = NamesDatabase(country_codes=country_codes).unique_last_names

    def person_pattern(surname):
        # Match a token case-insensitively, but only when it is title-cased.
        token_spec = {"LOWER": surname.lower(), "IS_TITLE": True}
        return {"label": "PER", "id": "PER", "pattern": [token_spec]}

    entity_ruler = EntityRuler(nlp, name=name, overwrite_ents=overwrite_ents)
    entity_ruler.add_patterns([person_pattern(surname) for surname in surnames])
    return entity_ruler


In [None]:
from aymurai.spacy.ner import SpacyNER
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Pipeline configuration: only preprocessing steps are enabled here; the
# "models" and "postprocess" stages are left empty.
config = {
    # Each preprocessing step is a (class, keyword-options) pair.
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        # Disabled step: would run the "name_lookup_ruler" factory through
        # SpacyRulerPipeline as part of preprocessing.
        # (
        #     SpacyRulerPipeline,
        #     {
        #         "base": "es",
        #         "steps": [
        #             ("name_lookup_ruler", {'country_codes': ['AR']})
        #         ],
        #     },
        # ),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    # Pipeline-level flag, distinct from the "use_cache" option given to
    # FulltextExtract above.
    "use_cache": False,
    # 'log_level': 'debug'
}

# Build the pipeline from the configuration above.
pipeline = AymurAIPipeline(config)

In [None]:
# Run the configured preprocessing steps over the training split.
preprocessed_train = pipeline.preprocess(train)

In [None]:
import spacy

# Start from a blank Spanish pipeline and add the name-lookup ruler registered
# above with its default arguments.
nlp = spacy.blank('es')
nlp.add_pipe('name_lookup_ruler')

In [None]:

# Take the 'doc.text' field of the first preprocessed training item and run it
# through the name-lookup pipeline.
text = preprocessed_train[0]['data']['doc.text']
doc = nlp(text)

# render(doc)

In [None]:
from spacy import displacy

# Visualize the entities found in `doc` with displaCy's 'ent' style.
displacy.render(doc, 'ent')