In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category of this process to Argentine Spanish (UTF-8).
# This raises locale.Error if the 'es_AR.UTF-8' locale is not installed on the host.
# NOTE(review): presumably needed so locale-dependent date parsing/formatting
# (e.g. month names) works in Spanish — confirm against the date patterns used later.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Renderer instance used further down to display a record.
render = DocRender()

In [None]:
# Load the 'private' dataset with caching enabled.
private = ArgentinaJuzgadoPCyF10Dataset("private", use_cache=True)

# Hold out 20% of the data as the test set, then 20% of what remains as the
# validation set. Both splits use the same fixed seed so they are reproducible.
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the size of each split.
print(f"train: {len(train)}")
print(f"test: {len(test)}")
print(f"val: {len(val)}")

In [None]:
from typing import Any
from copy import deepcopy
from functools import partial

import spacy
from spaczz.pipeline import SpaczzRuler

import aymurai.spacy.components


@spacy.language.Language.factory("aymurai_spaczz_ruler")
def aymurai_date_fuzzy_matcher(
    nlp,
    name,
    patterns: list[dict] = [],
):
    """spaCy factory ("aymurai_spaczz_ruler") that builds a SpaczzRuler.

    Args:
        nlp: the pipeline object the ruler is created for.
        name: component instance name passed in by the factory machinery;
            not used by this factory.
        patterns: pattern dicts handed to ``SpaczzRuler.add_patterns``.
            Defaults to no patterns.

    Returns:
        The ``SpaczzRuler`` instance with ``patterns`` added.
    """
    # The default must be a list (it used to be `{}`) so it agrees with the
    # `list[dict]` annotation. It is copied below and never mutated, so the
    # shared default object stays empty across calls.
    ruler = SpaczzRuler(nlp)
    ruler.add_patterns(list(patterns))
    return ruler


def __filter_entities(
    doc: spacy.tokens.Doc, enable: list[str] = []
) -> spacy.tokens.Doc:
    """Drop every entity whose label is not listed in ``enable``.

    Args:
        doc: document whose ``ents`` are filtered; it is modified in place.
        enable: entity labels to keep. With the default (empty) value all
            entities are removed. The list is only read, never mutated.

    Returns:
        The same ``doc`` object, with ``doc.ents`` reduced to the kept entities.
    """
    # Build the filtered tuple completely before assigning it back to doc.ents.
    doc.ents = tuple(ent for ent in doc.ents if ent.label_ in enable)

    return doc
@spacy.language.Language.factory("filter_entities")
def pipe_filter_entities(nlp, name, enable: list[str]):
    """spaCy factory ("filter_entities") producing an entity-label filter.

    Args:
        nlp: pipeline object passed in by the factory machinery; unused here.
        name: component instance name; unused here.
        enable: entity labels the resulting component keeps.

    Returns:
        A callable that takes a doc and applies ``__filter_entities`` to it
        with ``enable`` already bound.
    """
    component = partial(__filter_entities, enable=enable)
    return component

In [None]:
from aymurai.spacy.models.ner import SpacyNER
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract


# Pipeline configuration: each stage is a list of (class, kwargs) pairs.
config = {
    "preprocess": [
        # 1) Text extraction from the source documents (PDF, tesseract, Spanish).
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        # 2) Text normalization with default options.
        (TextNormalize, {}),
        # 3) spaCy pipeline on top of es_core_news_lg with extra rule steps.
        (
            SpacyRulerPipeline,
            {
                "base": "es_core_news_lg",
                "steps": [
                    # Keep only PER entities (see the "filter_entities" factory
                    # registered earlier in this notebook).
                    (
                        "filter_entities",
                        {
                            "enable": [
                                "PER",
                            ]
                        },
                    ),
                    # Regex-based TIME / DATE patterns.
                    # NOTE(review): the %H, %M, %d, %B ... tokens look like
                    # strftime-style placeholders expanded by the ruler — confirm.
                    # NOTE(review): "(?i)" appears mid-pattern in one TIME entry;
                    # Python's `re` (3.11+) rejects global inline flags that are
                    # not at the start of the expression — confirm how
                    # enhanced_regex_ruler compiles these patterns.
                    # NOTE(review): "." is unescaped in "%-H.%M" and "(.|:)"; if
                    # compiled as a plain regex it matches any character — confirm
                    # whether a literal dot was intended.
                    (
                        "enhanced_regex_ruler",
                        {
                            "patterns": {
                                "TIME": [
                                    "%H:%M",
                                    "%-H(.|:)%M (?i)horas",
                                    # Raw string: same runtime value as before, but
                                    # without the invalid "\." escape sequence that
                                    # triggers a Deprecation/SyntaxWarning.
                                    r"%-H.%M h(rs|r)\.?",
                                ],
                                "DATE": [
                                    "%d/%m/%Y",
                                    "%d/%m/%y",
                                    "(?i)%d de %B del? %Y",
                                ],
                            },
                        },
                    ),
                ],
            },
        ),
    ],
    "models": [
        (
            SpacyNER,
            {
                "base_config": "ner",
                "batch_size": 8,
            },
        )
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": True,
    # 'log_level': 'debug'
}

# Build the pipeline from the configuration above.
pipeline = AymurAIPipeline(config)


In [None]:
# Run the pipeline's preprocessing stage over the training split first and the
# validation split second; unpacking consumes the map in that order.
preprocessed_train, preprocessed_val = map(pipeline.preprocess, (train, val))


In [None]:
import srsly

# Pick one preprocessed training record (hard-coded index 6) for manual inspection.
registry = preprocessed_train[6]

# Keep only the metadata entries whose value is not exactly a dict or a list.
# NOTE(review): `metadata` is not used by the print below, which dumps the whole
# `registry` instead — confirm whether `metadata` was meant to be printed.
metadata = {
    k: v for k, v in registry["metadata"].items() if type(v) not in [dict, list]
}
# print(json.dumps(metadata, indent=4))
# Dump the full record as YAML.
# NOTE(review): the record comes from the 'private' dataset — check the rendered
# output for sensitive content before sharing the notebook.
print(srsly.yaml_dumps(registry, indent_offset=4, indent_mapping=4, indent_sequence=6))

# Display the record with the DocRender instance created at the top.
render(registry)


In [None]:
# Fit the pipeline on the preprocessed training data, passing the preprocessed
# validation data as the second argument. The return value is discarded here.
# Earlier variant that captured the return value:
# predict_train, predict_val = pipeline.fit(preprocessed_train, preprocessed_val)
pipeline.fit(preprocessed_train, preprocessed_val)

In [None]:
# Show the pipeline's `models` attribute as the cell output.
pipeline.models

In [None]:
import spacy
# Run the first model's spaCy pipeline on the text of the first preprocessed
# training record, then render the resulting entities with displaCy.
sample_text = preprocessed_train[0]["data"]["doc.text"]
doc = pipeline.models[0].nlp(sample_text)

spacy.displacy.render(doc, style="ent")