In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentine Spanish (UTF-8).
# NOTE(review): presumably needed so month names (e.g. the `%B` directive used
# in the DATE patterns further down) are handled in Spanish — confirm.
# `setlocale` raises `locale.Error` if this locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Shared renderer instance, called at the end of the notebook to display a result.
render = DocRender()

In [None]:
# Load the "private" split of the dataset, with caching enabled.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
# Draw a reproducible 10-item subset (integer train_size = absolute count,
# fixed random_state); the remainder of the split is discarded.
sample, _ = train_test_split(private, train_size=10, random_state=22)

In [None]:
from typing import Any, Optional
from copy import deepcopy
from functools import partial

import spacy
from spaczz.pipeline import SpaczzRuler

import aymurai.spacy.components


@spacy.language.Language.factory("aymurai_spaczz_ruler")
def aymurai_date_fuzzy_matcher(
    nlp,
    name,
    patterns: Optional[list[dict]] = None,
):
    """spaCy factory registered as ``aymurai_spaczz_ruler``.

    Builds a ``SpaczzRuler`` on top of ``nlp`` and loads ``patterns`` into it.

    Args:
        nlp: the pipeline object the ruler is created for.
        name: component name supplied by spaCy; not used by this factory.
        patterns: pattern dicts forwarded to ``SpaczzRuler.add_patterns``.
            ``None`` (the default) means "no patterns".

    Returns:
        The configured ``SpaczzRuler`` instance.
    """
    ruler = SpaczzRuler(nlp)
    # A None sentinel replaces the former `{}` default, which contradicted the
    # list annotation and was a shared mutable default.
    ruler.add_patterns(patterns if patterns is not None else [])
    return ruler


def __filter_entities(
    doc: spacy.tokens.Doc, enable: Optional[list[str]] = None
) -> spacy.tokens.Doc:
    """Keep only the entities of ``doc`` whose label is listed in ``enable``.

    Args:
        doc: document whose ``ents`` are filtered; it is modified in place.
        enable: entity labels to keep. ``None`` (the default) or an empty
            list removes every entity.

    Returns:
        The same ``doc`` object, with ``doc.ents`` replaced by the kept entities.
    """
    # None sentinel instead of a shared mutable `[]` default.
    allowed = enable if enable is not None else []
    doc.ents = tuple(ent for ent in doc.ents if ent.label_ in allowed)

    return doc
@spacy.language.Language.factory("filter_entities")
def pipe_filter_entities(nlp, name, enable: list[str]):
    """spaCy factory registered as ``filter_entities``.

    Binds ``enable`` to ``__filter_entities`` and returns the resulting
    callable. ``nlp`` and ``name`` are accepted but not used here.
    """
    component = partial(__filter_entities, enable=enable)
    return component

In [None]:
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.core import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.pattern.ar.articles import (
    LAW_CODES,
    LAW_PATTERN,
    LAW_CODES_ABBRS,
    ART_PATTERN_MULTI_MOD,
    ART_PATTERN_MULTI_PREFIX,
)

# Pipeline configuration: only the "preprocess" stage is populated. It extracts
# text from PDFs with tesseract (Spanish), normalizes it, and then runs a spaCy
# pipeline based on `es_core_news_lg` with the custom steps listed below.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            SpacyRulerPipeline,
            {
                "base": "es_core_news_lg",
                "steps": [
                    # Keep only the "PER" entities (see the `filter_entities`
                    # factory defined earlier in this notebook).
                    (
                        "filter_entities",
                        {
                            "enable": [
                                "PER",
                            ]
                        },
                    ),
                    # Patterns mix strftime-like directives with regex syntax.
                    # NOTE(review): presumably expanded by `enhanced_regex_ruler`
                    # before compilation — confirm against its implementation.
                    (
                        "enhanced_regex_ruler",
                        {
                            "patterns": {
                                "TIME": [
                                    "%H:%M",
                                    # NOTE(review): `(?i)` sits mid-pattern here.
                                    # Python's `re` rejects global inline flags
                                    # that are not at the start of the expression
                                    # from 3.11 on — confirm how the ruler
                                    # compiles this, or move the flag to the
                                    # front as the DATE pattern below does.
                                    "%-H(.|:)%M (?i)horas",
                                    # Raw string: same runtime value as before,
                                    # without the invalid `\.` escape sequence.
                                    r"%-H.%M h(rs|r)\.?",
                                ],
                                "DATE": [
                                    "%d/%m/%Y",
                                    "%d/%m/%y",
                                    "(?i)%d de %B del? %Y",
                                ],
                            },
                        },
                    ),
                    # (
                    #     "fuzzy_ruler",
                    #     {
                    #         "patterns": [
                    #             {
                    #                 "label": "AYMURAI_SECTION_BREAKPOINT",
                    #                 "patterns": [
                    #                     "DESARROLLO:",
                    #                     "DECIDE:",
                    #                     "DECIDO:",
                    #                     "ANTECEDENTES:",
                    #                 ],
                    #             },
                    #         ],
                    #     },
                    # ),
                    # (
                    #     "spaczz_ruler",
                    #     {
                    #         "patterns": [
                    #             {
                    #                 "label": "AYMURAI_ARTICLE",
                    #                 "type": "regex",
                    #                 "pattern": ART_PATTERN_MULTI_PREFIX,
                    #             },
                    #             {
                    #                 "label": "AYMURAI_ARTICLE",
                    #                 "type": "regex",
                    #                 "pattern": ART_PATTERN_MULTI_MOD,
                    #             },
                    #             {
                    #                 "label": "AYMURAI_CODE_OR_LAW",
                    #                 "type": "regex",
                    #                 "pattern": f"{LAW_PATTERN}",
                    #             },
                    #             # *[
                    #             #     {
                    #             #         "label": "AYMURAI_CODE_OR_LAW",
                    #             #         "type": "fuzzy",
                    #             #         "pattern": code,
                    #             #         "kwargs": {"mir_r2": 100},
                    #             #     }
                    #             #     for code in LAW_CODES
                    #             # ],
                    #             *[
                    #                 {
                    #                     "label": "AYMURAI_CODE_OR_LAW",
                    #                     "type": "regex",
                    #                     # "pattern": code,
                    #                     "pattern": f"{abbr}(\s?CABA)?",
                    #                     # "kwargs": {"ignore_case": False},
                    #                 }
                    #                 for abbr in LAW_CODES_ABBRS
                    #             ],
                    #         ],
                    #     },
                    # ),
                ],
            },
        ),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

# Build the pipeline object from the configuration above.
pipeline = AymurAIPipeline(config)

In [None]:
# Run only the preprocess stage of the pipeline over the sampled documents.
# The result is kept in `a` and indexed by a later cell.
a = pipeline.preprocess(sample)


In [None]:
import json
import srsly

# Inspect a single preprocessed item: the one at index 2 of the pipeline output.
registry = a[2]

# Metadata entries whose value is neither exactly a dict nor exactly a list.
metadata = {
    key: value
    for key, value in registry["metadata"].items()
    if type(value) not in (dict, list)
}
# print(json.dumps(metadata, indent=4))

# Dump the whole item as YAML text.
print(
    srsly.yaml_dumps(
        registry,
        indent_offset=4,
        indent_mapping=4,
        indent_sequence=6,
    )
)

# Last expression of the cell: render the item with the shared DocRender instance.
render(registry)