In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Load the project's own IPython extension module.
# NOTE(review): what it registers is defined in aymurai.devtools.magic, not visible here.
%load_ext aymurai.devtools.magic
# Mode 2: re-import all modules (except those excluded via %aimport) before each cell runs.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch the process-wide locale to Argentine Spanish.
# NOTE(review): presumably needed so locale-dependent directives such as %B in
# the DATE patterns resolve to Spanish month names — confirm against the ruler.
# When the locale is not installed the stock error is just "unsupported locale
# setting", so re-raise the same exception type with an actionable message.
try:
    locale.setlocale(locale.LC_ALL, "es_AR.UTF-8")
except locale.Error as exc:
    raise locale.Error(
        "locale 'es_AR.UTF-8' is not available on this system; install/generate it "
        "(e.g. `locale-gen es_AR.UTF-8` on Debian/Ubuntu) and restart the kernel"
    ) from exc

# Renderer instance reused by the inspection cell at the end of the notebook.
render = DocRender()

In [None]:
# Instantiate the dataset with the "private" argument and caching switched off
# (use_cache=False), then report how many items it contains.
private = ArgentinaJuzgadoPCyF10Dataset("private", use_cache=False)
print(f"private {len(private)}")

In [None]:
import aymurai.spacy.components.loader
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.utils.entities import FilterEntity
from aymurai.text.normalize import JunkCleaner, TextNormalize
from aymurai.models.dummy.decision import DummyExtractorDecision
from aymurai.models.dummy.n_expte_eje import DummyExtractorExpediente
from aymurai.models.dummy.affiliation import DummyExtractorAffiliation
from aymurai.spacy.components.es_ar.keywords import SpacyRulerKeywords
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser
from aymurai.models.dummy.art_infringido import DummyExtractorArtInfringido
from aymurai.models.dummy.tipo_resolucion import DummyExtractorTipoResolucion
from aymurai.models.dummy.fecha_resolucion import DummyExtractorFechaResolucion
from aymurai.spacy.components.es_ar.art_infringido import SpacyRulerArtInfringido
from aymurai.models.dummy.hora_inicio_cierre import DummyExtractorHoraInicioCierre

# Configuration handed to AymurAIPipeline below. The "preprocess", "models" and
# "postprocess" stages are each a list of (class, kwargs) pairs.
#
# Every literal that carries regex backslashes (`\.`, `\s`, `\w`, `\d`) is a raw
# string: the resulting values are identical, but the code no longer relies on
# Python's deprecated handling of invalid escape sequences.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            JunkCleaner,
            {
                # NOTE(review): if JunkCleaner interprets these as regular
                # expressions, `*` and `.` below act as metacharacters rather
                # than literal text — confirm against JunkCleaner.
                "patterns": [
                    "Juzgado PCyF N* 10 - Tacuarí 138, 7* Piso - juzcyf10ejusbaires.gob.ar - 4014-6821/20 - Gipcyf10",
                ]
            },
        ),
        (
            SpacyRulerPipeline,
            {
                "base": "es",
                "steps": [
                    (
                        "enhanced_regex_ruler",
                        {
                            # Patterns mix regex syntax with strftime-style
                            # directives (%d, %B, %H, ...).
                            # NOTE(review): presumably the ruler expands the
                            # directives before compiling — confirm.
                            "patterns": {
                                "DATE": [
                                    "%-d/%-m/%Y",
                                    "%-d/%-m/%y",
                                    "%d/%m/%Y",
                                    "%d/%m/%y",
                                    "(?i)%-d de %B del? %Y",
                                ],
                                "TIME": [
                                    r"%H(\.|:)%M",
                                    # The global flag (?i) must lead the pattern:
                                    # Python's `re` rejects it mid-expression from
                                    # 3.11 on. Everything before "horas" is digits
                                    # and punctuation, so matching is unchanged.
                                    # NOTE(review): the unescaped `.` matches any
                                    # character, making `(.|:)` equivalent to `.`;
                                    # confirm whether a literal dot was intended
                                    # (as in the pattern above).
                                    r"(?i)%-H(.|:)%M horas",
                                    # NOTE(review): `.` between hour and minute is
                                    # also an any-character match here — confirm.
                                    r"%-H.%M h(rs|r|s)\.?",
                                ],
                                "N_EXPTE_EJE": [
                                    r"(?i)causa\s*(n.)?\s*\d+/%Y(-\d)?",
                                    r"(?i)causa\s*(n.)?\s*\d+/%y(-\d)?",
                                    r"(?i)caso\s*(n.)?\s*\d+/%Y(-\d)?",
                                    r"(?i)caso\s*(n.)?\s*\d+/%y(-\d)?",
                                    r"EXP:\s*\d+/%Y(-\d)?",
                                    r"EXP:\s*\d+/%y(-\d)?",
                                    r"IPP?\s*\d+/%Y(-\d)?",
                                    r"IPP?\s*\d+/%y(-\d)?",
                                ],
                            },
                        },
                    ),
                    # Optional steps, currently disabled:
                    # # violence phrases
                    # ("aymurai_violence_quotes_ruler", {}),
                    # # names ruler
                    # ("name_lookup_ruler", {"country_codes": ["AR"]}),
                    # ("join_consecutive_names", {}),
                ],
            },
        ),
        (
            AymuraiRulerSectionParser,
            {
                "base": "es",
                # Label -> list of heading patterns associated with that label.
                "breakpoints": {
                    "SECTION:DEVELOPMENT": [
                        "DESARROLLO",
                    ],
                    "SECTION:BACKGROUND": [
                        "ANTECEDENTES",
                        "ANTECEDENTES Y ARGUMENTOS",
                    ],
                    "SECTION:ARGUMENTS": [
                        "ARGUMENTOS",
                        "ANTECEDENTES Y ARGUMENTOS",
                        "CONSIDERO",
                    ],
                    "SECTION:DECISION": [
                        "DECID[EO]",
                        "RESUELV[EO]",
                    ],
                    "KEYWORDS": [
                        r"PALABRAS[_\s]+CLAVE[\w\d\s_:]+",
                    ],
                },
            },
        ),
        (SpacyRulerArtInfringido, {}),
        (SpacyRulerKeywords, {}),
    ],
    "models": [
        (DummyExtractorFechaResolucion, {}),
        (DummyExtractorHoraInicioCierre, {}),
        (DummyExtractorExpediente, {}),
        (DummyExtractorDecision, {}),
        (DummyExtractorTipoResolucion, {}),
        (DummyExtractorAffiliation, {}),
    ],
    "postprocess": [(FilterEntity, {"entities": "PER"})],
    "multiprocessing": {},
    "use_cache": True,
    # Optional, left disabled:
    # "log_level": "debug",
}

# Build the pipeline object used by the cells that follow.
pipeline = AymurAIPipeline(config)


In [None]:
# Number of dataset items to push through the pipeline.
# Set to None to process the whole dataset.
N_SAMPLES = 1

if N_SAMPLES is None:
    selected_items = private
else:
    # Clamp to the dataset size so a dataset shorter than N_SAMPLES (or an
    # empty one) does not raise IndexError.
    selected_items = [private[i] for i in range(min(N_SAMPLES, len(private)))]

# Run the three pipeline stages in order. Each intermediate keeps its own name
# so re-running the cell never feeds a stage its own output; `result` is the
# final, post-processed output consumed by the next cell.
preprocessed = pipeline.preprocess(selected_items)
predicted = pipeline.predict(preprocessed)
result = pipeline.postprocess(predicted)

In [None]:
import json

import pandas as pd

def _scalar_fields(mapping):
    """Return the entries of ``mapping`` whose values are neither dicts nor lists.

    ``isinstance`` is used so subclasses of dict/list are filtered out as well.
    """
    return {
        key: value
        for key, value in mapping.items()
        if not isinstance(value, (dict, list))
    }


# Do not truncate columns when DataFrames are displayed.
pd.set_option("display.max_columns", None)

# Inspect the first processed item.
registry = result[0]
metadata = _scalar_fields(registry["metadata"])
annotations = registry["annotations"]
pred_cats = _scalar_fields(registry["predictions"]["doc-cats"])

# ensure_ascii=False keeps accented characters readable in the dump;
# default=str stops the dump failing on values json cannot serialise natively.
print(json.dumps(metadata, indent=4, ensure_ascii=False, default=str))
display(pd.DataFrame(annotations, index=pd.Index(range(len(annotations)))))

print("\n-------\n")
# Single-row frame transposed so each predicted category is shown on its own row.
display(pd.DataFrame(pred_cats, index=pd.Index([0])).T)
render(registry)