In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically instead of requiring a kernel restart.
%load_ext autoreload
# Project-provided IPython extension.
# NOTE(review): what it registers is not visible from this notebook — confirm.
%load_ext aymurai.devtools.magic
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentinian Spanish (UTF-8).
# NOTE(review): raises locale.Error when "es_AR.UTF-8" is not installed on the host.
locale.setlocale(locale.LC_ALL, "es_AR.UTF-8")

# Document renderer with default settings (rebuilt with custom colours in a later cell).
render = DocRender()

In [None]:
# Load the two dataset variants, reusing cached copies when available.
private = ArgentinaJuzgadoPCyF10Dataset("private", use_cache=True)
public = ArgentinaJuzgadoPCyF10Dataset("latest", use_cache=True)

# Hold out 20% of the private set for testing, then 20% of the remainder for
# validation. The fixed seed keeps both splits reproducible.
train_and_val, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train_and_val, test_size=0.2, random_state=22)

# Size report for every subset.
print("private", len(private))
print("public", len(public))
print("---")
print("train:", len(train))
print("test:", len(test))
print("val:", len(val))

In [None]:
# Keep only documents that (a) are not evidence-admissibility resolutions and
# (b) are flagged as gender violence in their metadata.
docs = [
    doc
    for doc in ArgentinaJuzgadoPCyF10Dataset("private-docs", use_cache=True)
    if "admisibilidad_prueba" not in doc["metadata"]["objeto_de_la_resolucion"]
    and doc["metadata"]["violencia_de_genero"]
]
print("docs", len(docs))

In [None]:
import re
import hashlib
from copy import deepcopy
from itertools import groupby, filterfalse, zip_longest

import regex
import spacy
import srsly
import pandas as pd
from spacy.tokens import Span
from more_itertools import flatten, collapse, unique_everseen

from aymurai.meta.types import DataItem
from aymurai.spacy.utils import format_entity
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.meta.pipeline_interfaces import Transform
from aymurai.spacy.components.regex import EnhancedRegexRuler, EnhancedRegexMatcher
import unidecode


class SpacyRulerViolencePatterns(Transform):
    """Regex-based tagger for violence modality / violence context mentions.

    A spaCy ruler pipeline is built with regex patterns for the labels
    ``MODALIDAD_VIOLENCIA`` and ``CONTEXTO_VIOLENCIA``. After the ruler runs,
    every ``CONTEXTO_VIOLENCIA`` entity whose text contains none of the
    expected keywords is dropped; entities with any other label are kept
    untouched.
    """

    def __init__(self):
        patterns = {}
        patterns["MODALIDAD_VIOLENCIA"] = [
            r"(?i)(modalidad de )?violencia dom[ée]stica",
            r"(?i)(modalidad de )?violencia institucional",
            r"(?i)(modalidad de )?violencia laboral",
            r"(?i)(modalidad de )?violencia libertad\sreproductiva",
            # Accept the accented spelling too, like the other patterns do;
            # `(?i)` alone does not make "e" match "é".
            r"(?i)(modalidad de )?violencia obst[ée]trica",
        ]
        patterns["CONTEXTO_VIOLENCIA"] = [
            # The first pattern is deliberately broad ("violencia <any word>");
            # `postprocess` narrows the matches down to the known keywords.
            r"(?i)(contexto de)? violencia (de )?\w+",
            r"(?i)(contexto de)? violencia contra las? mujer(es)?",
            r"(?i)violencia de g[ée]nero",
            r"(?i)(violencia )?f[íi]sica",
            r"(?i)(violencia )?psicol[óo]gica",
            r"(?i)(violencia )?econ[óo]mica",
            r"(?i)(violencia )?sexual",
            r"(?i)(violencia )?social",
            r"(?i)(violencia )?ambiental",
            r"(?i)(violencia )?pol[íi]tica",
            r"(?i)(violencia )?simb[óo]lica",
        ]
        # Accent-free, lower-case keywords; entity text is normalised the same
        # way in `postprocess` before being compared against this set.
        self.keywords = {
            "genero",
            "fisica",
            "ambiental",
            "psicologica",
            "economica",
            "sexual",
            "social",
            "politica",
            "simbolica",
        }

        self.ruler = SpacyRulerPipeline(
            base="es", steps=[("enhanced_regex_ruler", {"patterns": patterns})]
        )

    def postprocess(self, entity):
        """Validate a ``CONTEXTO_VIOLENCIA`` entity against the keyword set.

        Args:
            entity: entity dict; only its ``"text"`` value is inspected.

        Returns:
            A deep copy of ``entity`` when at least one word of its
            normalised text is in ``self.keywords``, otherwise ``None``.
        """
        # Normalise: lower-case, strip accents, drop punctuation characters.
        text = unidecode.unidecode(entity["text"].lower())
        # A character class is required here: the bare pattern `_-,\.` only
        # matches the literal four-character sequence "_-,." and therefore
        # never removed individual punctuation marks.
        text = re.sub(r"[_\-,.]", "", text)
        # split() without arguments copes with any run of whitespace.
        words = set(text.split())
        if not words & self.keywords:
            return None

        return deepcopy(entity)

    def __call__(self, item: DataItem) -> DataItem:
        """Run the ruler on a copy of ``item`` and filter its context entities.

        Args:
            item: data item; the input object itself is not modified.

        Returns:
            A new item whose ``item["data"]["entities"]`` holds the retained
            entities sorted by their ``"start"`` value.
        """
        item = deepcopy(item)
        item = self.ruler(item)

        ents = []
        if "entities" in item["data"]:
            ents += item["data"]["entities"]

        # Separate the violence-context entities from everything else.
        contexts = [ent for ent in ents if ent["label"] == "CONTEXTO_VIOLENCIA"]
        others = [ent for ent in ents if ent["label"] != "CONTEXTO_VIOLENCIA"]

        # Keep only the contexts that mention a known keyword.
        validated = [ctx for ctx in map(self.postprocess, contexts) if ctx]

        # Merge back and restore document order.
        item["data"]["entities"] = sorted(
            others + validated, key=lambda x: x["start"]
        )
        return item


In [None]:
import aymurai.spacy.components.loader
from aymurai.spacy.models.ner import SpacyNER
from aymurai.spacy.models.core import SpacyModel
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.utils.entities import FilterEntity
from aymurai.text.normalize import JunkCleaner, TextNormalize

# Pipeline definition: only the preprocessing stage is populated. No models or
# postprocessing steps are configured, and pipeline-level caching is off.
# Each preprocess step is a (class, keyword-arguments) pair.
config = {
    "preprocess": [
        (FulltextExtract, {"errors": "ignore", "use_cache": True}),
        (TextNormalize, {}),
        (
            JunkCleaner,
            {
                "patterns": [
                    "Juzgado PCyF N* 10 - Tacuarí 138, 7* Piso - juzcyf10ejusbaires.gob.ar - 4014-6821/20 - Gipcyf10",
                ]
            },
        ),
        (SpacyRulerViolencePatterns, {}),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    # Flip to True to enable pipeline-level caching.
    "use_cache": False,
    # Optional key left disabled in this notebook: "log_level": "debug"
}

pipeline = AymurAIPipeline(config)

In [None]:
# Pick the documents whose registry number is "3787" and show them.
examples = [doc for doc in docs if doc["metadata"]["nro_registro"] == "3787"]
examples

In [None]:
# Run the three pipeline stages on the selected examples.
# To process more documents, pass `docs` (or a slice such as `docs[:100]`)
# to `pipeline.preprocess` instead of `examples`.
preprocessed = pipeline.preprocess(examples)
predicted = pipeline.predict(preprocessed)
result = pipeline.postprocess(predicted)

In [None]:
import json

import pandas as pd

pd.set_option("display.max_columns", None)

# Inspect one processed document; change the index to look at another one.
registry = result[0]
print(registry["path"])

# Dump only the flat metadata fields: values whose type is exactly dict or
# list are left out.
metadata = {
    field: value
    for field, value in registry["metadata"].items()
    if type(value) not in (dict, list)
}
print(json.dumps(metadata, indent=4))

# Annotations table, indexed 0..n-1.
annotations = registry["annotations"]
annotations_index = pd.Index(range(len(annotations)))
display(pd.DataFrame(annotations, index=annotations_index))

print("\n-------\n")
if "predictions" in registry:
    # Same flat-values-only rule as for the metadata above, shown as a
    # single-column table.
    doc_cats = registry["predictions"]["doc-cats"]
    pred_cats = {
        name: value
        for name, value in doc_cats.items()
        if type(value) not in (dict, list)
    }
    display(pd.DataFrame(pred_cats, index=pd.Index([0])).T)


# Colour per entity label for the rendered document
# (FECHA_RESOLUCION has no colour assigned yet).
entity_colors = {
    "AYMURAI_VIOLENCE_QUOTE": "#f95d6a",
    "CONTEXTO_VIOLENCIA": "#d45087",
    "FIRMA": "#007bb3",
    "ART_INFRINGIDO": "#665191",
    "CONDUCTA": "#665191",
    "N_EXPTE_EJE": "#2f4b7c",
}
render = DocRender(config={"colors": entity_colors})
render(registry)

In [None]:
# Number of items in the pipeline output.
len(result)