In [None]:
# Enable IPython's autoreload extension. In mode 1 only the modules registered
# through `%aimport` below are reloaded before each cell execution, so edits to
# those project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport aymurai.pipeline
%aimport aymurai.pipeline.pipeline
%aimport aymurai.pipeline.preprocess
%aimport aymurai.pipeline.training
%aimport aymurai.utils.cache
%aimport aymurai.text.extraction
%aimport aymurai.models.spacy

In [None]:
import random
import locale 

from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentine Spanish (UTF-8) for the whole
# process. setlocale raises locale.Error if this locale is not installed on
# the host running the kernel.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')


In [None]:

def gender_annotated(item) -> bool:
    """Tell whether an item carries at least one non-empty gender annotation.

    Inspects the ``genero_acusado/a`` and ``genero_denunciante`` fields of
    every entry under ``item['annotations']``.

    Args:
        item: mapping with an ``annotations`` entry holding an iterable of
            annotation mappings.

    Returns:
        True if any annotation has a truthy value in either gender field.
        False otherwise, including when ``annotations`` is missing, ``None``
        or empty, or when an annotation lacks the gender fields.
    """
    gender_keys = ('genero_acusado/a', 'genero_denunciante')
    # `.get(...) or []` tolerates items without annotations instead of raising.
    annotations = item.get('annotations') or []
    # `.get(key)` treats an absent field like an empty one instead of raising
    # KeyError on partially annotated entries.
    return any(
        annotation.get(key)
        for annotation in annotations
        for key in gender_keys
    )




# Sampling parameters: a fixed seed makes the drawn sample (and therefore the
# item inspected by index further down) identical on every Restart & Run All.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10

# Keep only the items of the 'private' dataset that have a gender annotation.
private = [
    item
    for item in ArgentinaJuzgadoPCyF10Dataset('private')
    if gender_annotated(item)
]

# A dedicated Random instance gives a reproducible draw without reseeding the
# module-level generator. The size is capped so that fewer than SAMPLE_SIZE
# annotated items does not raise ValueError.
sample = random.Random(SAMPLE_SEED).sample(
    private,
    k=min(SAMPLE_SIZE, len(private)),
)

In [None]:
from copy import deepcopy
from typing import Any
import spacy
from aymurai.pattern.fuzzytagging import FuzzyEntityTagger


@spacy.language.Language.factory("aymurai_fuzzy_ruler")
def aymurai_date_fuzzy_matcher(
    nlp,
    name,
    patterns: dict[str, list[str]] = {},
    min_r2: int = 80,
):
    """Build the component registered under the ``aymurai_fuzzy_ruler`` factory.

    Args:
        nlp: pipeline object handed straight to ``FuzzyEntityTagger``.
        name: component instance name; not used by this factory.
        patterns: maps each label to the patterns registered for it through
            the tagger's ``add`` method. The default dict is only iterated
            here, never mutated.
        min_r2: forwarded to ``FuzzyEntityTagger`` as ``min_r2_score``
            (presumably a match-score threshold; confirm against the tagger).

    Returns:
        The ``FuzzyEntityTagger`` instance with all patterns added.
    """
    tagger = FuzzyEntityTagger(nlp, min_r2_score=min_r2)
    for entity_label in patterns:
        tagger.add(entity_label, patterns[entity_label])
    return tagger

In [None]:
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import TextNormalize
from aymurai.models.spacy import SpacyPipeline

# Keyword options paired with the FulltextExtract preprocessing step.
fulltext_extract_options = {
    "extension": "pdf",
    "method": "tesseract",
    "language": "spa",
    "errors": "ignore",
    "use_cache": True,
}

# Label -> phrases passed as `patterns` to the "aymurai_fuzzy_ruler" factory.
gender_patterns = {
    "AYMURAI_GENDER": ["mujer cis", "varon cis"],
}

# (factory name, factory config) pair listed under the SpacyPipeline steps.
fuzzy_ruler_step = (
    "aymurai_fuzzy_ruler",
    {
        "patterns": gender_patterns,
        # 'min_r2': 98,
    },
)

# Options paired with SpacyPipeline: the base name plus the ordered steps.
spacy_pipeline_options = {
    "base": "es",
    "steps": [fuzzy_ruler_step],
}

# Full pipeline configuration; each stage is a list of (class, options) pairs.
config = {
    "preprocess": [
        (FulltextExtract, fulltext_extract_options),
        (TextNormalize, {}),
    ],
    "models": [
        (SpacyPipeline, spacy_pipeline_options),
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)


In [None]:
# Run the pipeline's preprocess stage over the sampled items, then pass the
# preprocessed items to predict. `result` is what the inspection cell below
# indexes into.
preprocess = pipeline.preprocess(sample)
result = pipeline.predict(preprocess)

In [None]:
import json
from more_itertools import flatten

# Position within `result` of the processed item to inspect.
REGISTRY_INDEX = 8

registry = result[REGISTRY_INDEX]

# Show only the non-container top-level fields. isinstance (rather than an
# exact type comparison) also leaves out dict/list subclasses.
metadata = {
    key: value
    for key, value in registry.items()
    if not isinstance(value, (dict, list))
}
# ensure_ascii=False keeps accented Spanish text readable instead of \uXXXX
# escapes. default=str prints values json cannot encode natively instead of
# raising TypeError.
print(json.dumps(metadata, indent=4, ensure_ascii=False, default=str))
print('annotated genders')

# All 'genero_acusado/a' values first, then all 'genero_denunciante' values.
annotated_genders = [
    annotation[key]
    for key in ('genero_acusado/a', 'genero_denunciante')
    for annotation in registry['annotations']
]
print(annotated_genders)

print('\n-------\n')
doc = registry['data']['spacy.doc']

# Render the entities stored on the processed document.
spacy.displacy.render(doc, style='ent')