In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import locale
import random

from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset
from aymurai.spacy.display import DocRender

# Shared renderer instance; it is called later in the notebook to display a processed document.
render = DocRender()

# Switch every locale category to Argentinian Spanish (UTF-8).
# NOTE(review): `setlocale` raises `locale.Error` when the requested locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')

In [None]:
from sklearn.model_selection import train_test_split


def demonym_annotated(item) -> bool:
    """Tell whether `item` has at least one non-empty nationality annotation.

    Both the accused (`nacionalidad_acusado/a`) and the complainant
    (`nacionalidad_denunciante`) fields of every entry in
    `item['annotations']` are inspected.

    Args:
        item: mapping with an `annotations` entry holding annotation dicts.

    Returns:
        True when any of the inspected values is truthy, False otherwise
        (including when there are no annotations at all).

    Raises:
        KeyError: if an annotation lacks one of the two nationality fields.
    """
    records = item['annotations']
    # Field-major order: all accused values first, then all complainant values.
    nationalities = [
        record[field]
        for field in ('nacionalidad_acusado/a', 'nacionalidad_denunciante')
        for record in records
    ]
    informed = [value for value in nationalities if value]
    return len(informed) > 0


# Annotated subset: keep only the documents with at least one nationality annotation.
private = [
    item
    for item in ArgentinaJuzgadoPCyF10Dataset('private')
    if demonym_annotated(item)
]
# Seeded draw of 20 annotated documents; the complement of the split is discarded.
sample, _ = train_test_split(private, train_size=20, random_state=22)

# Full dataset (reloaded with caching enabled): 80/20 train/test split,
# then a further 80/20 train/val split of the training part, both seeded.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the size of each split.
print('train:', len(train))
print('test:', len(test))
print('val:', len(val))


## Define the spaCy component

We define an entity ruler whose patterns are the demonyms created in `00-demonyms-database.ipynb`.

> `dev note:`
>
> Using the `%%export` magic from `aymurai.devtools.magic`, we can build the `aymurai` submodule component directly from the notebook cell below.

In [None]:
%%export aymurai.spacy.components.es.norp_ruler

from glob import glob
import os  # export: hide
import shutil  # export: hide

import spacy
import pandas as pd
from spacy.pipeline import EntityRuler

from aymurai.devtools import resolve_package_path

# Location of the `aymurai.data.spanish.demonyms` package as returned by
# `resolve_package_path`; it is used as a directory: created below and then
# scanned for CSV files.
DEMONYMS_BASEPATH = resolve_package_path('aymurai.data.spanish.demonyms')

# Notebook-side setup: make sure the directory exists and copy the demonyms CSV
# from the current working directory into it.
# NOTE(review): the hide markers surrounding these two statements presumably keep
# them out of the module generated by the cell magic — confirm in `aymurai.devtools.magic`.
# export: start hide
os.makedirs(DEMONYMS_BASEPATH, exist_ok=True)
shutil.copy('es-demonyms-global.csv', DEMONYMS_BASEPATH)
# export: end hide

# Read every CSV found in the demonyms directory and stack them into a single
# frame with a fresh 0..n-1 index.
database = pd.concat(
    list(map(pd.read_csv, glob(f'{DEMONYMS_BASEPATH}/*.csv'))),
    ignore_index=True,
)
# Plain Python list with the values of the "demonym" column.
DEMONYMS = database["demonym"].tolist()


@spacy.language.Language.factory("aymurai_norp_ruler")
def es_norp_ruler(nlp, name):
    """Build an entity ruler that labels known demonyms as `NORP`.

    Registered as the spaCy factory `aymurai_norp_ruler`. One phrase pattern is
    created per entry of the module-level `DEMONYMS` list.

    Args:
        nlp: the `Language` object the component is being created for.
        name: the component name handed to the factory by spaCy.

    Returns:
        An `EntityRuler` loaded with the demonym patterns.
    """
    patterns = [{"label": "NORP", "pattern": demonym} for demonym in DEMONYMS]
    # Forward `name` so the ruler carries the name it was added under, rather
    # than falling back to EntityRuler's default name.
    return EntityRuler(nlp, name=name, patterns=patterns)

In [None]:
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import JunkCleaner, TextNormalize

# import aymurai.spacy.components

# Preprocessing steps, each declared as a (component class, kwargs) pair.

# Full-text extraction settings for the source documents.
fulltext_step = (
    FulltextExtract,
    {
        "extension": "pdf",
        "method": "tesseract",
        "language": "spa",
        "errors": "ignore",
        "use_cache": True,
    },
)

# Text normalization with default options.
normalize_step = (TextNormalize, {})

# Boilerplate line of the court documents handed to the junk cleaner.
junk_step = (
    JunkCleaner,
    {
        "patterns": [
            "Juzgado PCyF N* 10 - Tacuarí 138, 7* Piso - juzcyf10ejusbaires.gob.ar - 4014-6821/20 - Gipcyf10",
        ]
    },
)

# spaCy pipeline on the "es" base whose only step is the demonym ruler factory.
norp_ruler_step = (
    SpacyRulerPipeline,
    {
        "base": "es",
        "steps": [
            ("aymurai_norp_ruler", {}),
        ],
    },
)

config = {
    "preprocess": [fulltext_step, normalize_step, junk_step, norp_ruler_step],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the pipeline's preprocessing stage over the training split.
preprocess = pipeline.preprocess(train)
# Run the prediction stage on the preprocessed items.
# NOTE(review): the config declares no models or postprocess steps, so `result`
# presumably carries the same content as `preprocess` — confirm.
result = pipeline.predict(preprocess)

In [None]:
import json

import spacy
from more_itertools import flatten

# Documents with at least one NORP entity and more than one annotation row.
norp = [
    doc
    for doc in result
    if any(entity['label'] == 'NORP' for entity in doc['data']['entities'])
    and len(doc['annotations']) > 1
]

# Pick one of those documents for manual inspection.
registry = norp[9]

# Only the flat metadata values are printed; dict and list values are left out.
metadata = {
    key: value
    for key, value in registry['metadata'].items()
    if type(value) not in (dict, list)
}
print(registry['path'])
print(json.dumps(metadata, indent=4))
print('annotated demonyms')

# Annotated nationalities: every accused value first, then every complainant value.
print(
    [
        annotation[field]
        for field in ('nacionalidad_acusado/a', 'nacionalidad_denunciante')
        for annotation in registry['annotations']
    ]
)

print('\n-------\n')
render(registry, 'span', spans_key='section')

In [None]:
import json
import pandas as pd


# Annotation columns to display: accused-related fields first, then complainant-related ones.
# NOTE(review): 'edad_acusado/a al momento del hecho' uses spaces while the complainant
# counterpart uses underscores — confirm both match the dataset's column names.
campos_de_interes = [
    'genero_acusado/a',
    'persona_acusada_no_determinada',
    'nacionalidad_acusado/a',
    'edad_acusado/a al momento del hecho',
    'nivel_instruccion_acusado/a',
    'genero_denunciante',
    'nacionalidad_denunciante',
    'edad_denunciante_al_momento_del_hecho',
    'nivel_instruccion_denunciante',
    'domicilio_denunciante',
]

# One row per annotation of the selected document, restricted to those columns.
df = pd.DataFrame(registry['annotations'])
df[campos_de_interes]

In [None]:
import pandas as pd
from more_itertools import collapse
import pickle

def dump_pickle(obj, filename):
    """Serialize `obj` into `filename` using pickle.

    Args:
        obj: any picklable object.
        filename: destination path; the file is created or overwritten.
    """
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)


# Document paths listed in the headerless CSV, flattened into a plain list.
paths = pd.read_csv('casos-lio.csv', header=None).values
paths = list(collapse(paths))
print(paths)

# Keep the already preprocessed items whose path is listed in the CSV.
# A set makes each membership check constant-time instead of a list scan.
wanted_paths = set(paths)
muestra = [item for item in preprocess if item['path'] in wanted_paths]
dump_pickle(muestra, 'muestra.pickle')

# Preprocess each split and persist the preprocessed items. The train split goes
# last so that `preprocess` and `result` end up holding the train outputs.
for split, filename in (
    (test, 'preprocessed-test.pickle'),
    (val, 'preprocessed-val.pickle'),
    (train, 'preprocessed-train.pickle'),
):
    preprocess = pipeline.preprocess(split)
    # The prediction stage is still run for every split; only the preprocessed
    # items are written to disk.
    result = pipeline.predict(preprocess)
    dump_pickle(preprocess, filename)

