In [None]:
# Enable IPython's autoreload extension.
%load_ext autoreload
# Load the project's notebook magics (presumably the source of the `%%export`
# cell magic used further down -- TODO confirm).
%load_ext aymurai.devtools.magic
# Autoreload mode 2: re-import all modules before running each cell, so edits
# to the package source are picked up without restarting the kernel.
%autoreload 2

# load public dataset

In [None]:
from aymurai.spacy.display import DocRender
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Renderer instance; it is called on a preprocessed item in the visualization
# cell at the end of the notebook.
render = DocRender()

In [None]:
from sklearn.model_selection import train_test_split

# Load both dataset variants through the project's dataset class.
dataset = ArgentinaJuzgadoPCyF10Dataset('latest', use_cache=True)
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Two chained 80/20 splits of the private data, both with the same fixed seed:
# first train/test, then train/val out of the remaining training portion.
split_options = {"test_size": 0.2, "random_state": 22}
train, test = train_test_split(private, **split_options)
train, val = train_test_split(train, **split_options)

# Keep only the items whose metadata carries a truthy 'frases_agresion' entry.
dataset = [item for item in dataset if item['metadata']['frases_agresion']]

# First ten filtered items, handy for quick experiments.
sample = dataset[:10]

# build pipeline

In [None]:
%%export aymurai.spacy.components.es_ar.quotes
import spacy
import pandas as pd
from spaczz.pipeline import SpaczzRuler
from more_itertools import unique_everseen

from aymurai.devtools import resolve_package_path

# Path of the quotes list that lives under the `aymurai.data.spanish` package.
QUOTES_BASEPATH = resolve_package_path("aymurai.data.spanish")
QUOTES_FILENAME = f"{QUOTES_BASEPATH}/violence-quotes.csv"
# export: start hide
# Rebuild the quotes file from the local pipe-separated source: take the first
# column, drop repeated phrases while keeping first-seen order, and write one
# phrase per row without index or header.
raw_quotes = pd.read_csv("frases_agresion.csv", header=None, sep="|")
quotes_db = pd.Series(list(unique_everseen(raw_quotes[0].values)))
quotes_db.to_csv(QUOTES_FILENAME, index=False, header=None)
# export: end hide

# Load the quote phrases, one per line.
# The encoding is pinned to UTF-8 so accented Spanish characters are decoded the
# same way on every platform; without it open() falls back to the locale's
# preferred encoding. Blank lines are skipped so they never turn into empty
# match patterns.
with open(QUOTES_FILENAME, "r", encoding="utf-8") as file:
    stripped_lines = (line.strip() for line in file)
    QUOTES_DB = [quote for quote in stripped_lines if quote]


def format_pattern(pattern: str) -> dict[str, object]:
    """Build the fuzzy-match rule dictionary for a single quote phrase.

    The `min_r2` value handed to the matcher depends on the phrase length:
    phrases with fewer than four words get 95, longer phrases get 85.

    Args:
        pattern: The phrase to match. It is stored in the rule unchanged.

    Returns:
        A dict with the keys "label", "type", "pattern" and "kwargs", where
        "kwargs" is itself a dict holding the chosen `min_r2`.
    """
    # Split on any run of whitespace: splitting on a single " " would count the
    # empty strings between consecutive spaces as words and wrongly push a short
    # phrase into the looser threshold.
    words = pattern.split()
    min_r2 = 95 if len(words) < 4 else 85
    return {
        "label": "AYMURAI_VIOLENCE_QUOTE",
        "type": "fuzzy",
        "pattern": pattern,
        "kwargs": {"min_r2": min_r2},
    }


@spacy.language.Language.factory("aymurai_violence_quotes_ruler")
def violence_quotes_ruler(nlp, name):
    """Factory registered as "aymurai_violence_quotes_ruler".

    Creates a SpaczzRuler for the given pipeline and loads it with one rule per
    phrase in QUOTES_DB, each built by format_pattern.
    """
    quotes_ruler = SpaczzRuler(nlp, name=name)
    rules = list(map(format_pattern, QUOTES_DB))
    quotes_ruler.add_patterns(rules)
    return quotes_ruler


In [None]:
# import aymurai.spacy.components.loader
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.pipeline.pipeline import AymurAIPipeline

# Each preprocess step is a (class, options) pair, listed in the order it is
# handed to the pipeline.
fulltext_step = (
    FulltextExtract,
    {"errors": "ignore", "extension": "odt", "use_cache": True},
)
normalize_step = (TextNormalize, {})
quotes_ruler_step = (
    SpacyRulerPipeline,
    {
        "base": "es",
        # The single ruler step is the factory registered above by name.
        "steps": [("aymurai_violence_quotes_ruler", {})],
    },
)

config = {
    "preprocess": [fulltext_step, normalize_step, quotes_ruler_step],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": True,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)


In [None]:
# Run the pipeline's preprocess stage over the filtered dataset; the result is
# indexed by position in the visualization cell.
preprocess = pipeline.preprocess(dataset)

# visualization

In [None]:
import json

# Position of the preprocessed document to inspect; change it to browse others.
REGISTRY_INDEX = 38

registry = preprocess[REGISTRY_INDEX]
quotes = [annotation['frases_agresion'] for annotation in registry['annotations']]
print(registry['path'])

# Only flat metadata values are printed; nested dicts/lists (including their
# subclasses) are left out to keep the dump short.
metadata = {
    key: value
    for key, value in registry['metadata'].items()
    if not isinstance(value, (dict, list))
}
# ensure_ascii=False keeps accented characters readable instead of \uXXXX
# escapes; default=str prints values json cannot encode natively instead of
# raising TypeError.
print(json.dumps(metadata, indent=4, ensure_ascii=False, default=str))
print('frases agresion:')
for quote in quotes:
    print(quote)

render(registry)
# print('\n-------\n')
# doc = nlp(registry['data']['doc.text'])
# doc = ruler(doc)

# spacy.displacy.render(doc, 'ent')