In [None]:
# Load IPython's autoreload extension; `%autoreload 2` below switches it to
# the mode that reloads imported modules before each cell is executed.
%load_ext autoreload
# Load the IPython extension provided by `aymurai.devtools.magic`
# (NOTE(review): what it registers is not visible from this notebook).
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Set every locale category to the Argentine Spanish UTF-8 locale.
locale.setlocale(category=locale.LC_ALL, locale="es_AR.UTF-8")

# Renderer instance, called further down to display a processed item.
render = DocRender()

In [None]:
# Instantiate the 'private' dataset with caching enabled.
private = ArgentinaJuzgadoPCyF10Dataset("private", use_cache=True)

# Hold out 20% for test, then 20% of what is left for validation.
# Both splits use the same fixed seed so the partition is repeatable.
train, test = train_test_split(
    private,
    random_state=22,
    test_size=0.2,
)
train, val = train_test_split(
    train,
    random_state=22,
    test_size=0.2,
)

# Report the size of each split.
print(f"train: {len(train)}")
print(f"test: {len(test)}")
print(f"val: {len(val)}")

In [None]:
import aymurai.spacy.components
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Pipeline configuration: three preprocess steps, no models and no
# postprocess steps. Each step is a `(class, keyword-arguments)` pair.
config = dict(
    preprocess=[
        (
            FulltextExtract,
            dict(
                extension="pdf",
                method="tesseract",
                language="spa",
                errors="ignore",
                use_cache=True,
            ),
        ),
        (TextNormalize, dict()),
        (
            SpacyRulerPipeline,
            dict(
                base="es",
                steps=[("aymurai_violence_quotes_ruler", dict())],
            ),
        ),
    ],
    models=[],
    postprocess=[],
    multiprocessing=dict(),
    use_cache=True,
    # log_level="debug",  # left disabled, as in the original configuration
)

pipeline = AymurAIPipeline(config)

In [None]:
# Run the configured preprocess steps over the whole dataset; the cells
# below filter this result for violence-quote entities.
preprocessed = pipeline.preprocess(private)

# Later cells read `preprocessed_train` and `preprocessed_val`, which were
# never assigned anywhere in the notebook (NameError on a fresh kernel).
# Build them here from the `train` / `val` splits created above.
# NOTE(review): this repeats work already done for `private`; presumably the
# `use_cache` settings make the repeated calls cheap — confirm.
preprocessed_train = pipeline.preprocess(train)
preprocessed_val = pipeline.preprocess(val)


In [None]:
def have_quotes_entities(item) -> bool:
    """Tell whether ``item`` carries at least one violence-quote entity.

    Args:
        item: Record whose entities are looked up under
            ``item['data']['entities']``; every entity is indexed by
            ``'label'``.

    Returns:
        ``False`` when the ``'data'`` or ``'entities'`` key is absent,
        otherwise whether any entity label equals
        ``'AYMURAI_VIOLENCE_QUOTE'``.

    Raises:
        KeyError: If any entity has no ``'label'`` key.
    """
    if 'data' not in item or 'entities' not in item['data']:
        return False

    # All labels are collected before the membership test, so a malformed
    # entity raises even when an earlier entity already matches.
    entity_labels = [entity['label'] for entity in item['data']['entities']]
    return 'AYMURAI_VIOLENCE_QUOTE' in entity_labels
    
# Keep only the preprocessed items for which `have_quotes_entities` is true.
with_quotes = [item for item in preprocessed if have_quotes_entities(item)]

In [None]:
# Number of preprocessed items that passed the `have_quotes_entities` filter.
len(with_quotes)

In [None]:
import spacy
# Blank Spanish spaCy pipeline, used here only to turn text into a Doc.
nlp = spacy.blank("es")

# Build a Doc from the text of the first item that has a violence quote.
doc = nlp(with_quotes[0]["data"]["doc.text"])

In [None]:

# Store a single span (tokens 30 up to, but not including, 50) under the
# 'sc' span key so the span visualizer has something to draw.
# NOTE(review): the 30:50 range is hard-coded and not derived from the
# detected entities — confirm it is only a display placeholder.
doc.spans['sc'] = [doc[30:50]]

In [None]:
from spacy import displacy
# Draw the Doc with displaCy's span visualizer.
displacy.render(doc, style="span")

In [None]:
# Preview only the first few matches instead of echoing the whole list:
# each item carries its document text (`item['data']['doc.text']` is read
# in an earlier cell), so displaying every match floods the cell output.
with_quotes[:3]

In [None]:
import srsly

# Pick one preprocessed training item to inspect.
# NOTE(review): `preprocessed_train` must have been assigned by an earlier cell.
registry = preprocessed_train[6]

# Metadata entries whose value is neither a dict nor a list (exact type match).
metadata = {
    key: value
    for key, value in registry["metadata"].items()
    if type(value) not in (dict, list)
}
# `metadata` is only used by this disabled alternative (needs `import json`):
# print(json.dumps(metadata, indent=4))

# Dump the whole item as YAML text.
registry_yaml = srsly.yaml_dumps(
    registry,
    indent_mapping=4,
    indent_sequence=6,
    indent_offset=4,
)
print(registry_yaml)

# Show the item with the renderer created at the top of the notebook.
render(registry)


In [None]:
# Fit the pipeline on the preprocessed training and validation items.
# NOTE(review): the configuration passed to the pipeline has an empty
# "models" list, so it is unclear what this call trains — confirm.
# Earlier variant that captured the return value:
# predict_train, predict_val = pipeline.fit(preprocessed_train, preprocessed_val)
pipeline.fit(preprocessed_train, preprocessed_val)

In [None]:
# Display the pipeline's `models` attribute.
pipeline.models

In [None]:
import spacy
# Run the first pipeline model's `nlp` object on the first training text.
# NOTE(review): the configuration declares an empty "models" list, so
# `pipeline.models[0]` may raise IndexError — confirm a model is registered.
doc = pipeline.models[0].nlp(preprocessed_train[0]["data"]["doc.text"])

# Draw the resulting Doc with displaCy's entity visualizer.
spacy.displacy.render(doc, style="ent")