In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically (mode 2: reload all modules before executing each cell).
%load_ext autoreload
# Project-provided IPython extension (aymurai.devtools.magic).
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import ArgentinaJuzgadoPCyF10LabelStudioAnnotations

# Instantiate the annotations dataset from the 20221130-bis directory and keep
# its `.data` attribute as the working dataset.
# NOTE(review): absolute path is hard-coded; the notebook only runs where
# /resources/data/restricted is mounted.
dataset = ArgentinaJuzgadoPCyF10LabelStudioAnnotations('/resources/data/restricted/annotations/20221130-bis/').data

In [None]:
# Display the first record to inspect the structure of a dataset item.
dataset[0]

In [None]:
from aymurai.models.flair.utils import FlairTextNormalize
from aymurai.transforms.entities import FilterEntity
from aymurai.pipeline import AymurAIPipeline

# Preprocess-only pipeline configuration: no models and no postprocess steps.
config = {
    "preprocess": [
        (FlairTextNormalize, {}),
        # Presumably keeps only DECISION entities under the 'annotations'
        # field -- confirm against FilterEntity's implementation.
        (FilterEntity, {"enable": ["DECISION"], "field": "annotations"}),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    # Cache is turned off for this run; flip to True to enable it.
    "use_cache": False,
    # Optional key left disabled here: "log_level": "debug"
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the pipeline's preprocess stage over the whole dataset.
preprocessed = pipeline.preprocess(dataset)

In [None]:
import pandas as pd
from aymurai.utils.misc import get_element


def get_ent_info(entity: dict):
    """Extract the ``(text, subclass)`` pair of an annotated entity.

    Args:
        entity: entity mapping; must contain the key ``'text'``. The subclass
            is looked up with ``get_element`` at
            ``['attrs', 'aymurai_label_subclass', 0]``.

    Returns:
        Tuple ``(text, subclass)``. ``subclass`` is ``''`` whenever the lookup
        yields a falsy value.
    """
    span_text = entity['text']
    label_subclass = get_element(entity, ['attrs', 'aymurai_label_subclass', 0])
    if not label_subclass:
        # Normalise a missing/empty subclass to an empty string.
        label_subclass = ''
    return span_text, label_subclass

def gen_clases(item):
    """Build a sentence-level table for one preprocessed document.

    The document text (``item['data']['doc.text']``) is split into lines and
    every non-blank, stripped line becomes one row. A row is flagged as a
    decision when the line is a substring of the text of some entity in
    ``item['annotations']['entities']``; ``hace_lugar`` is True when the first
    such entity carries the subclass ``'hace_lugar'``.

    Args:
        item: mapping with the keys ``'path'``, ``'data'`` (holding
            ``'doc.text'``) and ``'annotations'`` (holding ``'entities'``).
            ``'metadata'`` values are read through ``get_element`` and fall
            back to ``''`` when the lookup yields a falsy value.

    Returns:
        pandas.DataFrame with the columns ``path``, ``nro_registro``, ``tomo``,
        ``sentence``, ``decision`` and ``hace_lugar`` (one row per line).

    Raises:
        KeyError: if ``'path'``, ``'data'``/``'doc.text'`` or
            ``'annotations'``/``'entities'`` is missing from ``item``.
    """
    # Raw string: the pattern is a literal backslash followed by a slash.
    # Writing it without the r-prefix is an invalid escape sequence, which
    # triggers a SyntaxWarning on Python 3.12+.
    path = item['path'].replace(r'\/', '/')
    nro_registro = get_element(item, ['metadata', 'nro_registro']) or ''
    tomo = get_element(item, ['metadata', 'tomo']) or ''
    doc = item['data']['doc.text'].strip()

    # (entity text, subclass) for every annotated entity of the document.
    decision_pairs = [get_ent_info(ent) for ent in item['annotations']['entities']]

    rows = []
    for line in doc.splitlines():
        sentence = line.strip()
        if not sentence:
            continue
        # The first entity whose text contains the sentence decides both flags.
        # NOTE(review): this is a substring test, so very short lines can match
        # unrelated decision text.
        match = next((pair for pair in decision_pairs if sentence in pair[0]), None)
        is_decision = match is not None
        hace_lugar = is_decision and match[1] == 'hace_lugar'
        rows.append((path, nro_registro, tomo, sentence, is_decision, hace_lugar))

    columns = ['path', 'nro_registro', 'tomo', 'sentence', 'decision', 'hace_lugar']
    return pd.DataFrame(rows, columns=columns)
    


In [None]:
# Smoke check: build and display the sentence table for a single document (index 2).
gen_clases(preprocessed[2])

In [None]:
# Stack the per-document sentence tables into one frame with a fresh 0..n-1 index.
data = pd.concat(list(map(gen_clases, preprocessed)), ignore_index=True)

In [None]:
# Summary counts: rows flagged as decision out of all rows, then how many of
# those decisions are also flagged hace_lugar.
print(f"decisiones: {len(data.query('decision'))} from {len(data)} sentences")
print(f"hace lugar: {len(data.query('decision and hace_lugar'))} from {len(data.query('decision'))} decisiones")

In [None]:
# Write the sentence table to a CSV file (relative path, so it lands in the
# current working directory) without the index column.
data.to_csv('sentences-decision.csv', index=False)