In [None]:
%load_ext autoreload
%load_ext aymurai.devtools
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Apply the Argentine Spanish (UTF-8) locale to every locale category.
SPANISH_AR_LOCALE = 'es_AR.UTF-8'
locale.setlocale(locale.LC_ALL, SPANISH_AR_LOCALE)

# Renderer instance kept at notebook level under the name `render`.
render = DocRender()

In [None]:
# Build the dataset object for the 'private' source, with caching enabled.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# First split holds out 20% as the test set; the second holds out 20% of the
# remaining training data for validation. The fixed random_state keeps both
# splits reproducible across runs.
train, test = train_test_split(private, random_state=22, test_size=0.2)
train, val = train_test_split(train, random_state=22, test_size=0.2)

# Report the size of each split.
for label, subset in (('train:', train), ('test:', test), ('val:', val)):
    print(label, len(subset))

# Pipeline definition

In [None]:
from copy import deepcopy

import pandas as pd

from aymurai.meta.types import DataItem
from aymurai.meta.pipeline_interfaces import Transform


class ViolenceDocCategoryParser(Transform):
    """Derive document-level violence labels from an item's annotations.

    For every name in ``categories`` the item receives a 0/1 flag under
    ``item['data']['doc-cats']``. A flag is 1 when any annotation row holds a
    truthy value in the column of that name, and 0 otherwise.
    """

    # Annotation columns that are collapsed into document-level labels.
    categories = [
        'v_fisica',
        'v_econ',
        'v_psic',
        'v_sex',
        'v_soc',
        'v_amb',
        'v_simb',
        'v_polit',
    ]

    def __call__(self, item: DataItem) -> DataItem:
        """Return a deep copy of ``item`` carrying the 'doc-cats' flags.

        The item passed in is left untouched; all writes go to the copy.
        """
        parsed = deepcopy(item)
        table = pd.DataFrame(parsed['annotations'])

        # Column-wise "any": one boolean per category, in `categories` order.
        flags = table[self.categories].any().to_list()

        parsed['data']['doc-cats'] = dict(zip(self.categories, map(int, flags)))
        return parsed

In [None]:
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Pipeline configuration handed to AymurAIPipeline below. Each stage is a list
# of (class, kwargs) pairs.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (ViolenceDocCategoryParser, {}),
    ],
    "models": [
        (
            SpacyModel,
            {
                "base_config": "multilabel-doc-classifier",
                "batch_size": 8,
                # Take the label set from the parser that produces the
                # 'doc-cats' targets so the two can never drift apart. The
                # list() copy keeps the class attribute safe from mutation.
                "categories": list(ViolenceDocCategoryParser.categories),
                "categorizer_pipe": 'textcat_multilabel'
            },
        )
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the pipeline's preprocess stage on the training and validation splits.
# The two calls stay separate so the training result is already bound if the
# validation call fails.
preprocessed_train = pipeline.preprocess(train)
preprocessed_val = pipeline.preprocess(val)


# Train model

In [None]:
# Fit the pipeline on the preprocessed training split; the preprocessed
# validation split is passed as the second positional argument.
pipeline.fit(preprocessed_train, preprocessed_val)

## Load from checkpoint

In [None]:
# pipeline.models[0].load('/resources/cache/spacy/model/eaa2de1ff75912a4960f465409cdaeb1/model-best')

# Prediction

In [None]:
# Predict on the first 100 preprocessed training items only.
pred_train = pipeline.predict(preprocessed_train[:100])

In [None]:
# Table of the 'doc-cats' labels stored on the first 100 preprocessed training
# items: one row per item, one column per label key.
y_true = pd.DataFrame(
    [item['data']['doc-cats'] for item in preprocessed_train[:100]]
)
y_true

In [None]:
# Table of the predicted 'doc-cats' mappings: one row per predicted item, one
# column per label key.
y_pred = pd.DataFrame(
    [item['prediction']['doc-cats'] for item in pred_train]
)
y_pred

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 confusion matrix per label, with predicted values binarised at 0.5.
# The metric compares the two tables position by position rather than by
# column name, so y_pred is reordered to y_true's columns first. A label
# missing from y_pred raises KeyError instead of silently shifting columns.
multilabel_confusion_matrix(y_true, y_pred[y_true.columns] > 0.5)

In [None]:
from aymurai.meta.types import DataBlock
from sklearn.metrics import ConfusionMatrixDisplay

class Asd(object):
    """Per-category confusion counts for multilabel document classification.

    NOTE(review): ``Asd`` looks like a placeholder name; it is kept unchanged
    so existing references keep working.

    Args:
        categories: labels to evaluate; each becomes one row of the result.
        threshold: a predicted value strictly greater than this counts as a
            positive prediction.
    """

    def __init__(self, categories: list[str], threshold: float = 0.5):
        self.categories = categories
        self.threshold = threshold

    def __call__(self, hypoteses: "DataBlock", reference: "DataBlock") -> pd.DataFrame:
        """Count tn/fp/fn/tp for every category.

        Args:
            hypoteses: items carrying predicted values under
                ``item['prediction']['doc-cats']``.
                NOTE(review): presumably the output of ``pipeline.predict``;
                confirm against the caller.
            reference: items carrying the expected 0/1 labels under
                ``item['data']['doc-cats']``, paired with ``hypoteses`` by
                position.

        Returns:
            A DataFrame indexed by category with the integer columns
            ``tn``, ``fp``, ``fn`` and ``tp``.

        Raises:
            ValueError: if the two inputs differ in length.
            KeyError: if an item lacks one of the configured categories.
        """
        # Materialise both inputs so their lengths can be compared, even when
        # they are passed as generators.
        hypoteses = list(hypoteses)
        reference = list(reference)
        if len(hypoteses) != len(reference):
            raise ValueError(
                'hypoteses and reference must have the same length '
                f'({len(hypoteses)} != {len(reference)})'
            )

        y_true = [x['data']['doc-cats'] for x in reference]
        y_pred = [x['prediction']['doc-cats'] for x in hypoteses]

        rows = []
        for cat in self.categories:
            tn = fp = fn = tp = 0
            for truth, scores in zip(y_true, y_pred):
                expected = bool(truth[cat])
                predicted = scores[cat] > self.threshold
                if predicted and expected:
                    tp += 1
                elif predicted:
                    fp += 1
                elif expected:
                    fn += 1
                else:
                    tn += 1
            rows.append([tn, fp, fn, tp])

        # Column order follows the row-major layout [[tn, fp], [fn, tp]] of a
        # binary confusion matrix.
        return pd.DataFrame(
            rows,
            index=list(self.categories),
            columns=['tn', 'fp', 'fn', 'tp'],
        )

In [None]:
import spacy

# Pick one preprocessed training item (index 1) for a one-off prediction.
item = preprocessed_train[1]
# doc = pipeline.models[0].nlp(preprocessed_train[0]['data']['doc.text'])
# predict() is called with a one-element list.
pipeline.predict([item])


# doc.cats