In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2


In [None]:
from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Colour assigned to each span label when a document is rendered.
colors = {'SECTION:DECISION': 'red', 'KEYWORDS': 'blue'}

# Renderer used further down to display a processed document.
render = DocRender(config=dict(colors=colors))

In [None]:
# Annotated dataset used for the train / val / test splits.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Second dataset, without the items whose 'objeto_de_la_resolucion' metadata
# mentions 'admisibilidad_prueba'.
docs = [
    doc
    for doc in ArgentinaJuzgadoPCyF10Dataset('private-docs', use_cache=True)
    if 'admisibilidad_prueba' not in doc['metadata']['objeto_de_la_resolucion']
]

# Hold out 20% for test, then 20% of the remainder for validation (fixed seed).
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the split sizes, one per line.
print(
    '\n'.join(
        f'{label} {len(split)}'
        for label, split in (('train:', train), ('test:', test), ('val:', val))
    )
)

# pipeline definition

## Dummy classifier

In [None]:
%%export aymurai.models.dummy.tipo_resolucion

from copy import deepcopy
from functools import reduce

import regex

from aymurai.utils.misc import get_element
from aymurai.meta.types import DataItem, DataBlock
from aymurai.meta.pipeline_interfaces import TrainModule


class DummyExtractorTipoResolucion(TrainModule):
    """Rule-based baseline that labels a document's resolution type.

    Every item is labelled ``"interlocutoria"`` unless the text of its
    head / decision / keywords sections matches the "resolución definitiva"
    pattern, in which case it is labelled ``"definitiva"``. The label is
    written to ``item["predictions"]["doc-cats"]["tipo_de_resolucion"]`` and,
    as a single-element list, to
    ``item["predictions"]["records"]["tipo_de_resolucion"]``.

    There is nothing to learn or persist, so ``fit``, ``save`` and ``load``
    are no-ops.
    """

    # Key under which the prediction is stored in "doc-cats" and "records".
    _FIELD = "tipo_de_resolucion"

    # Label used when no "definitiva" marker is found.
    _DEFAULT_LABEL = "interlocutoria"

    # Labels of the sections whose text is scanned.
    _SECTION_LABELS = ("SECTION:HEAD", "SECTION:DECISION", "KEYWORDS")

    # NOTE(review): in the `regex` module a fuzzy constraint binds to the item
    # that precedes it, so `{e<=2}` here appears to apply only to the final
    # "a" rather than to the whole phrase -- confirm the intended scope before
    # changing the pattern, since doing so alters the predictions.
    _DEFINITIVA_PATTERN = r"(?i)resoluci[oó]n[_\s]+definitiva{e<=2}"

    def save(self, path: str):
        """No-op: the extractor has no state to persist."""
        return

    def load(self, path: str):
        """No-op: the extractor has no state to restore."""
        return

    def fit(self, train: DataBlock, val: DataBlock):
        """No-op: the extractor is purely rule-based."""
        return

    def predict(self, data: DataBlock) -> DataBlock:
        """Return a new list with ``predict_single`` applied to every item."""
        return [self.predict_single(item) for item in data]

    def predict_single(self, item: DataItem) -> DataItem:
        """Return a deep copy of ``item`` with the resolution type predicted.

        The input item is left untouched. Missing ``predictions`` containers
        (``records``, ``entities``, ``doc-cats``) are created on the copy.
        """
        item = deepcopy(item)

        # Make sure the prediction containers exist.
        predictions = item.setdefault("predictions", {})
        records = predictions.setdefault("records", {})
        predictions.setdefault("entities", [])
        doc_cats = predictions.setdefault("doc-cats", {})

        # Default label, kept when no usable section text is available.
        doc_cats[self._FIELD] = self._DEFAULT_LABEL
        records[self._FIELD] = [self._DEFAULT_LABEL]

        # Skip if the section parser did not produce any span.
        if not (sections := get_element(item, ["data", "spans", "section"], [])):
            return item

        # Keep the relevant sections, in document order.
        sections = sorted(
            (span for span in sections if span["label"] in self._SECTION_LABELS),
            key=lambda span: span["start"],
        )
        if not sections:
            return item

        text = "".join(span["text"] for span in sections)

        tipo_resolucion = (
            "definitiva"
            if regex.findall(self._DEFINITIVA_PATTERN, text)
            else self._DEFAULT_LABEL
        )

        # Replace the default instead of appending to it, so the record holds
        # exactly one value and always agrees with the doc-level category.
        records[self._FIELD] = [tipo_resolucion]
        doc_cats[self._FIELD] = tipo_resolucion

        return item


In [None]:
# Quick check of the "definitiva" pattern on a sample text that contains the
# phrase "resolución definitiva".
# NOTE(review): `regex` is only imported inside the %%export cell above -- confirm
# that the magic also executes that cell in the notebook namespace.
text = ' Resumen : resolución definitiva que declara extinguida la sanción por prescripción ( art . 43 Código Contravencional , en adelante Co ) . Buenos Aires , M/ de agosto de 2018 . ANTECEDENTESSECTION:BACKGROUND ; El día 22 de octubre de '
definitiva = regex.findall(r"(?i)resoluci[oó]n[_\s]+definitiva{e<=2}", text)

In [None]:

# Show the list of matches returned by `regex.findall` for the sample text.
definitiva

In [None]:
import aymurai.spacy.components
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser
from aymurai.models.dummy.tipo_resolucion import DummyExtractorTipoResolucion

# Pipeline configuration. Each stage is a list of (class, kwargs) pairs:
# "preprocess" runs text extraction, normalization and section parsing,
# "models" holds the dummy resolution-type extractor, "postprocess" is empty.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            AymuraiRulerSectionParser,
            {
                "base": "es",
                # Section label -> patterns; presumably the parser uses them to
                # detect where each section starts (TODO confirm). Entries such
                # as "DECID[EO]" indicate they are regular expressions.
                "breakpoints": {
                    "SECTION:DEVELOPMENT": [
                        "DESARROLLO",
                    ],
                    'SECTION:BACKGROUND': [
                        "ANTECEDENTES",
                        "ANTECEDENTES Y ARGUMENTOS",
                        "Antecedentes del caso"
                    ],
                    "SECTION:ARGUMENTS": [
                        "ARGUMENTOS",
                        "ANTECEDENTES Y ARGUMENTOS",
                        'CONSIDERO'
                    ],
                    "SECTION:DECISION": [
                        "DECID[EO]",
                        "RESUELV[EO]",
                    ],
                    'KEYWORDS': [
                        # Raw string: `\s`, `\w` and `\d` are regex classes, not
                        # Python escapes. The runtime value is unchanged, but the
                        # invalid-escape-sequence warning goes away.
                        r"PALABRAS[_\s]+CLAVES?[\w\d\s_:]*",
                    ]
                },
            },
        ),
    ],
    "models": [
        (DummyExtractorTipoResolucion, {})
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the preprocessing stage on the training split, then the model stage on
# its output; `result` is what the inspection cell below reads.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
import json
import random
import pandas as pd

# Keep only the documents whose gold annotations include at least one
# 'definitiva' resolution.
definitiva = [
    item
    for item in result
    if any(annot['tipo_de_resolucion'] == 'definitiva' for annot in item['annotations'])
]
if not definitiva:
    raise ValueError("no document in `result` is annotated as 'definitiva'")

# Inspect the 4th such document, falling back to the last one when fewer than
# four exist (a bare `definitiva[3]` would raise IndexError in that case).
# To inspect a random document instead:
# registry = result[random.choice(range(len(result)))]
registry = definitiva[min(3, len(definitiva) - 1)]

# Scalar metadata only; nested dicts / lists are left out of the dump.
metadata = {k: v for k, v in registry['metadata'].items() if type(v) not in [dict, list]}
print(json.dumps(metadata, indent=4))

annotations = registry['annotations']
print('annotations')
display(pd.DataFrame(annotations, index=pd.Index(range(len(annotations)))))

# Every access to the predictions is guarded by the same check, so an item
# without predictions no longer fails halfway through the cell.
has_predictions = 'predictions' in registry

print('predictions')
if has_predictions:
    doc_cats = registry['predictions']['doc-cats']
    pred_cats = {
        k: v
        for k, v in doc_cats.items()
        if type(v) not in [dict, list]
    }
    display(pd.DataFrame(pred_cats, index=pd.Index([0])).T)


# Side-by-side summary of the gold and predicted resolution type.
print('annotations')
print('decision:', [x['tipo_de_resolucion'] for x in registry['annotations']])
print('prediction')
if has_predictions:
    print(doc_cats)
    print('decision:', doc_cats['tipo_de_resolucion'])


print('\n-------\n')
# render(registry, 'span', spans_key='section')
render(registry)

# Evaluation

## train

In [None]:
# Preprocess and predict on the training split; `result` feeds the
# classification report computed in the next cell.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the annotated resolution types of one item.

    Returns one row per entry of ``item['annotations']`` with two columns:
    the item's ``path`` and the entry's ``tipo_de_resolucion``.
    """
    source_path = item['path']
    frame = pd.DataFrame(item['annotations'])
    frame.insert(loc=0, column='path', value=source_path)
    return frame.loc[:, ['path', 'tipo_de_resolucion']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the document-level predictions of one item.

    Returns a single-row frame holding the item's ``path`` followed by one
    column per key of ``item['predictions']['doc-cats']``.
    """
    source_path = item['path']
    frame = pd.DataFrame([item['predictions']['doc-cats']])
    frame.insert(loc=0, column='path', value=source_path)
    return frame

# Gold labels: one row per annotation, keyed by document path.
references = pd.concat(
    [annot_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'reference'})

# Predicted labels: one row per item.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'hypothesis'})

# Pair gold and predicted labels by path and score them.
df = references.merge(hypotheses, on='path')
report = classification_report(y_true=df['reference'], y_pred=df['hypothesis'])
print(report)

## test

In [None]:
# Preprocess and predict on the held-out test split; `result` feeds the
# classification report computed in the next cell.
preprocess = pipeline.preprocess(test)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the annotated resolution types of one item.

    Returns one row per entry of ``item['annotations']`` with two columns:
    the item's ``path`` and the entry's ``tipo_de_resolucion``.
    """
    source_path = item['path']
    frame = pd.DataFrame(item['annotations'])
    frame.insert(loc=0, column='path', value=source_path)
    return frame.loc[:, ['path', 'tipo_de_resolucion']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the document-level predictions of one item.

    Returns a single-row frame holding the item's ``path`` followed by one
    column per key of ``item['predictions']['doc-cats']``.
    """
    source_path = item['path']
    frame = pd.DataFrame([item['predictions']['doc-cats']])
    frame.insert(loc=0, column='path', value=source_path)
    return frame

# Gold labels: one row per annotation, keyed by document path.
references = pd.concat(
    [annot_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'reference'})

# Predicted labels: one row per item.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'hypothesis'})

# Pair gold and predicted labels by path and flag the rows that agree.
df = references.merge(hypotheses, on='path')
df['exact_match'] = df['reference'].eq(df['hypothesis'])

report = classification_report(y_true=df['reference'], y_pred=df['hypothesis'])
print(report)

## docs

In [None]:
# Preprocess and predict on the filtered `docs` list; `result` feeds the
# classification report computed in the next cell.
preprocess = pipeline.preprocess(docs)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the annotated resolution types of one item.

    Returns one row per entry of ``item['annotations']`` with two columns:
    the item's ``path`` and the entry's ``tipo_de_resolucion``.
    """
    source_path = item['path']
    frame = pd.DataFrame(item['annotations'])
    frame.insert(loc=0, column='path', value=source_path)
    return frame.loc[:, ['path', 'tipo_de_resolucion']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Tabulate the document-level predictions of one item.

    Returns a single-row frame holding the item's ``path`` followed by one
    column per key of ``item['predictions']['doc-cats']``.
    """
    source_path = item['path']
    frame = pd.DataFrame([item['predictions']['doc-cats']])
    frame.insert(loc=0, column='path', value=source_path)
    return frame

# Gold labels: one row per annotation, keyed by document path.
references = pd.concat(
    [annot_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'reference'})

# Predicted labels: one row per item.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).rename(columns={'tipo_de_resolucion': 'hypothesis'})

# Pair gold and predicted labels by path and flag the rows that agree.
df = references.merge(hypotheses, on='path')
df['exact_match'] = df['reference'].eq(df['hypothesis'])

report = classification_report(y_true=df['reference'], y_pred=df['hypothesis'])
print(report)