In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2


In [None]:
from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Highlight colours keyed by span label; both section labels share one colour.
colors = {label: 'red' for label in ('SECTION:DECISION', 'SECTION:HEAD')}
colors['KEYWORDS'] = 'blue'

# Renderer used further down to display a processed document.
render = DocRender(config={'colors': colors})

In [None]:
# Load the private dataset and carve it into train / validation / test subsets.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# The same fraction and seed are used for both cuts so the split is reproducible.
split_options = dict(test_size=0.2, random_state=22)
train, test = train_test_split(private, **split_options)
train, val = train_test_split(train, **split_options)

for split_label, split_items in (('train:', train), ('test:', test), ('val:', val)):
    print(split_label, len(split_items))

# Pipeline definition

In [None]:
%%export aymurai.spacy.components.es_ar.keywords
from copy import deepcopy
from functools import reduce

import spacy
from spacy.tokens import Span

from aymurai.utils.misc import get_element
from aymurai.spacy.utils import format_entity
from aymurai.meta.pipeline_interfaces import Transform
from aymurai.spacy.components.utils import filter_overlapping_matches
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset
from aymurai.spacy.components.regex import EnhancedRegexMatcher

# Fields for which keyword patterns are registered by SpacyRulerKeywords.
FIELDS = ["tipo_de_resolucion", "objeto_de_la_resolucion", "detalle", "decision"]

# Content of the "validation-fields" dataset, restricted to the fields listed above.
VALIDATION_FIELDS = {
    field: values
    for field, values in ArgentinaJuzgadoPCyF10Dataset("validation-fields").data.items()
    if field in FIELDS
}


class SpacyRulerKeywords(Transform):
    """Add regex-matched keyword entities to an item.

    One pattern group per entry of ``VALIDATION_FIELDS`` is registered on an
    ``EnhancedRegexMatcher``. When called, the transform runs the matcher on the
    item's text and appends every non-overlapping match that starts after the
    first ``KEYWORDS`` section to ``item["data"]["entities"]``.
    """

    def __init__(self):
        # The blank Spanish pipeline is stored in a module-level global rather than on
        # the instance. Because this statement sits inside a class body the name is
        # mangled to `_SpacyRulerKeywords__nlp`.
        # NOTE(review): presumably done to keep the nlp object off the instance
        # (e.g. for pickling/caching) - confirm before moving it to `self`.
        global __nlp
        __nlp = spacy.blank("es")
        self.matcher = EnhancedRegexMatcher(__nlp.vocab)

        for field, validations in VALIDATION_FIELDS.items():
            # Work on a copy: extending `validations` in place would grow the lists
            # stored in the module-level VALIDATION_FIELDS on every instantiation.
            patterns = list(validations)
            if field == "tipo_de_resolucion":
                patterns = [f"resoluci[oó]n_{v}" for v in patterns]

            # Also accept the variants where "_" is written as underscores/whitespace.
            patterns += [v.replace("_", r"[_\s]+") for v in patterns]

            self.matcher.add(field, patterns=patterns)

    def __call__(self, item):
        """Return a deep copy of ``item`` with keyword entities appended.

        The input item is never modified. ``item["data"]["entities"]`` is created
        on the copy when missing. The copy is returned without new entities when
        the item has no section spans or no ``KEYWORDS`` section.
        """
        item = deepcopy(item)
        if "entities" not in item["data"]:
            item["data"]["entities"] = []

        # skip if there is no section parser output
        if not (sections := get_element(item, ["data", "spans", "section"], [])):
            return item

        keyword_sections = sorted(
            (section for section in sections if section["label"] in ["KEYWORDS"]),
            key=lambda e: e["start"],
        )

        if not keyword_sections:
            return item

        # Start of the earliest KEYWORDS section.
        # NOTE(review): compared below against the match start, so this assumes both
        # use the same unit (token index vs. character offset) - TODO confirm.
        offset = keyword_sections[0]["start"]

        doc = __nlp(item["data"]["doc.text"])
        matches = self.matcher(doc)

        # keep only matches starting after the KEYWORDS section start, then
        # sort by score, length and position:
        # ascending summed score, longer spans first, earlier start first
        matches = [match for match in matches if match[1] > offset]
        matches.sort(key=lambda x: (sum(x[3]), -(x[2] - x[1]), x[1]))
        matches = list(filter_overlapping_matches(matches))

        for label, start, end, score in matches:
            span = Span(doc, start=start, end=end, label=label)
            item["data"]["entities"].append(format_entity(span))

        return item


In [None]:
import aymurai.spacy.components.loader
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import JunkCleaner, TextNormalize
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser
from aymurai.models.dummy.art_infringido import DummyExtractorArtInfringido
from aymurai.spacy.components.es_ar.art_infringido import SpacyRulerArtInfringido

# Pipeline configuration: text extraction, normalisation, junk removal, section
# parsing and keyword tagging all run as preprocess steps; no models or
# postprocess steps are configured.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            JunkCleaner,
            {
                # NOTE(review): if JunkCleaner treats patterns as regular expressions,
                # `N*`, `7*` and `.` are metacharacters here - confirm this is intended.
                "patterns": [
                    "Juzgado PCyF N* 10 - Tacuarí 138, 7* Piso - juzcyf10ejusbaires.gob.ar - 4014-6821/20 - Gipcyf10",
                ]
            },
        ),
        (
            AymuraiRulerSectionParser,
            {
                "base": "es",
                "breakpoints": {
                    "SECTION:DEVELOPMENT": [
                        "DESARROLLO",
                    ],
                    "SECTION:BACKGROUND": [
                        "ANTECEDENTES",
                        "ANTECEDENTES Y ARGUMENTOS",
                        "Antecedentes del caso",
                    ],
                    "SECTION:ARGUMENTS": [
                        "ARGUMENTOS",
                        "ANTECEDENTES Y ARGUMENTOS",
                        "CONSIDERO",
                    ],
                    "SECTION:DECISION": [
                        "DECID[EO]",
                        "RESUELV[EO]",
                    ],
                    "KEYWORDS": [
                        # raw string: `\s`, `\w` and `\d` are regex classes, not valid
                        # Python string escapes, so a plain literal triggers a warning
                        r"PALABRAS[_\s]+CLAVES?[\w\d\s_:]*",
                    ],
                },
            },
        ),
        (SpacyRulerKeywords, {}),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Quick smoke run on the first 40 training documents only.
train_sample = train[:40]
preprocess = pipeline.preprocess(train_sample)
result = pipeline.predict(preprocess)

In [None]:
import json
import random

# Document to inspect. Set EXAMPLE_INDEX to None to look at a random document
# instead (previously a random index was drawn and then always overwritten).
EXAMPLE_INDEX = 24
index = random.randrange(len(result)) if EXAMPLE_INDEX is None else EXAMPLE_INDEX
print(index)
registry = result[index]


print(registry['path'])
# Only scalar metadata is dumped; dict and list values (including subclasses) are skipped.
metadata = {k: v for k, v in registry['metadata'].items() if not isinstance(v, (dict, list))}
print(json.dumps(metadata, indent=4))


# One list per annotated field, with one entry per annotation of the document.
print('annotations')
for field_label, field_key in (
    ('art infingido:', 'art_infringido'),
    ('codigo:', 'codigo_o_ley'),
    ('conducta:', 'conducta'),
    ('conducta detalle:', 'conducta_descripcion'),
):
    print(field_label, [x[field_key] for x in registry['annotations']])
print('prediction')
# print(registry['predictions'])
# print('art_infringido:', registry['predictions']['records']['art_infringido'])
# print('conducta:', registry['predictions']['records']['conducta'])


print('\n-------\n')
render(registry)

# Evaluation

## train

In [None]:
# Run the whole training split through the preprocess steps, then through predict.
# Note: this overwrites `preprocess` and `result` from the earlier sample run.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
from itertools import zip_longest

import numpy as np
import pandas as pd
import seaborn as sns
from jiwer import cer
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem

# Placeholder entity: every entity attribute is present and set to None.
EMPTY_ENTITY = dict.fromkeys(
    ("text", "start", "end", "label", "start_char", "end_char")
)


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the reference table for one item.

    Args:
        item: mapping with a "path" entry and an "annotations" entry holding
            one record per annotation.

    Returns:
        One row per annotation with the columns ``path``, ``art_infringido``,
        ``conducta`` and ``codigo_o_ley``. An item without annotations yields
        an empty frame with those columns instead of raising ``KeyError``.
    """
    columns = ["path", "art_infringido", "conducta", "codigo_o_ley"]
    annots = item["annotations"]
    if not annots:
        # object dtype keeps a later pd.concat with the string columns dtype-stable
        return pd.DataFrame(columns=columns, dtype=object)

    df = pd.DataFrame(annots)
    df.insert(0, "path", item["path"])
    return df[columns]

def get_text(value):
    """Return ``value["text"]`` (or ``''`` when missing) for dicts; pass anything else through."""
    if not isinstance(value, dict):
        return value
    return value.get('text', '')

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the hypothesis table for one item.

    Args:
        item: mapping with a "path" entry and the predicted records under
            ``item["predictions"]["records"]`` (field name -> list of values).

    Returns:
        One row per predicted record with ``path`` first. Shorter record lists
        are padded with ``None`` so all columns have the same length. When there
        is nothing predicted (no fields, or only empty lists) a single row
        holding just ``path`` is returned.

    Raises:
        KeyError: if records are present but one of ``art_infringido``,
            ``conducta`` or ``codigo_o_ley`` is missing.
    """
    path = item["path"]
    records = item["predictions"]["records"]

    # default=0 covers an empty `records` mapping, where max() would otherwise raise
    max_ = max(map(len, records.values()), default=0)
    if not max_:
        return pd.DataFrame({"path": path}, index=pd.Index([0]))

    # pad into a new mapping so the item's own records are left untouched
    padded = {
        key: list(record) + [None] * (max_ - len(record))
        for key, record in records.items()
    }

    df = pd.DataFrame(padded)

    for column in ("art_infringido", "conducta", "codigo_o_ley"):
        df[column] = df[column].apply(get_text)

    df.insert(0, "path", path)

    return df


# Stack the per-item tables into one reference frame and one hypothesis frame.
references = pd.concat([annot_dataframe(item) for item in result], ignore_index=True)

# Missing predictions become empty strings.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).fillna('')

In [None]:
hypotheses

In [None]:
# Join references and hypotheses of the same file; shared columns get _ref / _hyp suffixes.
data = pd.merge(references, hypotheses, on='path', suffixes=('_ref', '_hyp'))

# Drop rows where any reference column is missing.
ref_columns = [col for col in data.columns if col.endswith('_ref')]
data = data.dropna(subset=ref_columns)


def _row_scorer(ref_col, hyp_col):
    """Return a row function computing the CER between two columns of that row."""
    return lambda row: cer(row[ref_col], row[hyp_col])


def _corpus_cer(ref_col, hyp_col):
    """CER computed over the two whole columns of `data`."""
    return cer(data[ref_col].tolist(), data[hyp_col].tolist())


# Per-row scores.
art_scores = data.apply(_row_scorer('art_infringido_ref', 'art_infringido_hyp'), axis=1)
cond_scores = data.apply(_row_scorer('conducta_ref', 'conducta_hyp'), axis=1)
codi_scores = data.apply(_row_scorer('codigo_o_ley_ref', 'codigo_o_ley_hyp'), axis=1)

# Scores over the whole split.
print('art_infringido cer:', _corpus_cer('art_infringido_ref', 'art_infringido_hyp'))
print('conducta cer:', _corpus_cer('conducta_ref', 'conducta_hyp'))
print('codigo_o_ley cer:', _corpus_cer('codigo_o_ley_ref', 'codigo_o_ley_hyp'))


In [None]:
data

In [None]:
# Look up one specific document in the test split by its path.
target_path = '/resources/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/2021/TOMO 38_JUNIO _21/3587_38 CAUSA 77325_21.pdf'
example = [item for item in test if item['path'] == target_path]
example

## test

In [None]:
# Run the held-out test split through the preprocess steps, then through predict.
# Note: this overwrites the `preprocess` and `result` computed for the training split.
preprocess = pipeline.preprocess(test)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from jiwer import cer
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the reference table for one item.

    NOTE(review): this redefines the function of the same name from the
    train-evaluation cell; consider defining it once and reusing it.

    Args:
        item: mapping with a 'path' entry and an 'annotations' entry holding
            one record per annotation.

    Returns:
        One row per annotation with the columns ``path``, ``art_infringido``,
        ``conducta`` and ``codigo_o_ley``. An item without annotations yields
        an empty frame with those columns instead of raising ``KeyError``.
    """
    columns = ['path', 'art_infringido', 'conducta', 'codigo_o_ley']
    annots = item['annotations']
    if not annots:
        # object dtype keeps a later pd.concat with the string columns dtype-stable
        return pd.DataFrame(columns=columns, dtype=object)

    df = pd.DataFrame(annots)
    df.insert(0, 'path', item['path'])
    return df[columns]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the hypothesis table for one item.

    Behaves like the version used for the training split: record lists of
    different lengths are padded with ``None``, values that are not dicts are
    passed through unchanged, and an item without predictions still produces
    one row (holding only ``path``) so it is not silently left out of the
    evaluation.

    NOTE(review): this redefines the function of the same name from the
    train-evaluation cell; consider defining it once and reusing it.

    Args:
        item: mapping with a 'path' entry and the predicted records under
            ``item['predictions']['records']`` (field name -> list of values).

    Returns:
        One row per predicted record, with ``path`` as the first column.

    Raises:
        KeyError: if records are present but one of ``art_infringido``,
            ``conducta`` or ``codigo_o_ley`` is missing.
    """
    def _text(value):
        # dict predictions carry their surface form under 'text'
        return value.get('text', '') if isinstance(value, dict) else value

    path = item['path']
    preds = item['predictions']['records']

    longest = max(map(len, preds.values()), default=0)
    if not longest:
        return pd.DataFrame({'path': path}, index=pd.Index([0]))

    # pad into a new mapping; the item's own records are left untouched
    padded = {
        key: list(values) + [None] * (longest - len(values))
        for key, values in preds.items()
    }
    df = pd.DataFrame(padded)
    for column in ('art_infringido', 'conducta', 'codigo_o_ley'):
        df[column] = df[column].apply(_text)
    df.insert(0, 'path', path)
    return df

# Stack the per-item tables into one reference frame and one hypothesis frame.
references = pd.concat([annot_dataframe(item) for item in result], ignore_index=True)

# Missing predictions become empty strings.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).fillna('')

In [None]:
# Join references and hypotheses of the same file; shared columns get _ref / _hyp suffixes.
data = pd.merge(references, hypotheses, on='path', suffixes=('_ref', '_hyp'))

# Same filtering as the train evaluation: rows with a missing reference value
# cannot be scored, so they are dropped before computing the CER.
ref_columns = [col for col in data.columns if col.endswith('_ref')]
data = data.dropna(subset=ref_columns)

# Per-row scores.
art_scores = data.apply(lambda row: cer(row['art_infringido_ref'], row['art_infringido_hyp']), axis=1)
cond_scores = data.apply(lambda row: cer(row['conducta_ref'], row['conducta_hyp']), axis=1)
codi_scores = data.apply(lambda row: cer(row['codigo_o_ley_ref'], row['codigo_o_ley_hyp']), axis=1)

# Scores over the whole split; codigo_o_ley is reported too, as for the train split.
print('art_infringido cer:', cer(data['art_infringido_ref'].tolist(), data['art_infringido_hyp'].tolist()))
print('conducta cer:', cer(data['conducta_ref'].tolist(), data['conducta_hyp'].tolist()))
print('codigo_o_ley cer:', cer(data['codigo_o_ley_ref'].tolist(), data['codigo_o_ley_hyp'].tolist()))
