In [None]:
# Load IPython's autoreload extension so edited project modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Load the project's own magics (presumably the source of the `%%export` cell
# magic that appears commented out further down — TODO confirm).
%load_ext aymurai.devtools.magic
# Mode 2: reload every imported module before each cell execution.
%autoreload 2


In [None]:
from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Highlight colours keyed by entity/span label; both section labels share red.
colors = {
    "SECTION:DECISION": "red",
    "SECTION:HEAD": "red",
    "KEYWORDS": "blue",
}
# Renderer used by the inspection cells to display processed documents.
render = DocRender(config={"colors": colors})

In [None]:
# Annotated dataset used for the train/val/test splits.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Raw documents, excluding resolutions whose object mentions 'admisibilidad_prueba'.
docs = [
    doc
    for doc in ArgentinaJuzgadoPCyF10Dataset('private-docs', use_cache=True)
    if 'admisibilidad_prueba' not in doc['metadata']['objeto_de_la_resolucion']
]

# Two successive 80/20 splits with a fixed seed: first hold out the test set,
# then carve the validation set out of what remains.
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

print(f'train: {len(train)}')
print(f'test: {len(test)}')
print(f'val: {len(val)}')

# pipeline definition

In [None]:
# # %%export aymurai.spacy.components.es_ar.art_infringido
# import re
# import hashlib
# from copy import deepcopy
# 
# import spacy
# from spacy.tokens import Span
# from more_itertools import unique_everseen
# 
# from aymurai.spacy.utils import format_entity
# from aymurai.meta.pipeline_interfaces import Transform
# from aymurai.spacy.components.regex import EnhancedRegexRuler, EnhancedRegexMatcher
# from aymurai.spacy.components.es_ar.articles.patterns import (
#     ABBRS,
#     CODES,
#     ART_PATTERN_MULTI_MOD,
#     ART_PATTERN_MULTI_PREFIX,
# )
# 
# 
# class SpacyRulerArtInfringido(Transform):
#     def __init__(self):
#         global __nlp
#         __nlp = spacy.blank("es")
#         self.matcher_layer_0 = EnhancedRegexMatcher(__nlp.vocab)
#         self.matcher_layer_1 = EnhancedRegexMatcher(__nlp.vocab)
#         self.matcher_layer_0.add(
#             "LAYER_0", patterns=[f"{self.LAYER0_PREFIX}.*?{self.LAYER0_SUFFIX}"]
#         )
#         self.matcher_layer_1.add(
#             "ART_INFRINGIDO",
#             patterns=[
#                 r"[\d\.]{2,}",
#                 ART_PATTERN_MULTI_PREFIX,
#                 ART_PATTERN_MULTI_MOD,
#             ],
#         )
#         # self.matcher_layer_1.add(
#         #     "CONDUCTA1",
#         #     patterns=[
#         #         f"{self.PAT_COND1_PREFIX}.*?{self.PAT_COND1_SUFFIX}",
#         #     ],
#         # )
#         self.matcher_layer_1.add("CODIGO_O_LEY", patterns=CODES)
#         self.matcher_layer_1.add(
#             "CODIGO_O_LEY", patterns=[f"{abbr}(\s?CABA)?" for abbr in ABBRS]
#         )
#         self.matcher_layer_1.add(
#             "CODIGO_O_LEY", patterns=["(?i)ley(es)?(( y|,)? ([\d\.]+))+"]
#         )
# 
#     def clean_span(self, span):
#         match span.label_:
#             case "CONDUCTA":
#                 pre = __nlp.make_doc(re.sub(f"^{self.PAT_COND1_PREFIX}", "", span.text))
#                 post = __nlp.make_doc(
#                     re.sub(f"{self.PAT_COND1_SUFFIX}$", "", span.text)
#                 )
#                 start = len(span) - len(pre)
#                 end = len(span) - len(post)
#                 span = Span(
#                     span.doc,
#                     start=span.start + start,
#                     end=span.end - end,
#                     label="CONDUCTA",
#                 )
# 
#         return span
# 
#     def __call__(self, item):
#         item = deepcopy(item)
#         if not "entities" in item["data"]:
#             item["data"]["entities"] = []
# 
#         fragment = item["data"]["doc.text"][:700]
#         doc = __nlp(fragment)
# 
#         matches = []
#         matches_layer_0 = self.matcher_layer_0(doc)
#         for label0, start0, end0, score0 in matches_layer_0:
#             span = doc[start0:end0]
#             doc0 = span.as_doc()
#             
#             matches_layer_1 = self.matcher_layer_1(doc0)
#             for label1, start1, end1, score1 in matches_layer_1:
#                 start1 += start0
#                 end1 += start0
#                 matches += [(label1, start1, end1, score1)]
#                 if label1 == "ART_INFRINGIDO":
#                     matches += [
#                         ("CONDUCTA", end1 + 1, start0 + len(doc0) + 1, (0, 0, 0))
#                     ]
# 
#         matches = sorted(matches, key=lambda x: (sum(x[3]), x[1]))
#         matches = unique_everseen(matches, key=lambda x: x[0])
#         matches = list(matches)
# 
#         for label, start, end, score in matches:
#             span = Span(doc, start=start, end=end, label=label)
#             span = self.clean_span(span)
#             item["data"]["entities"] += [format_entity(span)]
# 
#         # item["data"]["doc.text"] = fulltext
# 
#         # item["data"]["entities"] = [
#         #     self.clean_art(ent) for ent in item["data"]["entities"]
#         # ]
# 
#         return item

In [None]:
import re
import hashlib
from copy import deepcopy
from itertools import groupby, filterfalse, zip_longest

import regex
import spacy
import srsly
import pandas as pd
from spacy.tokens import Span
from more_itertools import flatten, collapse, unique_everseen

from aymurai.meta.types import DataItem
from aymurai.spacy.utils import format_entity
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.meta.pipeline_interfaces import Transform
from aymurai.spacy.components.regex import EnhancedRegexRuler, EnhancedRegexMatcher
from aymurai.spacy.components.es_ar.articles.patterns import (
    ABBRS,
    CODES,
    ART_CODE,
    ART_PREFIX,
    ART_PREFIX_TITLE,
    ART_PATTERN_MULTI_MOD,
    ART_PATTERN_MULTI_PREFIX,
    ART_PATTERN_MULTI_NO_PREFIX,
    ART_PATTERN_MULTI_PREFIX_TITLE,
)

# Validation table read from the working directory; column names are
# lower-cased so they can be looked up by their snake_case field names.
VALIDATION_FIELDS = pd.read_csv("validacion_codigos.csv").rename(columns=str.lower)


class SpacyRulerArtInfringidoValFields(Transform):
    """Regex-based tagger for article mentions and validation-field terms.

    The transform wraps a ``SpacyRulerPipeline`` with a single
    ``enhanced_regex_ruler`` step configured with:

    * ``ARTICLE``: the article patterns imported from
      ``aymurai.spacy.components.es_ar.articles.patterns``;
    * one label per column listed in ``FIELDS`` (the upper-cased column name),
      built from that column's string values as case-insensitive fuzzy
      patterns.

    After the ruler has run, every ``ARTICLE`` entity goes through
    ``postprocess``, which strips its leading article prefix.
    """

    # Validation columns that are turned into ruler patterns.
    FIELDS = ("conducta", "conducta_descripcion")

    # Fraction of a term's length tolerated as fuzzy-match errors.
    MAX_ERROR_RATIO = 0.2

    # Snapshot of the validation table taken when the class is defined. The
    # notebook later rebinds the module-level ``VALIDATION_FIELDS`` name to a
    # different table, and instances are created after that happens; reading
    # the global at instantiation time would silently pick up the wrong table.
    _VALIDATION_TABLE = VALIDATION_FIELDS

    # Leading article prefix (either form), plus an optional dot and trailing
    # whitespace. The alternatives are grouped so that the anchor and the
    # optional suffix apply to both of them.
    _ART_PREFIX_RE = regex.compile(
        rf"(?i)^(?:(?:{ART_PREFIX_TITLE})|(?:{ART_PREFIX}))\.?\s*"
    )

    def __init__(self):
        self.VALIDATION_FIELDS = {
            k: v for k, v in self._VALIDATION_TABLE.items() if k in self.FIELDS
        }

        patterns = {
            "ARTICLE": [
                ART_PATTERN_MULTI_PREFIX_TITLE,
                ART_PATTERN_MULTI_NO_PREFIX,
                ART_PATTERN_MULTI_PREFIX,
                ART_PATTERN_MULTI_MOD,
            ],
        }

        for field, validations in self.VALIDATION_FIELDS.items():
            # Keep non-blank string cells only (empty CSV cells are not str),
            # dropping duplicates while preserving their order.
            terms = dict.fromkeys(
                value
                for value in validations
                if isinstance(value, str) and value.strip()
            )
            patterns[field.upper()] = [
                self._fuzzy_pattern(term, self.MAX_ERROR_RATIO) for term in terms
            ]

        self.ruler = SpacyRulerPipeline(
            base="es",
            steps=[
                (
                    "enhanced_regex_ruler",
                    {
                        "patterns": patterns,
                    },
                ),
            ],
        )

    @staticmethod
    def _fuzzy_pattern(term: str, max_error_ratio: float = 0.2) -> str:
        """Build a case-insensitive fuzzy regex for one validation term.

        Args:
            term: literal validation value; underscores stand for separators.
            max_error_ratio: fraction of ``len(term)`` allowed as errors.

        Returns:
            A pattern for the ``regex`` module of the form
            ``(?i)(?:<term>){e<=N}``.
        """
        # The budget is based on the raw term, not on the expanded pattern,
        # so the separator expansion below does not inflate it.
        budget = int(max_error_ratio * len(term))
        # Escape the literal first, then let "_" match underscores/whitespace.
        body = re.escape(term).replace("_", r"[_\s]+")
        # A fuzzy quantifier only applies to the item right before it; the
        # non-capturing group makes it cover the whole term.
        return f"(?i)(?:{body}){{e<={budget}}}"

    def postprocess(self, entity):
        """Strip the leading article prefix from an ``ARTICLE`` entity.

        Args:
            entity: entity dict with at least ``text``, ``start`` and
                ``start_char`` keys.

        Returns:
            A one-element list holding a modified copy of ``entity``. When the
            text starts with an article prefix, ``start`` is advanced by the
            number of tokens in the prefix, ``start_char`` by its character
            length, and ``text`` becomes the stripped remainder. The input
            dict is never modified.
        """
        entity = deepcopy(entity)
        text = entity["text"]

        match = self._ART_PREFIX_RE.match(text)
        if match:
            prefix = match[0]
            # Token offset: number of tokens the blank tokenizer finds in the prefix.
            entity["start"] += len(self.ruler.nlp.make_doc(prefix))
            entity["start_char"] += len(prefix)
            # Remove the leading prefix only; later occurrences stay in place
            # so the text remains consistent with the shifted offsets.
            entity["text"] = text[match.end():].strip()

        return [entity]

    def __call__(self, item: DataItem) -> DataItem:
        """Run the ruler on a copy of ``item`` and post-process its articles.

        Args:
            item: data item; entities are read from ``item["data"]["entities"]``
                after the ruler has run.

        Returns:
            The processed copy, with ``item["data"]["entities"]`` sorted by
            token ``start``.
        """
        item = self.ruler(deepcopy(item))

        entities = []
        if "entities" in item["data"]:
            entities += item["data"]["entities"]

        # Separate the articles from the rest so only they are post-processed.
        articles = [ent for ent in entities if ent["label"] == "ARTICLE"]
        others = [ent for ent in entities if ent["label"] != "ARTICLE"]

        processed = [
            cleaned for article in articles for cleaned in self.postprocess(article)
        ]

        item["data"]["entities"] = sorted(
            others + processed, key=lambda ent: ent["start"]
        )
        return item

In [None]:
# asd = SpacyRulerArtInfringidoValFields()
# a = asd(registry)
# render(a)

## Dummy classifier

In [None]:
# %%export aymurai.models.dummy.art_infringido

from copy import deepcopy
from functools import reduce

import regex
import srsly
from more_itertools import zip_offset

from aymurai.utils.misc import get_element
from aymurai.meta.types import DataItem, DataBlock
from aymurai.meta.pipeline_interfaces import TrainModule

# Load the 'validation-fields' dataset (cache disabled) and keep only its
# 'article_group' entry as a DataFrame.
# NOTE(review): this rebinds the module-level VALIDATION_FIELDS name that an
# earlier cell fills from "validacion_codigos.csv"; any code that reads the
# global after this cell runs sees this table instead.
VALIDATION_FIELDS = ArgentinaJuzgadoPCyF10Dataset("validation-fields", use_cache=False).data
VALIDATION_FIELDS = pd.DataFrame(VALIDATION_FIELDS['article_group'])


class DummyExtractorArtInfringido(TrainModule):
    """Placeholder model that only prepares the prediction containers.

    There is nothing to train or persist: ``save``, ``load`` and ``fit`` do
    nothing, and ``predict`` returns copies of the inputs whose
    ``predictions`` structure is initialised with empty record lists.
    """

    def __init__(self):
        # Article column of the module-level validation table.
        self.articles = VALIDATION_FIELDS["art_infringido"]

    def save(self, path: str):
        """No-op: the model has no state to persist."""

    def load(self, path: str):
        """No-op: the model has no state to restore."""

    def fit(self, train: DataBlock, val: DataBlock):
        """No-op: the model has nothing to learn."""

    def predict(self, data: DataBlock) -> DataBlock:
        """Apply ``predict_single`` to every item and return the results as a list."""
        return list(map(self.predict_single, data))

    def set_attrs(self, entity):
        """Return a deep copy of ``entity``; no attributes are set yet."""
        return deepcopy(entity)

    def predict_single(self, item: DataItem) -> DataItem:
        """Return a copy of ``item`` with initialised prediction containers.

        Missing ``records``, ``entities`` and ``doc-cats`` containers are
        created under ``item["predictions"]``; the ``art_infringido``,
        ``conducta`` and ``codigo_o_ley`` record lists are always reset to
        empty lists.
        """
        item = deepcopy(item)

        # Create whichever prediction containers are still missing.
        predictions = item.setdefault("predictions", {})
        records = predictions.setdefault("records", {})
        predictions.setdefault("entities", [])
        predictions.setdefault("doc-cats", {})

        # The record lists always start from scratch.
        for field in ("art_infringido", "conducta", "codigo_o_ley"):
            records[field] = []

        # The entities found by the preprocessing steps are read here, but
        # the mapping from entities to records is not implemented yet, so
        # the result is the same whether or not any entity is present.
        entities = []
        if "entities" in item["data"]:
            entities.extend(item["data"]["entities"])

        return item

In [None]:
import aymurai.spacy.components.loader
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import JunkCleaner, TextNormalize
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser

# Pipeline configuration: each preprocessing step is a (class, kwargs) pair,
# listed in the order extract -> normalize -> clean -> rule-based tagging.
config = dict(
    preprocess=[
        (
            FulltextExtract,
            dict(
                # "extension": "pdf",
                # "method": "tesseract",
                # "language": "spa",
                errors="ignore",
                use_cache=True,
            ),
        ),
        (TextNormalize, dict()),
        (
            JunkCleaner,
            dict(
                patterns=[
                    "Juzgado PCyF N* 10 - Tacuarí 138, 7* Piso - juzcyf10ejusbaires.gob.ar - 4014-6821/20 - Gipcyf10",
                ],
            ),
        ),
        (SpacyRulerArtInfringidoValFields, dict()),
    ],
    # No model is enabled at the moment.
    models=[
        # (DummyExtractorArtInfringido, {})
    ],
    postprocess=[],
    multiprocessing=dict(),
    use_cache=False,
    # 'log_level': 'debug'
)

pipeline = AymurAIPipeline(config)

In [None]:
# Quick sample run: send the first 40 training items through the
# preprocessing steps and then through prediction.
preprocess = pipeline.preprocess(train[:40])
result = pipeline.predict(preprocess)

In [None]:
import json
import random

# Pick one processed document at random. The index is printed so the same
# document can be revisited by assigning `index` a fixed value instead.
index = random.randrange(len(result))
print(index)
registry = result[index]


# Show only the scalar metadata values; nested dicts/lists are skipped.
metadata = {
    k: v for k, v in registry["metadata"].items() if not isinstance(v, (dict, list))
}
# ensure_ascii=False keeps accented characters readable, and default=str
# prevents a TypeError on values json cannot encode natively.
print(json.dumps(metadata, indent=4, ensure_ascii=False, default=str))


print("annotations")
print("art infringido:", [x["art_infringido"] for x in registry["annotations"]])
print("codigo:", [x["codigo_o_ley"] for x in registry["annotations"]])
print("conducta:", [x["conducta"] for x in registry["annotations"]])
print("conducta detalle:", [x["conducta_descripcion"] for x in registry["annotations"]])
# print('prediction')
# print(registry['predictions'])
# print('art_infringido:', registry['predictions']['records']['art_infringido'])
# print('conducta:', registry['predictions']['records']['conducta'])


print("\n-------\n")
# render(registry, 'span', spans_key='section', paragraphs=15)
render(registry, "ent")


# Evaluation

## train

In [None]:
# Run the whole training split through preprocessing and prediction.
# This rebinds `preprocess` and `result`, replacing the sample run's values.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
from itertools import zip_longest

import numpy as np
import pandas as pd
import seaborn as sns
from jiwer import cer
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem

# Template entity: every expected field is present and set to None.
EMPTY_ENTITY = dict.fromkeys(
    ("text", "start", "end", "label", "start_char", "end_char")
)


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the reference table for one item.

    Args:
        item: data item with a ``path`` and a list of annotation dicts under
            ``annotations``.

    Returns:
        One row per annotation with the columns ``path``, ``art_infringido``,
        ``conducta`` and ``codigo_o_ley``. Fields missing from the annotations
        come back as missing values, and an item without annotations yields
        an empty frame with the same columns.
    """
    fields = ["art_infringido", "conducta", "codigo_o_ley"]
    # reindex (rather than df[fields]) tolerates empty or partial annotations
    # instead of raising KeyError, and drops any other annotation column.
    df = pd.DataFrame(item["annotations"]).reindex(columns=fields)
    df.insert(0, "path", item["path"])
    return df

def get_text(value):
    """Return the ``text`` of an entity dict, or ``value`` itself otherwise.

    Dicts without a ``text`` key give an empty string; anything that is not
    a dict is returned unchanged.
    """
    return value.get('text', '') if isinstance(value, dict) else value

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the hypothesis table for one item.

    Args:
        item: data item with a ``path`` and, optionally, predicted records
            under ``item["predictions"]["records"]`` (a mapping from field
            name to a list of entity dicts or plain values).

    Returns:
        One row per predicted record. Shorter record lists are padded with
        ``None`` so all columns have the same length, and the
        ``art_infringido``, ``conducta`` and ``codigo_o_ley`` columns hold the
        entity text. An item without any record yields a single row that
        carries only ``path``. The item itself is never modified.
    """
    fields = ("art_infringido", "conducta", "codigo_o_ley")
    path = item["path"]

    # Items that went through a pipeline without models carry no predictions.
    predictions = item["predictions"] if "predictions" in item else {}
    records = predictions["records"] if "records" in predictions else {}

    n_rows = max((len(values) for values in records.values()), default=0)
    if not n_rows:
        return pd.DataFrame({"path": path}, index=pd.Index([0]))

    # Pad into a new dict so the caller's record lists stay untouched.
    padded = {
        key: list(values) + [None] * (n_rows - len(values))
        for key, values in records.items()
    }
    df = pd.DataFrame(padded)

    for field in fields:
        # A field absent from the records becomes an all-missing column.
        df[field] = df[field].apply(get_text) if field in df.columns else None

    df.insert(0, "path", path)

    return df


# One reference table and one hypothesis table over all processed items.
references = pd.concat(
    [annot_dataframe(item) for item in result], ignore_index=True
)

# Missing predictions become empty strings so they can be scored as text.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).fillna('')

In [None]:
# Display the hypothesis table for a visual check.
hypotheses

In [None]:
# Pair references and hypotheses by document path; rows without a complete
# reference are dropped.
data = references.merge(hypotheses, on='path', suffixes=('_ref', '_hyp'))
data.dropna(
    subset=[name for name in data.columns if name.endswith('_ref')],
    inplace=True,
)

# Character error rate of each row, one series per field.
art_scores = data.apply(
    lambda r: cer(r['art_infringido_ref'], r['art_infringido_hyp']), axis=1
)
cond_scores = data.apply(
    lambda r: cer(r['conducta_ref'], r['conducta_hyp']), axis=1
)
codi_scores = data.apply(
    lambda r: cer(r['codigo_o_ley_ref'], r['codigo_o_ley_hyp']), axis=1
)

# Character error rate over the whole split, one figure per field.
print(
    'art_infringido cer:',
    cer(data['art_infringido_ref'].tolist(), data['art_infringido_hyp'].tolist()),
)
print(
    'conducta cer:',
    cer(data['conducta_ref'].tolist(), data['conducta_hyp'].tolist()),
)
print(
    'codigo_o_ley cer:',
    cer(data['codigo_o_ley_ref'].tolist(), data['codigo_o_ley_hyp'].tolist()),
)


In [None]:
# Display the merged reference/hypothesis table for a visual check.
data

In [None]:
# Look up one specific document in the test split by its path.
example = [
    item
    for item in test
    if item['path'] == '/resources/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/2021/TOMO 38_JUNIO _21/3587_38 CAUSA 77325_21.pdf'
]
example

## test

In [None]:
# Run the held-out test split through preprocessing and prediction.
# This rebinds `preprocess` and `result`, replacing the training run's values.
preprocess = pipeline.preprocess(test)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from jiwer import cer
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the reference table for one item.

    Args:
        item: data item with a ``path`` and a list of annotation dicts under
            ``annotations``.

    Returns:
        One row per annotation with the columns ``path``, ``art_infringido``,
        ``conducta`` and ``codigo_o_ley``. Fields missing from the annotations
        come back as missing values, and an item without annotations yields
        an empty frame with the same columns.
    """
    fields = ['art_infringido', 'conducta', 'codigo_o_ley']
    # reindex (rather than df[fields]) tolerates empty or partial annotations
    # instead of raising KeyError, and drops any other annotation column.
    df = pd.DataFrame(item['annotations']).reindex(columns=fields)
    df.insert(0, 'path', item['path'])
    return df

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build the hypothesis table for one item.

    Args:
        item: data item with a ``path`` and, optionally, predicted records
            under ``item['predictions']['records']`` (a mapping from field
            name to a list of entity dicts or plain values).

    Returns:
        One row per predicted record. Shorter record lists are padded with
        ``None`` so all columns have the same length, and the
        ``art_infringido``, ``conducta`` and ``codigo_o_ley`` columns hold the
        entity text. An item without any record yields a single row that
        carries only ``path``. The item itself is never modified.
    """
    fields = ('art_infringido', 'conducta', 'codigo_o_ley')
    path = item['path']

    # Items that went through a pipeline without models carry no predictions.
    predictions = item['predictions'] if 'predictions' in item else {}
    records = predictions['records'] if 'records' in predictions else {}

    n_rows = max((len(values) for values in records.values()), default=0)
    if not n_rows:
        return pd.DataFrame({'path': path}, index=pd.Index([0]))

    # Record lists may have different lengths (or be empty); pad copies of
    # them so the frame can be built without touching the caller's data.
    padded = {
        key: list(values) + [None] * (n_rows - len(values))
        for key, values in records.items()
    }
    df = pd.DataFrame(padded)

    def entity_text(value):
        # Entity dicts contribute their text; padding and plain values pass through.
        return value.get('text', '') if isinstance(value, dict) else value

    for field in fields:
        # A field absent from the records becomes an all-missing column.
        df[field] = df[field].apply(entity_text) if field in df.columns else None

    df.insert(0, 'path', path)
    return df

# One reference table and one hypothesis table over all processed items.
references = pd.concat(
    [annot_dataframe(item) for item in result], ignore_index=True
)

# Missing predictions become empty strings so they can be scored as text.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in result], ignore_index=True
).fillna('')

In [None]:
# Pair references and hypotheses by document path.
data = pd.merge(references, hypotheses, on='path', suffixes=('_ref', '_hyp'))
# Drop rows without a complete reference, as the training evaluation does:
# a missing reference is NaN rather than text and cannot be scored.
data = data.dropna(subset=[col for col in data.columns if col.endswith('_ref')])

# Character error rate of each row, one series per field.
art_scores = data.apply(lambda row: cer(row['art_infringido_ref'], row['art_infringido_hyp']), axis=1)
cond_scores = data.apply(lambda row: cer(row['conducta_ref'], row['conducta_hyp']), axis=1)
codi_scores = data.apply(lambda row: cer(row['codigo_o_ley_ref'], row['codigo_o_ley_hyp']), axis=1)

# Character error rate over the whole split, for the same three fields the
# training evaluation reports.
print('art_infringido cer:', cer(data['art_infringido_ref'].tolist(), data['art_infringido_hyp'].tolist()))
print('conducta cer:', cer(data['conducta_ref'].tolist(), data['conducta_hyp'].tolist()))
print('codigo_o_ley cer:', cer(data['codigo_o_ley_ref'].tolist(), data['codigo_o_ley_hyp'].tolist()))
