In [None]:
# Load the autoreload extension so edited modules can be re-imported without
# restarting the kernel.
%load_ext autoreload
# Project-specific IPython extension; presumably provides the `%%export` cell
# magic used further down — confirm.
%load_ext aymurai.devtools.magic
# Mode 2: reload all modules before executing each cell.
%autoreload 2


In [None]:
from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Display colour per span label, handed to the document renderer through its config.
colors = {"SECTION:DECISION": "red", "KEYWORDS": "blue"}
render = DocRender(config={"colors": colors})

In [None]:
# Load the 'private' dataset (with caching enabled) and split it twice with a
# fixed seed: 20% is held out as test, then 20% of the remainder as validation
# (roughly 64% / 16% / 20% train / val / test).
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)
# Report the size of each split.
print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

# Pipeline definition

In [None]:
%%export aymurai.models.dummy.n_expte_eje
import re
from copy import deepcopy

from more_itertools import unique_justseen

from aymurai.meta.types import DataItem, DataBlock
from aymurai.meta.pipeline_interfaces import TrainModule


class DummyExtractorExpediente(TrainModule):
    """Rule-based "model" that extracts case-file numbers from `N_EXPTE_EJE` entities.

    Nothing is learned: `fit`, `save` and `load` are no-ops. Prediction narrows
    every `N_EXPTE_EJE` entity found in `item["data"]["entities"]` down to its
    `<number>/<year>[-<digit>]` substring and writes the outcome under
    `item["predictions"]`.
    """

    # `<number>/<year>` followed by an optional `-<digit>` suffix; compiled once
    # instead of on every prediction.
    _EXPTE_PATTERN = re.compile(r"(?P<exp>\d+)/(?P<year>\d+)(?P<code>-\d)?")

    def save(self, path: str):
        """No-op: this extractor has no state to persist."""
        return

    def load(self, path: str):
        """No-op: this extractor has no state to restore."""
        return

    def fit(self, train: DataBlock, val: DataBlock):
        """No-op: the extractor is purely rule based."""
        return

    def predict(self, data: DataBlock) -> DataBlock:
        """Apply `predict_single` to every item and return the results as a list."""
        return [self.predict_single(item) for item in data]

    def predict_single(self, item: DataItem) -> DataItem:
        """Return a deep copy of `item` with the case-number predictions filled in.

        The copy always ends up with:

        * `predictions["doc-cats"]["n_expte_eje"]`: text of the first matching
          entity, or `None` when nothing matched;
        * `predictions["records"]["n_expte_eje"]`: texts of all matching entities;
        * `predictions["entities"]`: extended with the matching entities, whose
          text, character offsets, token offsets and contexts are trimmed to
          the matched number.

        Entities labelled `N_EXPTE_EJE` whose text contains no
        `<number>/<year>` pattern are skipped.

        Raises:
            KeyError: if `item` has no `"data"` key, or an entity lacks one of
                the fields read below.
        """
        item = deepcopy(item)

        # make sure every prediction container exists before writing into it
        if "predictions" not in item:
            item["predictions"] = {}
        predictions = item["predictions"]
        if "records" not in predictions:
            predictions["records"] = {}
        if "entities" not in predictions:
            predictions["entities"] = []
        if "doc-cats" not in predictions:
            predictions["doc-cats"] = {}
        predictions["doc-cats"]["n_expte_eje"] = None
        predictions["records"]["n_expte_eje"] = []

        candidates = []
        if "entities" in item["data"]:
            candidates += item["data"]["entities"]

        candidates = [ent for ent in candidates if ent["label"] == "N_EXPTE_EJE"]
        # NOTE(review): ascending sort, so the first entity (used as the document
        # level prediction) has the LOWEST `aymurai_score` — confirm intended.
        candidates.sort(key=lambda ent: ent["attrs"]["aymurai_score"])

        matched = []
        for ent in candidates:
            text = ent["text"]
            match = self._EXPTE_PATTERN.search(text)
            # An entity without a recognisable number used to crash on
            # `None.span()`; skip it instead.
            if match is None:
                continue

            start_offset, end_offset = match.span()
            subpre = text[:start_offset]
            subpost = text[end_offset:]

            # move the trimmed prefix/suffix into the surrounding context
            ent["context_pre"] += subpre
            ent["context_post"] = subpost + ent["context_post"]
            # `end_char` must be derived before `start_char` is shifted
            ent["end_char"] = ent["start_char"] + end_offset
            ent["start_char"] += start_offset

            # FIXME: should use spacy tokenizer instead split with spaces!
            tokenspre = list(filter(bool, subpre.split(" ")))
            tokenspost = list(filter(bool, subpost.split(" ")))
            ent["start"] += len(tokenspre)
            ent["end"] -= len(tokenspost)
            ent["text"] = match[0]

            matched.append(ent)

        # if there is no usable entity just pass
        if not matched:
            return item

        # `extend`, not `append`: the entity list must stay flat
        predictions["entities"].extend(matched)
        predictions["records"]["n_expte_eje"] += [ent["text"] for ent in matched]
        predictions["doc-cats"]["n_expte_eje"] = matched[0]["text"]

        return item

## Dummy classifier

In [None]:
import aymurai.spacy.components
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.models.dummy.n_expte_eje import DummyExtractorExpediente
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser

# Pipeline configuration. Each stage is a list of `(class, kwargs)` tuples;
# presumably AymurAIPipeline instantiates them in order — confirm.
config = {
    "preprocess": [
        # Text extraction from PDF files using tesseract with the Spanish
        # language pack; results are cached.
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        # Regex-based entity ruler that tags case-number mentions as N_EXPTE_EJE.
        (
            SpacyRulerPipeline,
            {
                "base": "es",
                "steps": [
                    (
                        "enhanced_regex_ruler",
                        {
                            "patterns": {
                                # `%Y` / `%y` are not regex syntax; presumably the
                                # enhanced ruler expands them into 4- / 2-digit
                                # year patterns — confirm.
                                # Each keyword ("causa", "caso", "EXP:", "IP"/"IPP")
                                # is followed by `<digits>/<year>` and an optional
                                # `-<digit>` suffix.
                                "N_EXPTE_EJE": [
                                    r"(?i)causa\s*(n.)?\s*\d+/%Y(-\d)?",
                                    r"(?i)causa\s*(n.)?\s*\d+/%y(-\d)?",
                                    r"(?i)caso\s*(n.)?\s*\d+/%Y(-\d)?",
                                    r"(?i)caso\s*(n.)?\s*\d+/%y(-\d)?",
                                    r'EXP:\s*\d+/%Y(-\d)?',
                                    r'EXP:\s*\d+/%y(-\d)?',
                                    r'IPP?\s*\d+/%Y(-\d)?',
                                    r'IPP?\s*\d+/%y(-\d)?'
                                ],
                            },
                        },
                    ),
                ],
            },
        ),
    ],
    # Single model stage: the rule-based case-number extractor, with no options.
    "models": [
        (DummyExtractorExpediente, {})
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

# Build the pipeline object from the configuration above.
pipeline = AymurAIPipeline(config)

In [None]:
# Run the preprocessing stage on the training split, then the prediction stage
# on the preprocessed items.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
import json
import random

# Pick a document to inspect. NOTE: the random choice is immediately overridden
# by the hard-coded index on the next line, so the same item is always shown
# (and the lookup fails if `result` has fewer than 855 items).
index = random.choice(range(len(result)))
index = 854
registry = result[index]
print('index', index)
print(registry['path'])
# Keep only scalar metadata values (dict and list values are dropped) before printing.
metadata = {k: v for k, v in registry['metadata'].items() if type(v) not in [dict, list]}
print(json.dumps(metadata, indent=4))


# Annotated case numbers next to the document-level prediction.
print('annotations')
print('expte_eje:', [x['n_expte_eje'] for x in registry['annotations']])
print('prediction')
print(registry['predictions']['doc-cats'])


print('\n-------\n')
# Render the item with the configured span colours.
render(registry)

# Evaluation

## Train

In [None]:
# Preprocess and predict on the training split; `result` feeds the evaluation cells below.
preprocess = pipeline.preprocess(train)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem
from jiwer import cer


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a frame with one row per annotation of `item`.

    Returns only the columns `path` (the item's path, repeated on every row)
    and `n_expte_eje` (taken from each annotation), in that order.
    """
    source_path = item['path']
    frame = pd.DataFrame(item['annotations'])
    frame.insert(loc=0, column='path', value=source_path)
    return frame[['path', 'n_expte_eje']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a single-row frame from the document-level predictions of `item`.

    The first column is `path`; the remaining columns are the keys of
    `item['predictions']['doc-cats']`.
    """
    source_path = item['path']
    doc_cats = item['predictions']['doc-cats']
    frame = pd.DataFrame(data=[doc_cats])
    frame.insert(loc=0, column='path', value=source_path)
    return frame

# Annotated case numbers, one row per annotation, exposed as `reference`.
references = pd.concat(
    [annot_dataframe(item) for item in result],
    ignore_index=True,
).rename(columns={'n_expte_eje': 'reference'})

# Predicted case numbers, one row per document, exposed as `hypothesis`;
# missing predictions become empty strings.
hypotheses = (
    pd.concat([preds_dataframe(item) for item in result], ignore_index=True)
    .rename(columns={'n_expte_eje': 'hypothesis'})
    .fillna('')
)

# Align annotations and predictions on the document path, then compute the
# character error rate over the whole set and per row.
raw = pd.merge(references, hypotheses, on='path')
print('raw cer:', cer(raw['reference'].tolist(), raw['hypothesis'].tolist()))
raw['cer'] = raw.apply(
    lambda record: cer(record['reference'], record['hypothesis']),
    axis=1,
)
raw

In [None]:
import re
from datetime import datetime

# Work on a copy so `raw` keeps the unnormalized values.
fixed = raw.copy()
# Replace `_` with `/` in the references; presumably the annotations use `_`
# as the number/year separator — confirm.
fixed['reference'] = fixed['reference'].apply(lambda s: s.replace('_', '/'))

def short_to_long_years(text: str):
    """Normalize the first `<number>/<year>[-<digit>]` occurrence in `text`.

    Two-digit years are expanded to four digits and a `-0` suffix is dropped.
    Only the normalized number is returned (surrounding text is discarded);
    `text` is returned unchanged when it contains no such pattern.
    """
    found = re.search(r"(?P<exp>\d+)/(?P<year>\d+)(?P<code>-\d)?", text)
    if found is None:
        return text

    # an absent optional suffix is reported as an empty string
    expt, year, code = found.groups(default='')
    if len(year) == 2:
        # let strptime's `%y` decide the century
        year = datetime.strptime(year, '%y').year
    if code == '-0':
        code = ''
    return f'{expt}/{year}{code}'

# Normalize the predictions (four-digit years, no `-0` suffix) and recompute
# the character error rate over the whole set and per row.
fixed['hypothesis'] = fixed['hypothesis'].apply(short_to_long_years)
print('fixed cer:', cer(fixed['reference'].tolist(), fixed['hypothesis'].tolist()))
fixed['cer'] = fixed.apply(lambda row: cer(row['reference'], row['hypothesis']), axis=1)
fixed

## Test

In [None]:
# Preprocess and predict on the held-out test split; this overwrites the
# `preprocess` and `result` variables computed for the training split.
preprocess = pipeline.preprocess(test)
result = pipeline.predict(preprocess)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse
from sklearn.metrics import classification_report

from aymurai.meta.types import DataItem
from jiwer import cer


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a frame with one row per annotation of `item`.

    Returns only the columns `path` (the item's path, repeated on every row)
    and `n_expte_eje` (taken from each annotation), in that order.
    """
    source_path = item['path']
    frame = pd.DataFrame(item['annotations'])
    frame.insert(loc=0, column='path', value=source_path)
    return frame[['path', 'n_expte_eje']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a single-row frame from the document-level predictions of `item`.

    The first column is `path`; the remaining columns are the keys of
    `item['predictions']['doc-cats']`.
    """
    source_path = item['path']
    doc_cats = item['predictions']['doc-cats']
    frame = pd.DataFrame(data=[doc_cats])
    frame.insert(loc=0, column='path', value=source_path)
    return frame

# Annotated case numbers, one row per annotation, exposed as `reference`.
references = pd.concat(
    [annot_dataframe(item) for item in result],
    ignore_index=True,
).rename(columns={'n_expte_eje': 'reference'})

# Predicted case numbers, one row per document, exposed as `hypothesis`;
# missing predictions become empty strings.
hypotheses = (
    pd.concat([preds_dataframe(item) for item in result], ignore_index=True)
    .rename(columns={'n_expte_eje': 'hypothesis'})
    .fillna('')
)

# Align annotations and predictions on the document path, then compute the
# character error rate over the whole set and per row.
raw = pd.merge(references, hypotheses, on='path')
print('raw cer:', cer(raw['reference'].tolist(), raw['hypothesis'].tolist()))
raw['cer'] = raw.apply(
    lambda record: cer(record['reference'], record['hypothesis']),
    axis=1,
)
raw

In [None]:
import re
from datetime import datetime

# Work on a copy so `raw` keeps the unnormalized values.
fixed = raw.copy()
# Replace `_` with `/` in the references; presumably the annotations use `_`
# as the number/year separator — confirm.
fixed['reference'] = fixed['reference'].apply(lambda s: s.replace('_', '/'))

def short_to_long_years(text: str):
    """Normalize the first `<number>/<year>[-<digit>]` occurrence in `text`.

    Two-digit years are expanded to four digits and a `-0` suffix is dropped.
    Only the normalized number is returned (surrounding text is discarded);
    `text` is returned unchanged when it contains no such pattern.
    """
    found = re.search(r"(?P<exp>\d+)/(?P<year>\d+)(?P<code>-\d)?", text)
    if found is None:
        return text

    # an absent optional suffix is reported as an empty string
    expt, year, code = found.groups(default='')
    if len(year) == 2:
        # let strptime's `%y` decide the century
        year = datetime.strptime(year, '%y').year
    if code == '-0':
        code = ''
    return f'{expt}/{year}{code}'

# Normalize the predictions (four-digit years, no `-0` suffix) and recompute
# the character error rate over the whole set and per row.
fixed['hypothesis'] = fixed['hypothesis'].apply(short_to_long_years)
print('fixed cer:', cer(fixed['reference'].tolist(), fixed['hypothesis'].tolist()))
fixed['cer'] = fixed.apply(lambda row: cer(row['reference'], row['hypothesis']), axis=1)
# Fraction of rows whose normalized prediction equals the reference exactly.
fixed['exact-match'] = fixed['reference'] == fixed['hypothesis']
print('exact-match', fixed['exact-match'].mean())
fixed