In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

locale.setlocale(locale.LC_ALL, "es_AR.UTF-8")

render = DocRender()

In [None]:
def is_oral(item):
    """Tell whether any annotation of ``item`` is marked as oral.

    Reads ``item['annotations']`` and compares each entry's
    ``'oral_escrita'`` value with ``'oral'``.
    """
    return any(annot['oral_escrita'] == 'oral' for annot in item['annotations'])

In [None]:
# Load both dataset flavours, reusing the on-disk cache.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
public = ArgentinaJuzgadoPCyF10Dataset('latest', use_cache=True)

# Two chained splits with the same 20% hold-out and seed:
# private -> (train + val, test), then train + val -> (train, val).
_split_options = {'test_size': 0.2, 'random_state': 22}
train, test = train_test_split(private, **_split_options)
train, val = train_test_split(train, **_split_options)

# Optional restriction to documents annotated as oral:
# train = list(filter(is_oral, train))
# val = list(filter(is_oral, val))
# test = list(filter(is_oral, test))

# Size report for the datasets and for each split.
for _name, _items in (('private', private), ('public', public)):
    print(_name, len(_items))
print('---')
for _name, _items in (('train:', train), ('test:', test), ('val:', val)):
    print(_name, len(_items))

# pipeline definition

## dummy extractor

In [None]:
%%export aymurai.models.dummy.hora_inicio_cierre

from copy import deepcopy

import spacy
import pandas as pd
from more_itertools import zip_offset

from aymurai.meta.types import DataItem, DataBlock
from aymurai.spacy.components.fuzzy import FuzzyMatcher
from aymurai.meta.pipeline_interfaces import TrainModule


class DummyExtractorHoraInicioCierre(TrainModule):
    """Rule-based baseline that tags start and end times.

    For every ``TIME`` entity in ``item["data"]["entities"]`` the entity's
    ``context_pre`` text is fuzzy-matched against two keyword lists
    ("inicio" for START; "cierre", "finalizacion", "finalización" for END).
    Entities whose selected match is START/END are relabelled
    ``HORA_INICIO`` / ``HORA_CIERRE``, appended to
    ``predictions["entities"]``, and their ``attrs["aymurai_date"]`` value is
    stored under ``predictions["records"]``. An item with at least one tagged
    time gets the doc category ``"oral"``; otherwise it stays ``"escrita"``.

    ``fit``, ``save`` and ``load`` are no-ops.
    """

    # matcher rule name -> (entity label, key under predictions["records"])
    _RULES = {
        "START": ("HORA_INICIO", "hora_de_inicio"),
        "END": ("HORA_CIERRE", "hora_de_cierre"),
    }

    def __init__(self, lang: str = "es"):
        """Create the blank spaCy pipeline and register the keyword patterns.

        Args:
            lang: spaCy language code for the blank pipeline. Previously this
                argument was ignored and ``"es"`` was always used; the default
                keeps that behavior.
        """
        self.nlp = spacy.blank(lang)
        self.matcher = FuzzyMatcher(self.nlp.vocab)
        self.matcher.add("START", patterns=[self.nlp.make_doc(t) for t in ["inicio"]])
        self.matcher.add(
            "END",
            patterns=[
                self.nlp.make_doc(t) for t in ["cierre", "finalizacion", "finalización"]
            ],
        )

    def save(self, path: str):
        """No-op; ``path`` is unused."""
        return

    def load(self, path: str):
        """No-op; ``path`` is unused."""
        return

    def fit(self, train: DataBlock, val: DataBlock):
        """No-op; both arguments are unused."""
        return

    def predict(self, data: DataBlock) -> DataBlock:
        """Apply :meth:`predict_single` to every item and return the results as a list."""
        return [self.predict_single(item) for item in data]

    @staticmethod
    def _pad(values: list, size: int) -> list:
        """Return ``values`` right-padded with ``pd.NaT`` up to ``size`` elements."""
        return list(values) + [pd.NaT] * (size - len(values))

    def predict_single(self, item: DataItem) -> DataItem:
        """Return a deep copy of ``item`` with the predictions filled in.

        The copy always carries ``predictions["records"]`` with the keys
        ``hora_de_inicio``, ``hora_de_cierre`` and ``duracion`` (lists), and
        ``predictions["doc-cats"]["oral_escrita"]``. Start and end lists are
        padded with ``pd.NaT`` to the same length, and ``duracion`` holds
        ``end - start`` for each aligned pair.

        Raises:
            KeyError: if ``item`` lacks ``"data"`` or a ``TIME`` entity lacks
                ``"start"``, ``"context_pre"`` or ``attrs["aymurai_date"]``.
        """
        item = deepcopy(item)

        # make sure the prediction containers exist without dropping earlier content
        predictions = item.setdefault("predictions", {})
        records = predictions.setdefault("records", {})
        predictions.setdefault("entities", [])
        doc_cats = predictions.setdefault("doc-cats", {})

        records["hora_de_inicio"] = []
        records["hora_de_cierre"] = []
        records["duracion"] = []
        doc_cats["oral_escrita"] = "escrita"

        entities = item["data"]["entities"] if "entities" in item["data"] else []
        times = sorted(
            (ent for ent in entities if ent["label"] == "TIME"),
            key=lambda ent: ent["start"],
        )
        # nothing to tag: no entities at all, or none labelled TIME
        if not times:
            return item

        for span in times:
            pre = span["context_pre"]
            dt = span["attrs"]["aymurai_date"]

            matches = self.matcher(self.nlp.make_doc(pre))
            if not matches:
                continue

            # NOTE(review): this keeps the match with the LOWEST value at index 3.
            # If index 3 is the fuzzy similarity ratio, the best match would be
            # the highest one -- confirm against FuzzyMatcher's return format.
            candidate = min(matches, key=lambda m: m[3])

            rule = self._RULES.get(candidate[0])
            if rule is None:
                continue
            label, record_key = rule

            # NOTE(review): `span` is the same dict held in item["data"]["entities"]
            # (of the copied item), so the relabel is visible there as well.
            span["label"] = label
            predictions["entities"].append(span)
            records[record_key].append(dt)
            doc_cats["oral_escrita"] = "oral"

        # align both lists to the same length so they can be paired row-wise
        pairs = max(len(records["hora_de_inicio"]), len(records["hora_de_cierre"]))
        for record_key in ("hora_de_inicio", "hora_de_cierre"):
            records[record_key] = self._pad(records[record_key], pairs)

        # assumes the aymurai_date values support subtraction (also with pd.NaT) -- TODO confirm
        records["duracion"] = [
            end - start
            for start, end in zip(records["hora_de_inicio"], records["hora_de_cierre"])
        ]

        return item

In [None]:
import aymurai.spacy.components.loader
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.models.dummy.hora_inicio_cierre import DummyExtractorHoraInicioCierre

# Pipeline configuration: text extraction and normalization, a regex ruler that
# tags TIME entities, and the dummy start/end-time extractor as the only model.
config = {
    "preprocess": [
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            SpacyRulerPipeline,
            {
                "base": "es",
                "steps": [
                    (
                        "enhanced_regex_ruler",
                        {
                            "patterns": {
                                # Raw strings: "\." in a normal string literal is an
                                # invalid escape sequence (warning on recent Python).
                                # The runtime values are unchanged.
                                # NOTE(review): the bare "." in the last two patterns
                                # matches any character if these are handled as regular
                                # expressions -- confirm that is intended.
                                "TIME": [
                                    r"%H(\.|:)%M",
                                    r"%-H(.|:)%M (?i)horas",
                                    r"%-H.%M h(rs|r|s)\.?",
                                ],
                            },
                        },
                    ),
                ],
            },
        ),
    ],
    "models": [
        (DummyExtractorHoraInicioCierre, {})
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# preprocess the training split and feed the result straight into the model step
preds = pipeline.predict(pipeline.preprocess(train))

# visualization

In [None]:
import spacy
import srsly

# example item inspected below
registry = preds[21]
# top-level fields whose value is neither a dict nor a list
metadata = {
    key: value for key, value in registry.items() if type(value) not in (dict, list)
}

# predicted records for the example, followed by a blank line
print(registry["predictions"]["records"])
print()

print("\n-------\n")
render(registry)

# evaluation

In [None]:
# Run the preprocessing steps on the held-out test split, then predict on the result.
# NOTE: this rebinds `preds`, which until here held the predictions for the train split.
preprocessed = pipeline.preprocess(test)
preds = pipeline.predict(preprocessed)

In [None]:
# number of items returned by pipeline.predict for the test split
len(preds)

## references

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse

from aymurai.meta.types import DataItem


def format_duration(value):
    """Append a ``':00'`` seconds field to string durations.

    Non-string values are returned untouched.
    """
    return value + ':00' if isinstance(value, str) else value

def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build one row per annotation of ``item``, keeping only the evaluated columns.

    The item's ``path`` is repeated on every row, and the result is restricted
    to ``path``, ``oral_escrita``, ``hora_de_inicio``, ``hora_de_cierre`` and
    ``duracion`` (in that order).
    """
    columns = ["path", "oral_escrita", "hora_de_inicio", "hora_de_cierre", "duracion"]

    path = item["path"]
    frame = pd.DataFrame(item["annotations"])
    frame.insert(0, "path", path)

    return frame[columns]

# One row per annotation across all predicted items.
references = pd.concat(map(annot_dataframe, preds), ignore_index=True)

# Parse both time columns and pin the date part to 1900-01-01, so values differ
# only by time of day.
for _column in ('hora_de_inicio', 'hora_de_cierre'):
    references[_column] = pd.to_datetime(references[_column]).apply(
        lambda stamp: stamp.replace(year=1900, month=1, day=1)
    )

# String durations get a seconds field before being parsed as timedeltas.
references['duracion'] = pd.to_timedelta(references['duracion'].apply(format_duration))
references['oral_escrita'] = references['oral_escrita'].astype('category')

# references.drop_duplicates(subset=['path'], keep='first', inplace=True)
references = references.set_index('path')
references

## hypotheses

In [None]:

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Turn the predicted records of ``item`` into a frame tagged with its path.

    When the records produce no rows, a single row holding only ``path`` is
    returned instead. The predicted ``oral_escrita`` doc category is added as
    a column in both cases.
    """
    path = item["path"]
    frame = pd.DataFrame(item["predictions"]["records"])
    frame.insert(0, "path", path)
    if len(frame) == 0:
        # no predicted rows: fall back to a one-row frame with just the path
        frame = pd.DataFrame({'path': path}, index=pd.Index([0]))

    frame['oral_escrita'] = item['predictions']['doc-cats']['oral_escrita']
    return frame


# Stack the per-item prediction frames, then keep only the first row of each
# path and index the result by path.
hypotheses = (
    pd.concat(map(preds_dataframe, preds), ignore_index=True)
    .astype({'oral_escrita': 'category'})
    .drop_duplicates(subset=['path'], keep='first')
    .set_index('path')
)
hypotheses



## metrics

In [None]:
from sklearn.metrics import classification_report

# Document-level oral/escrita classification: one reference and one hypothesis
# per path (first row wins), joined on path.
refs = (
    references.reset_index()
    .drop_duplicates(subset=['path'], keep='first')[['path', 'oral_escrita']]
    .rename(columns={'oral_escrita': 'reference'})
)
hyps = (
    hypotheses.reset_index()
    .drop_duplicates(subset=['path'], keep='first')[['path', 'oral_escrita']]
    .rename(columns={'oral_escrita': 'hypothesis'})
)

df = pd.merge(refs, hyps, on='path')
# Both columns are categorical; `==` between categoricals raises TypeError when
# their category sets differ (e.g. one class never predicted), so compare the
# plain values instead.
df['acc'] = df['reference'].astype(object) == df['hypothesis'].astype(object)

report = classification_report(df['reference'], df['hypothesis'])
print(report)
df

In [None]:
def jaccard(row):
    """Jaccard index between the distinct values of ``row['reference']`` and ``row['hypothesis']``.

    Returns 1 when both sides are empty, 0 when exactly one side is empty,
    and ``|intersection| / |union|`` otherwise.
    """
    expected = set(row['reference'])
    predicted = set(row['hypothesis'])
    union = expected | predicted

    if not union:
        # both sides empty: counted as a perfect match
        return 1
    if not (expected and predicted):
        return 0

    return len(expected & predicted) / len(union)

def first_match(row):
    """Compare the first reference value with the first hypothesis value using ``==``.

    Both ``row['reference']`` and ``row['hypothesis']`` must be non-empty
    sequences; indexing an empty one raises ``IndexError``.
    """
    return row['reference'][0] == row['hypothesis'][0]


def metrics(ref, hyp, column):
    """Score the values of ``column`` per path, references against hypotheses.

    Both frames are grouped by ``path`` with the values of ``column``
    collected into lists, merged on their shared columns, and scored row-wise
    with ``jaccard`` and ``first_match``.

    Returns:
        A frame with the columns ``path``, ``reference``, ``hypothesis``,
        ``jaccard`` and ``1st_match``.
    """

    def _values_per_path(frame, name):
        # one row per path holding the list of `column` values, stored under `name`
        grouped = frame[["path", column]].copy().groupby('path').agg({column: list})
        return grouped.reset_index().rename(columns={column: name})

    ref_lists = _values_per_path(ref, "reference")
    hyp_lists = _values_per_path(hyp, "hypothesis")

    df = pd.merge(ref_lists[["path", "reference"]], hyp_lists[["path", "hypothesis"]])
    df['jaccard'] = df.apply(jaccard, axis=1)
    df['1st_match'] = df.apply(first_match, axis=1)

    return df

In [None]:
# start time: score predictions against the references annotated as oral
refs = references.reset_index().query('oral_escrita == "oral"')
hyps = hypotheses.reset_index()

df = metrics(refs, hyps, 'hora_de_inicio')
print('jaccard:', df['jaccard'].mean())
print('1st_match acc:', df['1st_match'].mean())
display(df)

In [None]:
# path of the row labelled 0 in the current metrics frame
df.loc[0, 'path']

In [None]:
# test items whose path equals the document inspected here
example = [
    item
    for item in test
    if item['path'] == '/resources/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/2018/2) FEBRERO/1045_9_20170_8_02_18_1.1.1_L_451_alimentos_en_infraccion.pdf'
]
example

In [None]:
# end time: score predictions against the references annotated as oral
refs = references.reset_index().query('oral_escrita == "oral"')
hyps = hypotheses.reset_index()

df = metrics(refs, hyps, 'hora_de_cierre')
print('jaccard:', df['jaccard'].mean())
print('1st_match acc:', df['1st_match'].mean())
display(df)

In [None]:
# duration: score predictions against the references annotated as oral
refs = references.reset_index().query('oral_escrita == "oral"')
hyps = hypotheses.reset_index()

df = metrics(refs, hyps, 'duracion')
print('jaccard:', df['jaccard'].mean())
print('1st_match acc:', df['1st_match'].mean())
display(df)

In [None]:
# end time once more (same evaluation as the earlier hora_de_cierre cell);
# leaves `df` holding the end-time results
refs = references.reset_index().query('oral_escrita == "oral"')
hyps = hypotheses.reset_index()

df = metrics(refs, hyps, 'hora_de_cierre')
print('jaccard:', df['jaccard'].mean())
print('1st_match acc:', df['1st_match'].mean())
display(df)

In [None]:
import json

# path = df.loc[23, 'path']
# print(path)
# NOTE(review): `errors` is not defined in any cell visible here, so this cell
# raises NameError on a fresh kernel unless it is created elsewhere -- confirm
# where it comes from (it is expected to be indexed by document path).
path = errors.index[10]

# look up the predicted item with that path; filtered[0] raises IndexError if there is none
filtered = list(filter(lambda x: x['path'] == path, preds))
registry = filtered[0]
# fields of registry['metadata'] whose value is neither a dict nor a list
metadata = {k: v for k, v in registry['metadata'].items() if type(v) not in [dict, list]}
print(registry['path'])
print(json.dumps(metadata, indent=4))

# annotated start/end times, one entry per annotation
print('annotations')
print('hora inicio:', [x['hora_de_inicio'] for x in registry['annotations']])
print('hora cierre:', [x['hora_de_cierre'] for x in registry['annotations']])
print('predictions')
# print('hora inicio:', registry['predictions']['records']['hora_de_inicio'][0].strftime('%H:%M'))
# print('hora cierre:', registry['predictions']['records']['hora_de_cierre'][0].strftime('%H:%M'))


print("\n-------\n")
render(registry)
