In [None]:
# Load IPython's autoreload extension; it is switched on by `%autoreload 2` below.
%load_ext autoreload
# Project-specific IPython extension.
# NOTE(review): presumably this registers the `%%export` cell magic used further down — confirm.
%load_ext aymurai.devtools.magic
# Mode 2: reload all modules before each cell runs, so edits to project code are picked up without a kernel restart.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentine Spanish; raises `locale.Error` if that locale is not installed.
# NOTE(review): presumably required so the `%B` month-name directive in the DATE patterns below
# matches Spanish month names — confirm.
locale.setlocale(locale.LC_ALL, "es_AR.UTF-8")

# Single renderer instance, called at the end of the notebook to display one processed document.
render = DocRender()

In [None]:
# Load both dataset variants; `use_cache=True` is forwarded to the dataset constructor.
private = ArgentinaJuzgadoPCyF10Dataset("private", use_cache=True)
public = ArgentinaJuzgadoPCyF10Dataset("latest", use_cache=True)

# Two successive 80/20 splits of the private set with a fixed seed:
# first hold out `test`, then carve `val` out of what is left of `train`.
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the size of every collection.
print(f"private {len(private)}")
print(f"public {len(public)}")
print("---")
print(f"train: {len(train)}")
print(f"test: {len(test)}")
print(f"val: {len(val)}")

In [None]:
%%export aymurai.models.dummy.fecha_resolucion

from copy import deepcopy

from aymurai.meta.types import DataItem, DataBlock
from aymurai.meta.pipeline_interfaces import TrainModule


class DummyExtractorFechaResolucion(TrainModule):
    """Baseline that takes the first DATE entity of a document as its resolution date.

    Nothing is learned or persisted: `save`, `load` and `fit` are no-ops.
    NOTE(review): presumably they exist only because `TrainModule` requires
    them — confirm against the interface.
    """

    def save(self, path: str):
        """No-op: the extractor has no state to persist."""
        return

    def load(self, path: str):
        """No-op: the extractor has no state to restore."""
        return

    def fit(self, train: DataBlock, val: DataBlock):
        """No-op: the extractor is purely rule-based."""
        return

    def predict(self, data: DataBlock) -> DataBlock:
        """Apply `predict_single` to every item and return the results as a list."""
        return [self.predict_single(item) for item in data]

    def predict_single(self, item: DataItem) -> DataItem:
        """Return a deep copy of `item` with the resolution-date prediction filled in.

        The copy always receives `predictions["records"]["fecha_resolucion"]`
        (a list, reset to empty) and `predictions["doc-cats"]["fecha_resolucion"]`
        (reset to ``None``). When `item["data"]["entities"]` contains at least
        one entity labelled ``DATE``, the one with the smallest ``start`` is
        chosen: a copy of it, relabelled ``FECHA_RESOLUCION``, is appended to
        `predictions["entities"]`, and its ``attrs["aymurai_date"]`` value is
        written to both prediction slots.

        Neither the input item nor the entities under `data` are modified.
        """
        item = deepcopy(item)

        # Make sure the prediction scaffolding exists, keeping whatever is already there.
        predictions = item.setdefault("predictions", {})
        predictions.setdefault("records", {})
        predictions.setdefault("entities", [])
        predictions.setdefault("doc-cats", {})

        # Start from "nothing found"; overwritten below when a date is available.
        predictions["doc-cats"]["fecha_resolucion"] = None
        predictions["records"]["fecha_resolucion"] = []

        data = item["data"]
        entities = data["entities"] if "entities" in data else []

        # Candidate dates in document order (an empty list when there are no entities).
        dates = sorted(
            (ent for ent in entities if ent["label"] == "DATE"),
            key=lambda ent: ent["start"],
        )
        if not dates:
            return item

        # Relabel a *copy* of the first date: the selected dict also lives in
        # item["data"]["entities"], so editing it in place would silently turn
        # that DATE entity into FECHA_RESOLUCION as well.
        span = deepcopy(dates[0])
        resolution_date = span["attrs"]["aymurai_date"]
        span["label"] = "FECHA_RESOLUCION"

        predictions["entities"].append(span)
        predictions["records"]["fecha_resolucion"].append(resolution_date)
        predictions["doc-cats"]["fecha_resolucion"] = resolution_date

        return item
        

In [None]:
import aymurai.spacy.components.loader
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.models.dummy.fecha_resolucion import DummyExtractorFechaResolucion

# Pipeline definition: the "preprocess" and "models" stages are lists of
# (component class, keyword-arguments dict) pairs.
config = {
    "preprocess": [
        (
            # Text extraction, configured here for PDF input through tesseract with the Spanish language pack.
            # NOTE(review): the exact meaning of "errors" and "use_cache" is defined by FulltextExtract — confirm.
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        # Text normalisation with default settings.
        (TextNormalize, {}),
        (
            # Rule-based entity tagging; the dummy model below looks for the "DATE" label it produces.
            SpacyRulerPipeline,
            {
                "base": "es",
                "steps": [
                    (
                        "enhanced_regex_ruler",
                        {
                            "patterns": {
                                # strftime-style templates for numeric dates (with and without
                                # zero padding, 4- and 2-digit years) and the long Spanish form
                                # "<day> de <month> de/del <year>".
                                # NOTE(review): presumably the ruler expands each directive into a
                                # regex (hence the inline `(?i)` flag and the optional `l` in
                                # `del?`) — confirm in the ruler implementation.
                                "DATE": [
                                    "%-d/%-m/%Y",
                                    "%-d/%-m/%y",
                                    "%d/%m/%Y",
                                    "%d/%m/%y",
                                    "(?i)%-d de %B del? %Y",
                                ],
                            },
                        },
                    ),
                ],
            },
        ),
    ],
    # Single model stage: the first-DATE baseline exported earlier in this notebook.
    "models": [
        (DummyExtractorFechaResolucion, {})
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

# Build the pipeline from the configuration above.
pipeline = AymurAIPipeline(config)

In [None]:
# Run only the preprocessing stage over the training split.
# NOTE(review): `preprocessed` is not referenced by any later cell visible here, and the
# dummy model's `fit` is a no-op — confirm whether this cell is still needed.
preprocessed = pipeline.preprocess(train)

In [None]:
# import json

# registry = result[1]
# metadata = {k: v for k, v in registry.items() if type(v) not in [dict, list]}
# print(json.dumps(metadata, indent=4))

# print("\n-------\n")
# render(registry)


# Evaluation on the test split

In [None]:
# Preprocess the held-out split and hand the result straight to the model stage.
preds = pipeline.predict(pipeline.preprocess(test))

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from more_itertools import collapse

from aymurai.meta.types import DataItem
from sklearn.metrics import jaccard_score


def annot_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a two-column (`path`, `date`) frame from an item's annotations.

    One row is produced per entry of `item['annotations']`; every row carries
    the item's `path`. Any other annotation field is dropped.
    """
    doc_path, annotations = item['path'], item['annotations']

    frame = pd.DataFrame(annotations)
    frame.insert(loc=0, column='path', value=doc_path)

    return frame.loc[:, ['path', 'date']]

def preds_dataframe(item: DataItem) -> pd.DataFrame:
    """Build a one-row frame from an item's document-level predictions.

    The row holds the item's `path` followed by one column per key of
    `item['predictions']['doc-cats']`.
    """
    doc_path, doc_cats = item['path'], item['predictions']['doc-cats']

    frame = pd.DataFrame([doc_cats])
    frame.insert(loc=0, column='path', value=doc_path)
    return frame


# Gold dates: one row per annotation of every test document, parsed to timestamps.
references = (
    pd.concat([annot_dataframe(item) for item in test], ignore_index=True)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .rename(columns={'date': 'reference'})
)

# Predicted dates: one row per processed document.
hypotheses = pd.concat(
    [preds_dataframe(item) for item in preds], ignore_index=True
).rename(columns={'fecha_resolucion': 'hypothesis'})

# Pair every reference with the prediction made for the same document.
# NOTE(review): the subtraction assumes `hypothesis` holds datetime-like values — confirm
# the type of `aymurai_date`.
df = pd.merge(references, hypotheses, on='path')
df = df.assign(
    **{'date-diff': (df['reference'] - df['hypothesis']).abs()},
    exact_match=df['reference'].eq(df['hypothesis']),
)

def jaccard_(group):
    """Micro-averaged Jaccard score between reference and predicted dates of one group.

    Both columns are rendered as ``dd/mm/YYYY`` strings and compared position
    by position with `jaccard_score(..., average='micro')`.

    Missing dates (NaT) come out of `strftime` as NaN. They are mapped to the
    empty string on *both* sides, so the labels passed to `jaccard_score` are
    always plain strings; previously only the hypothesis side was cleaned,
    leaving NaN mixed with strings whenever a reference date was missing.
    """
    date_format = '%d/%m/%Y'
    ref = group['reference'].dt.strftime(date_format).fillna('').to_list()
    hyp = group['hypothesis'].dt.strftime(date_format).fillna('').to_list()

    return jaccard_score(ref, hyp, average='micro')
# Per-document Jaccard score, attached to every row of that document.
# The intermediate frame is deliberately NOT called `jaccard`: that name belongs to the
# row-level helper function defined in the next cell, and reusing it here would clobber
# the function whenever this cell is re-run.
jaccard_by_path = df.groupby('path').apply(jaccard_).reset_index()
# `apply` yields an unnamed result, which `reset_index` exposes as column 0.
df = df.merge(jaccard_by_path, on='path').rename(columns={0: 'jaccard'})

print('score:')
print('exact-match', df['exact_match'].mean())
print('jaccard', df['jaccard'].mean())
display(df['date-diff'].describe())
display(df)

# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram plus a KDE
# curve is its documented replacement.
# NOTE(review): the bins only cover 0-9 days while the x axis runs to 30 — confirm this is intended.
sns.histplot(df['date-diff'].dt.days, bins=list(range(10)), stat='density', kde=True)
plt.xlim(xmin=0, xmax=30)

In [None]:
def jaccard(row):
    """Jaccard similarity between the `reference` and `hypothesis` collections of a row.

    Duplicates are ignored (both sides are turned into sets). Two empty sides
    count as a perfect match (1); exactly one empty side scores 0; otherwise
    the result is |intersection| / |union|.
    """
    reference, hypothesis = set(row['reference']), set(row['hypothesis'])

    if reference and hypothesis:
        return len(reference & hypothesis) / len(reference | hypothesis)

    # At least one side is empty here: only "both empty" counts as agreement.
    return 0 if (reference or hypothesis) else 1

def first_match(row):
    """Tell whether the first `reference` value equals the first `hypothesis` value.

    Both sequences must be non-empty; indexing an empty one raises.
    """
    return row['reference'][0] == row['hypothesis'][0]


def metrics(ref, hyp, column):
    """Compare reference and predicted values of `column`, document by document.

    Each input frame is reduced to one row per `path`, with that document's
    values of `column` gathered into a list. The two reductions are merged on
    the column they share (`path`; pandas' default inner merge, so only paths
    present in both are kept) and every row is scored with `jaccard` and
    `first_match`.

    Returns a frame with the columns `path`, `reference`, `hypothesis`,
    `jaccard` and `1st_match`.
    """

    def _values_by_path(frame, target):
        # One row per document, with all its `column` values collected under `target`.
        subset = frame[["path", column]].copy()
        collected = subset.groupby('path').agg({column: list}).reset_index()
        collected = collected.rename(columns={column: target})
        return collected[["path", target]]

    df = pd.merge(
        _values_by_path(ref, "reference"),
        _values_by_path(hyp, "hypothesis"),
    )
    df['jaccard'] = df.apply(jaccard, axis=1)
    df['1st_match'] = df.apply(first_match, axis=1)

    return df

In [None]:
import matplotlib.dates as mdates
# Widen pandas' display limits so long cell values and wide frames are shown in full,
# while capping the number of printed rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 30)
pd.set_option("max_colwidth", 1000)

# Rows where the predicted date differs from the reference.
errors = df[df["exact_match"] == False]  # noqa: E712
# fig, ax = plt.subplots()

# sns.distplot(errors['date-diff'].dt.days)
# sns.distplot(errors['reference'])
# ax.xaxis.set_major_locator(mdates.YearLocator())
# ax.xaxis.set_major_formatter(mdates.DateFormatter("%b\n%Y"))
errors


In [None]:
registry['data']['entities']

In [None]:
import json

path = df.loc[23, 'path']
print(path)

filtered = list(filter(lambda x: x['path'] == path, preds))
registry = filtered[0]
metadata = {k: v for k, v in registry.items() if type(v) not in [dict, list]}
print(json.dumps(metadata, indent=4))

print("\n-------\n")
render(registry)
