In [None]:
%load_ext aymurai.devtools.magic
%load_ext autoreload
%autoreload 2

In [None]:
%%export aymurai.datasets.ar_juz_pcyf_10.labelstudio.utils

import re
import spacy

from glob import glob
from numpy import cumsum
from copy import deepcopy
from itertools import groupby
from more_itertools import unzip, collapse

from aymurai.meta.types import DataItem
from aymurai.spacy.utils import format_entity
from aymurai.utils.json_data import load_json

# blank spanish pipeline shared by the module: `parse_annots` runs it over the
# raw document text to get the spacy.Doc that `reformat_entity` uses to map
# labelstudio character offsets onto token spans
nlp = spacy.blank("es")


def join_label_category(spans: list[dict]) -> dict:
    """
    merge the labelstudio results that describe the same span into one dict

    Args:
        spans (list[dict]): partial results (e.g. the `labels` and the
            `choices` entries) that point to the same text span

    Returns:
        dict: one dict holding the keys of every partial result. when a key
            is repeated the value of the last partial result wins
    """
    return {key: value for partial in spans for key, value in partial.items()}


def reformat_entity(doc: spacy.tokens.Doc, span: dict) -> dict:
    """
    convert a labelstudio span into an aymurai entity

    Args:
        doc (spacy.tokens.Doc): spacy document built from the annotated text
        span (dict): labelstudio span. must contain `start`, `end` and
            `labels`; `choices` is optional

    Returns:
        dict: entity as returned by `format_entity`, with the label (and the
            choices, when present) also stored under `attrs`
    """
    start, end = span["start"], span["end"]
    label = span["labels"][0]

    # labelstudio offsets are character based; "expand" widens offsets that
    # fall inside a token so the resulting span covers whole tokens
    spacy_span = doc.char_span(start, end, label=label, alignment_mode="expand")

    entity = format_entity(spacy_span)
    label_name = entity["label"]
    entity["attrs"]["aymurai_label"] = label_name

    # the sub-category is only present when the annotator picked a choice
    if "choices" in span:
        entity["attrs"]["aymurai_label_subclass"] = span["choices"]

    return entity


def parse_annots(data: dict) -> dict:
    """
    parse annotations from labelstudio-data

    only the first annotation set (`data["annotations"][0]`) is read.

    Args:
        data (dict): labelstudio-json export (one document)

    Returns:
        dict: categories & entities (aymurai format). `entities` is an empty
            list when the document has no span annotations
    """
    doc = nlp(data["data"]["text"])

    annotations = list(data["annotations"][0]["result"])

    def span_key(value: dict) -> tuple:
        """identity of a text span: results with the same key are merged"""
        return value["start"], value["end"], value["text"]

    # keep only the span results (entity labels and their per-span choices)
    values = [
        result["value"]
        for result in annotations
        if result.get("type") in ("labels", "choices")
    ]

    # groupby only merges *consecutive* items, so the sort key must be the
    # same as the grouping key. sorting by `start` alone lets spans that share
    # the start offset but not the end interleave and split a group in two
    values.sort(key=span_key)

    # iterate groupby directly: unpacking `unzip(groupby(...))` fails with a
    # ValueError when there are no spans, because unzip returns an empty tuple
    spans = [
        reformat_entity(doc, join_label_category(group))
        for _, group in groupby(values, key=span_key)
    ]

    # categories (document level free-text fields)
    categories = {
        result["from_name"]: result["value"]["text"][0]
        for result in annotations
        if result.get("type") == "textarea"
    }

    return {
        "categories": categories,
        "entities": spans,
    }


def load_conll_annots(basepath: str) -> list[str]:
    """
    load annotations (CoNLL format)

    Args:
        basepath (str): path where to look for the annotations (conll file).
            when several `*.conll` files exist, the first one in
            alphabetical order is used

    Returns:
        list[str]: list containing the annotations of the different docs,
            one string per document (one `token tag` pair per line)

    Raises:
        IndexError: if there is no `*.conll` file inside `basepath`
    """
    # glob returns the paths in arbitrary order; sort to always pick the same file
    conll_path = sorted(glob(f"{basepath}/*.conll"))[0]

    # read annotations. the encoding is explicit so the (spanish) text does
    # not depend on the platform default
    with open(conll_path, encoding="utf-8") as file:
        annotations = file.read()

    # remove header
    annotations = annotations.replace("-DOCSTART- -X- O\n", "")

    # remove unuseful tags
    annotations = annotations.replace(" -X- _", "")

    # split annotations corresponding to different documents
    chunks = annotations.split("\n\n")

    # drop every empty chunk (not just the first one): a leftover empty
    # element would be taken as the annotation of a document and shift the
    # positional pairing with the json items
    return [chunk for chunk in chunks if chunk.strip()]


def parse_conll_annots(item: DataItem, annotation: str) -> DataItem:
    """
    attach the CoNLL annotation to a dataitem, separated by paragraphs

    the CoNLL annotation has one token per line. the document text is used
    to find out how many tokens each line of the document has, and an empty
    line is inserted in the annotation after the tokens of every line.

    Args:
        item (DataItem): aymurai dataitem (it is not modified)
        annotation (str): CoNLL annotation corresponding to item

    Returns:
        DataItem: copy of the dataitem with the CoNLL annotation stored in
            `item["annotations"]["conll"]`
    """
    result = deepcopy(item)

    text = result["data"]["doc.text"]

    # NOTE this normalization should be done as part of the preprocessing
    normalization = (
        # tabs and non-breaking spaces -> white space
        (r"(?:\t|\xa0)+", " "),
        # runs of white space (new lines excluded) -> one white space
        (r"[^\S\r\n]+", " "),
        # runs of new lines -> one new line
        (r"\n+", "\n"),
    )
    for pattern, replacement in normalization:
        text = re.sub(pattern, replacement, text)

    # position (in the token list) where each paragraph break goes: the
    # running token count, shifted by the breaks already inserted before it
    break_positions = []
    tokens_so_far = 0
    for line_number, line in enumerate(text.splitlines()):
        tokens_so_far += len(line.split())
        break_positions.append(tokens_so_far + line_number)

    # one `token tag` row per token
    rows = annotation.splitlines()
    for position in break_positions:
        rows.insert(position, "\n")

    # rebuild the annotation, leaving exactly one empty line between paragraphs
    conll = "\n".join(rows)
    conll = re.sub("\n{3,}", "\n\n", conll)

    result["annotations"]["conll"] = conll
    return result


def annotation_to_dataitem(annotation: dict) -> DataItem:
    """
    build an aymurai dataitem out of one labelstudio document

    Args:
        annotation (dict): labelstudio document (text, meta info and
            annotations)

    Returns:
        DataItem: aymurai dataitem with the keys `path`, `data`, `metadata`
            (document categories) and `annotations` (entities)
    """
    source = annotation["data"]
    path = source["meta_info"]["path"]
    text = source["text"]

    parsed = parse_annots(annotation)

    return {
        "path": path,
        "data": {"doc.text": text},
        "metadata": parsed["categories"],
        "annotations": {"entities": parsed["entities"]},
    }


def load_annotations(basepath: str) -> list[DataItem]:
    """
    load all annotations from `basepath`. this directory must contain
    the annotations both in json and conll formats.
    internally, use glob to look for all the annotation files inside `basepath`.

    json documents and conll documents are paired by position, so both
    exports must list the documents in the same order.

    Args:
        basepath (str): path where to look for the annotations (json and conll files)

    Returns:
        list[DataItem]: list of dataitems (aymurai format)

    Warns:
        UserWarning: if the number of json documents and conll documents
            differ (the extra documents are dropped)
    """
    import warnings

    # glob returns the paths in arbitrary order; sort them so the positional
    # pairing with the conll documents does not change between runs
    paths = sorted(glob(f"{basepath}/*.json"))

    conll_annots = load_conll_annots(basepath)

    # a json export can hold one document or a list of them: flatten to dicts
    documents = collapse(map(load_json, paths), base_type=dict)
    items = [annotation_to_dataitem(document) for document in documents]

    # zip stops at the shortest input; make the truncation visible instead of
    # silently dropping (or mispairing) documents
    if len(items) != len(conll_annots):
        warnings.warn(
            f"found {len(items)} json documents but {len(conll_annots)} conll "
            f"documents in '{basepath}'; unpaired documents are dropped",
            stacklevel=2,
        )

    return [
        parse_conll_annots(item, conll)
        for item, conll in zip(items, conll_annots)
    ]


In [None]:
%%export aymurai.datasets.ar_juz_pcyf_10.annotations

from collections import UserList

from aymurai.datasets.ar_juz_pcyf_10.labelstudio.utils import load_annotations


class ArgentinaJuzgadoPCyF10LabelStudioAnnotations(UserList):
    """
    list-like dataset with the labelstudio annotations (aymurai dataitems)

    Args:
        basepath (str | list | None): directory that holds the labelstudio
            export (json and conll files). a list (or another UserList) of
            already loaded dataitems is also accepted, and `None` builds an
            empty dataset.
    """

    def __init__(self, basepath: "str | list | None" = None):
        # UserList builds the result of slicing, `copy()`, `+` and `*` by
        # calling the class with a single sequence argument (or none), so the
        # constructor must accept those besides a path. otherwise
        # `dataset[:10]` would try to load annotations from a list
        if basepath is None:
            super().__init__()
        elif isinstance(basepath, (list, tuple, UserList)):
            super().__init__(basepath)
        else:
            super().__init__(load_annotations(basepath))

# Testing

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import ArgentinaJuzgadoPCyF10LabelStudioAnnotations

In [None]:
# load the labelstudio export (json + conll files) stored in this directory
# NOTE(review): hardcoded absolute path — presumably a local test fixture;
# confirm it exists in your environment before running
dataset = ArgentinaJuzgadoPCyF10LabelStudioAnnotations('/test/api/mock-response/input')

In [None]:
%%export aymurai.transforms.misc.annot2pred

from copy import deepcopy

from aymurai.meta.types import DataItem
from aymurai.meta.pipeline_interfaces import Transform


class DummyAnnotToPred(Transform):
    """dummy transform that exposes the annotated entities as predictions"""

    def __call__(self, item: DataItem) -> DataItem:
        """
        copy the item and set its predictions from its annotations

        Args:
            item (DataItem): aymurai dataitem (it is not modified)

        Returns:
            DataItem: copy of the item. when it has annotations, their
                entities are also available under `predictions`
        """
        result = deepcopy(item)
        if "annotations" in result:
            entities = result["annotations"]["entities"]
            result["predictions"] = {"entities": entities}
        return result

In [None]:
from aymurai.pipeline import AymurAIPipeline
from aymurai.transforms.entities import FilterEntity
from aymurai.transforms.misc.annot2pred import DummyAnnotToPred
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import TextNormalize

# pipeline definition: every stage is a list of (transform class, kwargs) pairs
config = {
    # extract and normalize the text, then expose the gold annotations as
    # predictions (DummyAnnotToPred) so no model is needed
    "preprocess": [
        (
            FulltextExtract,
            {
                "errors": "ignore",
                "use_cache": False,
            },
        ),
        (TextNormalize, {}),
        (DummyAnnotToPred, {}),
        ],
    # no models: the "predictions" come straight from the annotations
    "models": [],
    "postprocess": [
        (
            FilterEntity,
            {
                # we skip DECISION because we know that it overlaps with other entities
                # NOTE(review): presumably FilterEntity drops the listed labels — confirm
                "entities": ["DECISION"],
            },
        )
    ],
    "multiprocessing": {},
    "use_cache": True,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)


In [None]:
# persist the pipeline so it can be reloaded in the next cell
# NOTE(review): hardcoded absolute path — confirm it is writable in your environment
pipeline.save('/resources/pipelines/examples/dummy-annot2pred')

In [None]:
# reload the pipeline from the path it was saved to (replaces the in-memory
# `pipeline`), so the following cells exercise the saved version
pipeline = AymurAIPipeline.load('/resources/pipelines/examples/dummy-annot2pred')

In [None]:
# run the three pipeline stages in order over the annotated dataset; each
# stage receives the output of the previous one
processed = pipeline.preprocess(dataset)
processed = pipeline.predict(processed)
processed = pipeline.postprocess(processed)

In [None]:
from aymurai.utils.display import DocRender
# render the first processed document
# NOTE(review): presumably displays the document with its predicted entities — confirm
render = DocRender()

render(processed[0])