In [None]:
# Reload edited modules automatically before each cell runs, so changes to
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Manually maintained label table; its "label" and "mapping" columns are
# turned into a lookup dict below.
IA2_LABELS = pd.read_csv(
    "/resources/data/restricted/anonymization/ia2_labels_manual.csv"
)
# Build the lookup column-wise instead of with iterrows(), which creates a
# Series per row. As before, a repeated label keeps its last mapping.
IA2_LABELS_MAPPING = dict(zip(IA2_LABELS["label"], IA2_LABELS["mapping"]))

# One row per document. Later cells read the "alignment_path",
# "matching_path", "tomo" and "nro_registro" columns from this frame.
data = pd.read_csv("/resources/data/restricted/anonymization/matching.csv")
data

In [None]:
# Show the label table loaded from ia2_labels_manual.csv.
IA2_LABELS

In [None]:
import re
import os
import functools
from collections import Counter

from aymurai.utils import alignment
from aymurai.text.extraction import extract_document

OUTPUT_DIR = "/resources/data/restricted/anonymization"

# For every document listed in `data`:
#   1. load its alignment table and the extracted original document,
#   2. derive a per-row "label" column from the rows where the anonymized text
#      differs from the original,
#   3. write the labelled table to OUTPUT_DIR/annotation/<tomo>_<nro_registro>.csv
#      and record that path in data["annotation_path"],
#   4. keep the table (plus the document's metadata columns) for `full_data`.
annotated_frames = []
for i, row in tqdm(data.iterrows(), total=len(data)):
    mapping = pd.read_csv(row["alignment_path"])
    original = extract_document(row["matching_path"])

    mapping = alignment.core.add_empty_lines_between_paragraphs(original, mapping)

    # Rows whose text was changed by the anonymization. Computed once and
    # reused below; nothing between here and the labelling step alters
    # these two columns.
    diff = mapping["original"] != mapping["anonymized"]

    # Distinct "<...>" placeholders found in the changed rows, normalized.
    diff_text = "".join(mapping.loc[diff, "anonymized"].fillna(""))
    labels = set(re.findall(r"<\w+>", diff_text))
    labels = [alignment.ia2.normalize(label) for label in labels]

    _norm_ia2_label = functools.partial(alignment.ia2.norm_ia2_label, labels=labels)

    # Changed rows start from their anonymized text; unchanged rows stay NaN
    # and end up as "O" after the final fillna.
    mapping.loc[diff, "label"] = mapping.loc[diff, "anonymized"]
    mapping["label"] = mapping["label"].apply(_norm_ia2_label)
    mapping["label"] = mapping["label"].apply(IA2_LABELS_MAPPING.get)
    # NOTE(review): presumably adds B-/I- style prefixes, judging by the helper
    # name and the prefix-aware sort used later in the notebook -- confirm.
    mapping["label"] = alignment.ia2.label_to_conll_format(mapping["label"])
    mapping["label"] = mapping["label"].fillna("O")

    # patch blank lines in labels: rows with an empty original get "" in
    # every column, including "label"
    mask = mapping["original"] == ""
    mapping.loc[mask] = ""

    # export mapping
    filename = f"{OUTPUT_DIR}/annotation/{row['tomo']}_{row['nro_registro']}.csv"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    data.loc[i, "annotation_path"] = filename
    mapping.to_csv(filename, index=False)

    # Attach the document-level metadata to every row. This happens after the
    # export, so the CSV on disk does not contain these columns.
    mapping.loc[:, row.index] = row.values
    annotated_frames.append(mapping)

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# all accumulated rows on every iteration. pd.concat raises on an empty
# list, so fall back to an empty frame when `data` has no rows.
full_data = (
    pd.concat(annotated_frames, ignore_index=True)
    if annotated_frames
    else pd.DataFrame()
)

In [None]:
from aymurai.utils.display.pandas import pandas_context
from aymurai.text.extraction import extract_document
from rich.pretty import pprint
import functools
import re

# Display options passed to pandas_context when rendering the example below.
# A dict literal keeps only the last value of a repeated key, so
# "display.max_rows" is listed once with its effective value (None).
options = {
    "display.max_rows": None,
    "display.max_columns": 500,
    "display.width": 0,
}

idx = 0

# Pick one random document; the commented alternatives select a fixed row
# or the last row processed by the annotation loop.
example = data.sample(1).to_dict("records")[0]
# example = data.iloc[idx].to_dict()
# example = row.to_dict()

# Load the exported annotation, showing missing values as empty strings.
mapping = pd.read_csv(example["annotation_path"]).fillna("")
pprint(example)


with pandas_context(**options):
    display(mapping)

In [None]:
from datasets import Dataset, DatasetDict

# Work on a copy so `full_data` keeps its plain string labels.
full_data_ = full_data.copy()

# Every distinct label except the outside tag "O" and the blank-line marker "".
# discard() is used instead of remove() so that a dataset with no "O" or no
# blank rows does not raise KeyError.
available_labels = set(full_data_["label"])
available_labels.discard("O")
available_labels.discard("")

# "O" comes first (code 0). The remaining labels are ordered by the text after
# their two-character prefix, then by the first character of the prefix.
categories = ["O"] + sorted(available_labels, key=lambda x: (x[2:], x[0]))


# Values not listed in `categories` (such as "") become NaN in the
# categorical column and get code -1.
full_data_["label"] = pd.Categorical(full_data_["label"], categories)
full_data_["label_codes"] = full_data_["label"].cat.codes

In [None]:
# Show the concatenated per-document annotations built by the annotation loop.
full_data

---

In [None]:
# Full paths of every entry in the annotation output directory.
annots_dir = os.path.join(OUTPUT_DIR, "annotation")
# os.listdir returns entries in arbitrary order; sorting makes the preview
# below and the later iteration order reproducible between runs.
annots = sorted(os.path.join(annots_dir, file) for file in os.listdir(annots_dir))
annots[:5]

In [None]:
# Collect the annotation files that contain at least one row which
#   - starts with a slash-separated sequence such as "word/word/",
#   - was changed by the anonymization, and
#   - is nevertheless labelled "O".
to_review = []

for annot in annots:
    csv = pd.read_csv(annot)

    # Missing values are skipped by dropna(); after index alignment they stay
    # NaN in the "slashes" column.
    csv["slashes"] = csv["anonymized"].dropna().map(
        lambda text: re.match(r"(?:\w+(?!>)/\w+/)+", text) is not None
    )

    # Explicit comparison with True so that NaN entries count as False.
    has_slashes = csv["slashes"] == True
    was_changed = csv["original"] != csv["anonymized"]
    is_unlabelled = csv["label"] == "O"

    if (has_slashes & was_changed & is_unlabelled).any():
        to_review.append(annot)

In [None]:
# Number of annotation files collected in to_review.
len(to_review)

In [None]:
sorted(to_review)  # These need to be removed