In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import srsly
import pandas as pd

from glob import glob
from tqdm.auto import tqdm

In [None]:
# Load the public data set, drop rows that lack any of the three key fields,
# lower-case the column names and store `tomo` as an integer.
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20230630/set_de_datos_con_perspectiva_de_genero.csv"
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
    .assign(tomo=lambda frame: frame["tomo"].astype(int))
)


def fix_date(text: str):
    """Parse an underscore-separated date string into a ``pd.Timestamp``.

    The three integer fields are zero-padded to at least two digits, joined
    with "/" and handed to ``pd.to_datetime``, which infers the field order.

    NOTE(review): the field order of the raw data is not visible here; for
    day-first values with day <= 12 pandas' default inference may read the
    first field as the month -- confirm against the source data.

    Args:
        text: Raw value such as "2021_3_5".

    Returns:
        The parsed timestamp, or None when ``text`` is not a string, does not
        have exactly three "_"-separated fields, contains a non-integer
        field, or cannot be interpreted as a date.
    """
    if not isinstance(text, str):
        return None

    parts = text.split("_")
    if len(parts) != 3:
        return None

    try:
        # int() sits inside the try so a non-numeric field yields None
        # instead of aborting the conversion of the whole column.
        normalized = "/".join(f"{int(part):02d}" for part in parts)
        return pd.to_datetime(normalized)
    except (ValueError, TypeError, OverflowError):
        return None


# Run every raw `fecha_resolucion` value through fix_date and store the
# results back in the same column; tqdm only adds a progress bar.
public["fecha_resolucion"] = [
    fix_date(date) for date in tqdm(public["fecha_resolucion"])
]

# Display the cleaned frame.
public

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)

# Annotation set built from the /workspace/resources/annotations directory;
# its `.data` attribute is iterated when the metadata frame is built.
annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations(
    "/workspace/resources/annotations"
)

In [None]:
from aymurai.utils.misc import get_element


def text_normalize(text: str):
    """Return ``text`` with every escaped slash (a backslash followed by "/")
    replaced by a plain "/".

    The backslash is written as an explicit ``\\`` because ``"\\/"`` spelled
    with a single backslash is an invalid escape sequence that newer Python
    versions warn about.
    """
    return text.replace("\\/", "/")


# One record per annotated document: its metadata and data dictionaries merged
# together, plus the path of the original file.
metadata = pd.DataFrame(
    [
        doc["metadata"]
        | doc["data"]
        | {
            "original_path": doc["path"],
            # "conll": get_element(doc, levels=["annotations", "conll"]),
        }
        for doc in annotations.data
    ]
).dropna(subset=["tomo"])

# Keep only rows whose `tomo` is a numeric string. The explicit copy makes the
# column assignments below operate on an independent frame rather than on a
# slice of the unfiltered one (avoids SettingWithCopyWarning).
metadata = metadata[metadata["tomo"].str.isnumeric()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)

# Un-escape "\/" in the paths and in the document text.
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# A resolution is identified by (tomo, nro_registro); keep its first occurrence.
metadata = metadata.drop_duplicates(["tomo", "nro_registro"])
metadata

In [None]:
# Inner join of the public data set with the annotation metadata on
# (nro_registro, tomo), keeping a single row per resolution.
x = public.merge(metadata, on=["nro_registro", "tomo"], how="inner").drop_duplicates(
    ["tomo", "nro_registro"]
)
x.info()

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

# Register tqdm's pandas integration.
tqdm.pandas()

# Directory in which get_file looks for the anonymized documents; created
# here if it does not exist yet.
# NOTE(review): earlier cells read from /workspace/resources/... while this
# path starts at /resources/... -- confirm both roots are intended.
outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


def get_file(url: str, directory=None):
    """Return the local path of the cached document that belongs to ``url``.

    The cache file is named after the 15-byte BLAKE2b hex digest of the URL
    and lives in ``directory``.

    The download step is currently disabled (it used gdown, i.e.
    ``gdown --fuzzy -q --continue -O <fname> <url>``): some files require
    access permissions and others are corrupt. The function therefore only
    reports documents that are already present on disk.

    Args:
        url: Link to the document. Non-string values (e.g. NaN) are ignored.
        directory: Cache directory; defaults to the notebook-level ``outdir``.

    Returns:
        The path of the cached file, or None when ``url`` is not a string or
        the file is not available locally.
    """
    if not isinstance(url, str):
        return None

    if directory is None:
        directory = outdir

    fname = f"{directory}/{blake2b(url.encode(), digest_size=15).hexdigest()}"

    # Without a download step a path is only meaningful if the file exists;
    # returning None lets callers detect missing documents with .notna().
    return fname if os.path.exists(fname) else None


# Resolve every link in x["link"] through get_file using a joblib pool of
# 50 threads; `path` collects one result per link.
parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
path = parallel(get_file_(url) for url in tqdm(x["link"]))

In [None]:
# Attach the get_file results (computed in the previous cell) to the frame.
x["anonymized_path"] = path

In [None]:
# Show five random rows (no random_state is set, so the sample changes per run).
x.sample(5)

In [None]:
# Column dtypes and non-null counts after adding anonymized_path.
x.info()

In [None]:
# Keep resolutions dated 2021-2022 that have an anonymized version.
mask = (
    (x["fecha_resolucion"].dt.to_period("Y") >= "2021")
    & (x["fecha_resolucion"].dt.to_period("Y") <= "2022")
    & (x["anonymized_path"].notna())
)
# .copy(): later cells add and overwrite columns on `x`; without it those
# assignments target a slice of the unfiltered frame (SettingWithCopyWarning).
x = x.loc[mask].copy()
x

In [None]:
# Number of non-null anonymized paths per resolution year.
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["anonymized_path"].count()

In [None]:
import textract

# Rows dated before this timestamp are rejected by ia2().
IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Extract the content of ``path`` with textract, treating the file as
    ODT, and return it decoded as UTF-8 text."""
    return textract.process(path, extension="odt").decode("utf-8")


def ia2(row):
    """Tell whether the anonymized document of ``row`` contains no "XX".

    Returns False when the resolution date is earlier than IA2_START_DATE or
    when the document cannot be loaded; otherwise True exactly when the
    substring "XX" does not occur in the document text.
    """
    doc_path, resolved_on = row["anonymized_path"], row["fecha_resolucion"]

    if resolved_on < IA2_START_DATE:
        return False

    try:
        text = load_doc(doc_path)
    except Exception:
        # Any loading problem counts as "not usable".
        return False

    return "XX" not in text

In [None]:
# Flag each resolution with ia2(). `total` gives tqdm a known length (a bare
# iterrows() generator has none), matching the other progress loops in this
# notebook.
x["ia2"] = [ia2(row) for _, row in tqdm(x.iterrows(), total=len(x))]

In [None]:
# Distribution of the resolution dates among the rows where `ia2` is True.
x.query("ia2")["fecha_resolucion"].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)

In [None]:
# Keep only the resolutions flagged by ia2.
# The mask is rebuilt from the current frame: the previous `mask` is indexed
# like the frame *before* the 2021-2022 filter, so combining it with x["ia2"]
# relied on implicit index alignment and on the cells' execution order.
mask = x["ia2"].astype(bool)
# .copy() so that later column assignments do not act on a slice.
x = x.loc[mask].copy()
x.info()

In [None]:
# Point the original paths at the current directory layout.
# The negative look-ahead skips paths that already contain "/resources/data/",
# so re-running this cell no longer yields "/resources/data/data/...".
# NOTE(review): this assumes no original path legitimately starts out with
# "/resources/data/" -- confirm against the annotation paths.
x["original_path"] = (
    x["original_path"]
    .str.replace(r"/resources/(?!data/)", "/resources/data/", regex=True)
    .str.replace(
        "/RESOLUCIONES DEL JUZGADO - DOCS/",
        "/RESOLUCIONES DEL JUZGADO/",
        regex=False,
    )
)

In [None]:
import os
import re
from aymurai.utils import alignment
from collections import Counter


OUTPUT_DIR = "/resources/data/restricted/anonymization"
# Created once, outside the loop; making the sub-directory also creates
# OUTPUT_DIR itself.
os.makedirs(f"{OUTPUT_DIR}/alignment", exist_ok=True)

# Running tally of the normalized <TAG> placeholders found in changed tokens.
labels = Counter()

# Positional index 0..n-1 so that x.loc[i, ...] below addresses the row that
# is currently being iterated.
x.reset_index(drop=True, inplace=True)

for i, row in tqdm(x.iterrows(), total=len(x)):
    # Align the original document with its anonymized counterpart.
    mapping = alignment.align_docs(
        row["original_path"],
        row["anonymized_path"],
        columns=("original", "anonymized"),
        target_preprocess=alignment.ia2.ia2_text_preprocess,
    )

    # Alignment score: share of aligned rows whose text is identical.
    diff = mapping["original"] != mapping["anonymized"]
    x.loc[i, "alignment_score"] = (~diff).mean()

    # Export the alignment.
    filename = f"{OUTPUT_DIR}/alignment/{row['tomo']}_{row['nro_registro']}.csv"
    x.loc[i, "alignment_path"] = filename
    mapping.to_csv(filename, index=False)

    # Collect the placeholders that appear in the changed tokens. fillna("")
    # guards str.join against missing (NaN) cells, which would raise TypeError.
    diff_text = "".join(mapping.loc[diff, "anonymized"].fillna(""))
    labels.update(
        alignment.ia2.normalize(label) for label in re.findall(r"<\w+>", diff_text)
    )

In [None]:
# Columns shown when inspecting the alignment results.
columns = [
    "nro_registro",
    "tomo",
    "fecha_resolucion",
    "n_expte_eje",
    "original_path",
    "anonymized_path",
    "doc.text",
    "alignment_path",
    "alignment_score",
]

x[columns].head()

In [None]:
# Same columns, best-aligned resolutions first.
x[columns].sort_values("alignment_score", ascending=False)

In [None]:
# Summary statistics of the alignment score, including tail percentiles.
x["alignment_score"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

In [None]:
# Alignment CSVs of the resolutions whose alignment score is below 0.7.
x.query("alignment_score < 0.7")["alignment_path"].values

In [None]:
import re
import os
import functools
from collections import Counter

from aymurai.utils import alignment
from aymurai.text.extraction import extract_document


# Manually curated table with a `label` column and the `mapping` each label
# is translated to.
IA2_LABELS = pd.read_csv(
    "/resources/data/restricted/anonymization/ia2_labels_manual.csv"
)
# Built with zip instead of an iterrows() comprehension whose loop variable
# was called `x`, the same name as the main frame.
IA2_LABELS_MAPPING = dict(zip(IA2_LABELS["label"], IA2_LABELS["mapping"]))

# Created once, outside the loop (uses OUTPUT_DIR defined in an earlier cell).
os.makedirs(f"{OUTPUT_DIR}/annotation", exist_ok=True)

for i, row in tqdm(x.iterrows(), total=len(x)):
    mapping = pd.read_csv(row["alignment_path"])
    original = extract_document(row["original_path"])

    mapping = alignment.core.add_empty_lines_between_paragraphs(original, mapping)

    # Rows where the anonymized token differs from the original one.
    diff = mapping["original"] != mapping["anonymized"]
    diff_text = "".join(mapping.loc[diff, "anonymized"].fillna(""))

    # Distinct placeholders of this document. Kept in `doc_labels` so the
    # notebook-level `labels` Counter from the alignment cell is not
    # overwritten.
    doc_labels = [
        alignment.ia2.normalize(label)
        for label in set(re.findall(r"<\w+>", diff_text))
    ]
    _norm_ia2_label = functools.partial(
        alignment.ia2.norm_ia2_label, labels=doc_labels
    )

    # Start from the anonymized text on changed rows, normalize it, translate
    # it through the manual mapping and convert it to CoNLL-style tags.
    mapping.loc[diff, "label"] = mapping.loc[diff, "anonymized"]
    mapping["label"] = mapping["label"].apply(_norm_ia2_label)
    mapping["label"] = mapping["label"].apply(
        lambda value: IA2_LABELS_MAPPING.get(value.strip())
        if isinstance(value, str)
        else None
    )
    mapping["label"] = alignment.ia2.label_to_conll_format(mapping["label"])
    mapping["label"] = mapping["label"].fillna("O")

    # Show documents with more than one NUM_DOMINIO row for manual inspection.
    num_dominio = mapping.loc[mapping["label"].str.contains("NUM_DOMINIO")]
    if len(num_dominio) > 1:
        display(num_dominio)

    # Patch blank lines: every column of a blank row becomes "".
    mask = mapping["original"] == ""
    mapping.loc[mask] = ""

    # Export the annotated mapping.
    filename = f"{OUTPUT_DIR}/annotation/{row['tomo']}_{row['nro_registro']}.csv"
    x.loc[i, "annotation_path"] = filename
    mapping.to_csv(filename, index=False)

    # NOTE(review): this runs after the CSV has been written and `mapping` is
    # replaced on the next iteration, so it looks like it has no lasting
    # effect -- confirm whether it can be removed.
    mapping.loc[:, row.index] = row.values

In [None]:
# Directory that holds the per-resolution annotation CSVs.
ANNOT_DIR = "/resources/data/restricted/anonymization/annotation/"

# Annotation files left out of the data set built below.
TO_REMOVE = [
    "38_3531.csv",
    "42_3831.csv",
    "36_3404.0.csv",
    "36_3406.0.csv",
    "36_3416.0.csv",
    "37_3449.0.csv",
    "38_3519.0.csv",
    "40_3724.0.csv",
    "41_3772.0.csv",
    "41_3793.0.csv",
    "41_3800.0.csv",
    "42_3821.1.csv",
    "43_3918.0.csv",
    "44_4044.0.csv",
    "44_4049.0.csv",
    "47_4234.0.csv",
    "47_4235.0.csv",
    "47_4319.0.csv",
    "50_4594.0.csv",
    "40_3721.csv",  # the problem is the page footnote
    "39_3613.csv",  # the problem is the page footnote
    "40_3674.csv",  # the problem is the page footnote
    "35_3328.csv"  # the problem is the page footnote
]

In [None]:
# Every entry of ANNOT_DIR that is not listed in TO_REMOVE.
data = list(filter(lambda name: name not in TO_REMOVE, os.listdir(ANNOT_DIR)))
len(data)

In [None]:
# Alphabetical listing of the retained annotation files.
sorted(data)

In [None]:
from aymurai.utils.display.pandas import pandas_context
from rich.pretty import pprint
import functools
import random
import re


# pandas display options for pandas_context.
# "display.max_rows" used to appear twice (500 and None); Python keeps the
# last value of a duplicated key, so only the effective entry (None) is kept.
options = {
    "display.max_rows": None,
    "display.max_columns": 500,
    "display.width": 0,
}

# Pick one annotation file at random and load it with missing cells as "".
example = random.choice(data)
mapping = pd.read_csv(os.path.join(ANNOT_DIR, example)).fillna("")
pprint(example)

# with pandas_context(**options):
# display(mapping.head())

In [None]:
# Rows whose anonymized text starts with one or more "<TAG>|word|" groups
# although their label is "O".
mask = mapping["anonymized"].map(
    lambda token: bool(re.match(r"(?:<\w+>\|<?\w+>?\|)+", token))
) & mapping["label"].eq("O")

mapping.loc[mask]

In [None]:
# Stack all retained annotation files into one frame, tagging every row with
# the file it came from. Each CSV is read once (it used to be read a second
# time just to obtain its length for the `filename` column).
full_data = pd.concat(
    [
        pd.read_csv(os.path.join(ANNOT_DIR, csv)).assign(filename=csv)
        for csv in data
    ],
    ignore_index=True,
)
full_data

In [None]:
# Despite its name, `full_data_nan` holds the rows of full_data that have NO
# missing value in any column (as an independent copy).
full_data_nan = full_data.dropna().copy()
full_data_nan.info()

In [None]:
# Rows whose anonymized text starts with one or more "<TAG>|word|" groups
# although their label is "O".
mask = full_data_nan["anonymized"].map(
    lambda token: bool(re.match(r"(?:<\w+>\|<?\w+>?\|)+", token))
) & full_data_nan["label"].eq("O")

# with pandas_context(**options):
#     display(full_data.loc[mask])

In [None]:
# Percentage of the complete (NaN-free) rows selected by `mask`.
len(full_data_nan.loc[mask]) / len(full_data_nan) * 100

In [None]:
# Clear the label of the rows selected by `mask` in the NaN-free copy.
full_data_nan.loc[mask, "label"] = None

In [None]:
# Number of labels that are now missing in the copy.
full_data_nan["label"].isna().sum()

In [None]:
# Clear the same labels in full_data, addressing the rows by the index labels
# selected in the NaN-free copy.
full_data.loc[full_data_nan.loc[mask].index, "label"] = None
full_data.info()

In [None]:
# Frequency of every label, B-/I- prefixes included.
full_data["label"].value_counts()

In [None]:
# Frequency of every label after stripping a leading "B-" or "I-".
full_data["label"].dropna().map(lambda x: re.sub(r"^[BI]-", "", x)).value_counts()

In [None]:
# Complete rows whose label mentions NUM_DOMINIO.
full_data.dropna().loc[lambda rows: rows["label"].str.contains("NUM_DOMINIO")]

In [None]:
# Distinct anonymized texts among the complete rows whose label matches "NUM_$".
nums = (
    full_data.dropna()
    .loc[lambda rows: rows["label"].str.contains("NUM_$"), "anonymized"]
    .unique()
)
nums

In [None]:
# Boolean mask over the complete (NaN-free) rows: True where the label
# matches the pattern "NUM_$".
mask_num = full_data.dropna()["label"].str.contains("NUM_$")

In [None]:
# NUM_ rows whose anonymized text mentions EXPEDIENTE or CAUSA.
mask_expte = mask_num & (
    full_data.dropna()["anonymized"].str.contains(
        r"EXPEDIENTE|CAUSA", regex=True
    )
)

In [None]:
# Inspect the rows selected by mask_expte.
full_data.dropna().loc[mask_expte]

In [None]:
# Rewrite "NUM_" to "NUM_EXPEDIENTE" in the labels of the rows selected by
# mask_expte.
expte_idx = full_data.dropna().loc[mask_expte].index
full_data.loc[expte_idx, "label"] = full_data.loc[expte_idx, "label"].str.replace(
    "NUM_", "NUM_EXPEDIENTE"
)

In [None]:
# NUM_ rows whose anonymized text mentions DNI.
mask_dni = mask_num & (full_data.dropna()["anonymized"].str.contains("DNI"))

In [None]:
# Inspect the rows selected by mask_dni.
full_data.dropna().loc[mask_dni]

In [None]:
# Relabel the NUM_ rows that mention DNI.
# mask_num was computed before any relabelling, so a selected row may already
# carry "NUM_EXPEDIENTE". Anchoring the pattern at the end of the label only
# rewrites labels that still end in a bare "NUM_" and leaves the others
# untouched (instead of producing e.g. "B-DNIEXPEDIENTE").
dni_idx = full_data.dropna().loc[mask_dni].index
full_data.loc[dni_idx, "label"] = full_data.loc[dni_idx, "label"].str.replace(
    r"NUM_$", "DNI", regex=True
)

In [None]:
# NUM_ rows whose anonymized text mentions CUIJ.
mask_cuij = mask_num & (full_data.dropna()["anonymized"].str.contains("CUIJ"))

In [None]:
# Inspect the rows selected by mask_cuij.
full_data.dropna().loc[mask_cuij]

In [None]:
# Relabel the NUM_ rows that mention CUIJ.
# mask_num was computed before any relabelling, so a selected row may already
# carry "NUM_EXPEDIENTE" or "DNI". Anchoring the pattern at the end of the
# label only rewrites labels that still end in a bare "NUM_" (instead of
# producing e.g. "B-CUIJEXPEDIENTE").
cuij_idx = full_data.dropna().loc[mask_cuij].index
full_data.loc[cuij_idx, "label"] = full_data.loc[cuij_idx, "label"].str.replace(
    r"NUM_$", "CUIJ", regex=True
)

In [None]:
# Rows from mask_num whose current label still ends with "NUM_".
mask_num_ = mask_num & (full_data.dropna()["label"].str.endswith("NUM_"))

In [None]:
# Rewrite "NUM_" to "NUM" in the labels of the rows selected by mask_num_.
num_idx = full_data.dropna().loc[mask_num_].index
full_data.loc[num_idx, "label"] = full_data.loc[num_idx, "label"].str.replace(
    "NUM_", "NUM"
)

In [None]:
# FIXME: this mapping should be derived automatically from IA2_LABELS_MAPPING
# full_data["label"] = full_data["label"].str.replace("NUM_ACTUACION", "NUM_EXPEDIENTE")
full_data["label"] = (
    full_data["label"]
    .str.replace("DOMINIO_PATENTE", "PATENTE_DOMINIO")
    .str.replace("NUM_DOMINIO", "PATENTE_DOMINIO")
)

In [None]:
# Complete rows whose label mentions HECHO.
full_data.dropna().loc[full_data.dropna()["label"].str.contains("HECHO")]

In [None]:
# Replace every occurrence of "HECHO" in the labels with "FECHA".
# NOTE(review): this is a substring replacement -- confirm that no label
# contains "HECHO" as part of a longer name.
full_data["label"] = full_data["label"].str.replace("HECHO", "FECHA")

In [None]:
# Label frequencies (B-/I- prefix stripped) after the relabelling steps.
full_data["label"].dropna().map(lambda x: re.sub(r"^[BI]-", "", x)).value_counts()

In [None]:
# Rows labelled exactly "I-NUM".
full_data.query("label == 'I-NUM'")

In [None]:
# TODO: clear the labels of connectors such as "y/o".
# Lists the rows whose original token is "y" or "Y" and whose label is not "O".
full_data.query("(original == 'y' or original == 'Y') and (label != 'O')")

In [None]:
# Rows that have an original token but no label.
full_data.loc[(full_data["label"].isna()) & (full_data["original"].notna())]

## TODOs

1. Remover de cada resolución los párrafos con etiquetas ambiguas y persistir los csvs individuales. Pensar en cómo hacerlo considerando los índices de inicio y de fin de cada párrafo a remover.
2. Hacer el train - val - test split con proporciones 0.75 - 0.125 - 0.125, que da aproximadamente 418 - 70 - 70 resos.
3. Shufflear los párrafos de train.
4. Eliminar párrafos duplicados en todos los sets.
5. Entrenar y validar con Flair y con LORA.
6. Hacer downsampling de los párrafos sin etiquetas y repetir.

In [None]:
# Annotation files that contain at least one row which looks like a
# "word|word|" group, differs from the original text and is labelled "O".
to_review = []

for annot in data:
    csv = pd.read_csv(os.path.join(ANNOT_DIR, annot))
    csv["pipes"] = (
        csv["anonymized"]
        .dropna()
        .map(lambda token: bool(re.match(r"(?:\w+(?!>)\|\w+\|)+", token)))
    )
    if (
        csv["pipes"].eq(True)
        & csv["original"].ne(csv["anonymized"])
        & csv["label"].eq("O")
    ).any():
        to_review.append(annot)

to_review

In [None]:
# Files flagged for manual review.
to_review

In [None]:
# Number of files flagged for manual review.
len(to_review)