In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

# load public data

In [None]:
# Read the public dataset dump, discard rows missing any of the key columns,
# lower-case every column name and cast `tomo` to int (it is a merge key below).
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20230630/set_de_datos_con_perspectiva_de_genero.csv"
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
    .assign(tomo=lambda frame: frame["tomo"].astype(int))
)


def fix_date(text: str):
    """Parse an underscore-separated date string into a ``pd.Timestamp``.

    The three ``_``-separated components are converted to integers,
    zero-padded to at least two digits, joined with ``/`` and handed to
    ``pd.to_datetime``.

    Args:
        text: Date string with exactly three integer components,
            e.g. ``"1_2_2021"``.

    Returns:
        The parsed timestamp, or ``None`` when ``text`` is not a string, does
        not have exactly three components, contains a non-integer component,
        or cannot be parsed by pandas.
    """
    if not isinstance(text, str):
        return None

    parts = text.split("_")
    if len(parts) != 3:
        return None

    try:
        # int() lives inside the try so a non-numeric component yields None
        # instead of aborting the whole column conversion.
        padded = [f"{int(part):02d}" for part in parts]
        # NOTE(review): with default arguments pandas reads an ambiguous
        # "aa/bb/yyyy" string month-first; confirm this matches the component
        # order used by the dataset.
        return pd.to_datetime("/".join(padded))
    except (ValueError, TypeError, OverflowError):
        return None


# Convert every raw `fecha_resolucion` value with fix_date (progress shown by
# tqdm); values fix_date cannot handle come back as None.
public["fecha_resolucion"] = list(map(fix_date, tqdm(public["fecha_resolucion"])))

public

# load annotations

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)
from aymurai.datasets.ar_juz_pcyf_10.public import ArgentinaJuzgadoPCyF10PublicDataset

# Instantiate the annotations dataset from the "20221130-bis" directory of
# restricted data. Its `.data` attribute is iterated below to build `metadata`.
# NOTE(review): presumably a Label Studio export, judging by the class name —
# confirm against the aymurai dataset implementation.
annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations(
    "/workspace/resources/data/restricted/annotations/20221130-bis"
)

# merge and filtering

In [None]:
from aymurai.utils.misc import get_element


def text_normalize(text: str):
    """Replace every escaped slash (a backslash followed by ``/``) with ``/``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each backslash-slash pair collapsed to a single slash;
        all other characters are left untouched.
    """
    # "\\/" is the explicit spelling of backslash + slash; the previous "\/"
    # relied on an invalid escape sequence, which newer Python versions warn about.
    return text.replace("\\/", "/")


# Flatten every annotated item into one record: its `metadata` and `data`
# dicts merged (on a key clash the right-most operand wins), plus the source
# path and whatever get_element returns for the annotations -> conll levels.
records = [
    x["metadata"]
    | x["data"]
    | {
        "original_path": x["path"],
        "conll": get_element(x, levels=["annotations", "conll"]),
    }
    for x in annotations.data
]

metadata = pd.DataFrame(records).dropna(subset=["tomo"])

# Keep only rows whose `tomo` is made of decimal digits so the int cast below
# cannot fail (`str.isnumeric` also accepts characters such as "½" or "²" that
# int() rejects). `.copy()` gives an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
metadata = metadata[metadata["tomo"].str.isdecimal()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# One row per (tomo, nro_registro) pair; the first occurrence is kept.
metadata = metadata.drop_duplicates(["tomo", "nro_registro"])
metadata

In [None]:
# Left-join the annotation metadata onto the public records by
# (nro_registro, tomo), keep one row per key pair, then summarise the result.
data = public.merge(
    metadata, how="left", on=["nro_registro", "tomo"]
).drop_duplicates(["tomo", "nro_registro"])
data.info()

In [None]:
# %%capture

import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from functools import cache

# Register tqdm's pandas integration (enables the `progress_apply` methods).
tqdm.pandas()

# Directory where downloaded documents are stored; created if missing.
# NOTE(review): this path starts at /resources while the input paths in the
# earlier cells start at /workspace/resources — confirm this is intentional.
outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


@cache
def get_file(url: str):
    """Download ``url`` with the ``gdown`` CLI into ``outdir``.

    The local file name is the hex BLAKE2b digest (15 bytes) of the URL, so a
    given URL always maps to the same file and an already-present file is
    reused instead of being downloaded again. Results are additionally
    memoised per argument by ``functools.cache``.

    Args:
        url: Link to download. Non-string values (e.g. NaN from an empty
            cell) are skipped.

    Returns:
        Path of the local file, or ``None`` when ``url`` is not a string, the
        ``gdown`` executable cannot be run, or the download produced no file.
    """
    if not isinstance(url, str):
        return None

    fname = f"{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}"

    if os.path.exists(fname):
        return fname

    # Some files require access and others are corrupt, so failed downloads
    # are expected. gdown runs as a subprocess with its output captured to
    # keep the access-denied logging out of the notebook.
    # The command is an argument list executed without a shell: URLs routinely
    # contain characters such as `&` or `?` that a shell would interpret.
    cmd = ["gdown", "--fuzzy", "-q", "--continue", "-O", fname, url]
    try:
        subprocess.run(cmd, capture_output=True, check=False)
    except OSError:
        # The gdown executable is missing or not runnable.
        return None

    # Only report a path that really exists, so callers never receive the
    # location of a download that failed.
    return fname if os.path.exists(fname) else None


# Fetch every link concurrently on a pool of 50 threads. `path` receives one
# get_file result per entry of data["link"], in the same order.
parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
path = parallel(get_file_(url) for url in tqdm(data["link"]))

In [None]:
# Store the download results (`path`, computed in the previous cell) next to
# the rows they belong to.
data["anonymized_path"] = path

# filter to handle only available ia2 data

In [None]:
import textract
from aymurai.text.normalize import document_normalize

# Cut-off date (1 January 2021) used by ia2(): rows whose `fecha_resolucion`
# is earlier than this are marked False.
IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Extract the text of the file at ``path`` and normalise it.

    The file is handed to textract with the ``odt`` extension, the returned
    bytes are decoded as UTF-8 and the text is passed through
    ``document_normalize``.
    """
    raw_bytes = textract.process(path, extension="odt")
    return document_normalize(raw_bytes.decode("utf-8"))


def ia2(row):
    """Return whether ``row`` passes the IA2 filter.

    The result is ``False`` when ``fecha_resolucion`` is earlier than
    ``IA2_START_DATE`` or when the document at ``anonymized_path`` cannot be
    loaded; otherwise it is ``True`` exactly when the loaded text does not
    contain ``"XX"``.

    NOTE(review): "XX" is presumably a placeholder left by a different
    anonymization process — confirm. Also note that a NaT date compares as
    not-earlier, so such rows go on to the document check.
    """
    doc_path, resolved_on = row["anonymized_path"], row["fecha_resolucion"]

    if resolved_on < IA2_START_DATE:
        return False

    try:
        text = load_doc(doc_path)
    except Exception:
        # Missing, unreadable or undecodable documents simply fail the filter.
        return False

    return "XX" not in text


# Flag every row with ia2(); passing `total` lets the progress bar show a
# percentage and ETA, like the export loop further down.
data["ia2_processed"] = [
    ia2(row) for _, row in tqdm(data.iterrows(), total=len(data))
]

# Keep only flagged rows that also have a non-null `doc.text`. The explicit
# copy makes the column assignment below operate on an independent frame.
data = data.query("ia2_processed").dropna(subset=["doc.text"]).copy()
data["doc.text"] = data["doc.text"].apply(document_normalize)

In [None]:
from aymurai.utils import alignment

# Destination directory for the per-document alignment CSVs; created if missing.
OUTPUT_DIR = "/resources/data/restricted/anonymization/ia2-aligned"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Run the alignment on each remaining row and write the result to
# "<tomo>_<nro_registro>.csv" inside OUTPUT_DIR.
for _, row in tqdm(data.iterrows(), total=len(data)):
    # NOTE(review): alignment.process presumably returns a DataFrame (it is
    # written with .to_csv) — confirm which row fields it reads.
    mapping = alignment.process(row)

    filename = f"{OUTPUT_DIR}/{row['tomo']}_{row['nro_registro']}.csv"
    mapping.to_csv(filename)