In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Read the public dataset dump, discard rows with a missing NRO_REGISTRO,
# TOMO or FECHA_RESOLUCION, and lower-case every column name.
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20230630/set_de_datos_con_perspectiva_de_genero.csv"
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
)

# Store the "tomo" column as integers.
public["tomo"] = public["tomo"].astype(int)


def fix_date(text: str):
    """Parse an underscore-separated date string into a pandas Timestamp.

    The three numeric components are zero-padded to at least two digits,
    joined with "/" and handed to ``pd.to_datetime`` with its default
    settings.

    Args:
        text: Date encoded as three integers separated by "_".

    Returns:
        The parsed timestamp, or ``None`` when ``text`` is not a string,
        does not have exactly three parts, contains a non-integer part, or
        cannot be parsed as a date.
    """
    if not isinstance(text, str):
        return None

    parts = text.split("_")
    if len(parts) != 3:
        return None

    try:
        # int() lives inside the try so that a non-numeric component yields
        # None instead of aborting the conversion of the whole column.
        padded = [f"{int(part):02d}" for part in parts]
        # NOTE(review): pd.to_datetime defaults to month-first for ambiguous
        # dates; confirm that matches the component order in the dataset.
        return pd.to_datetime("/".join(padded))
    except (ValueError, TypeError, OverflowError):
        return None


# Replace each raw date string with the value returned by fix_date (None
# where the string could not be parsed); tqdm reports progress.
public["fecha_resolucion"] = [
    fix_date(date) for date in tqdm(public["fecha_resolucion"])
]

# Display the cleaned frame.
public

In [None]:
# Column dtypes and non-null counts of the cleaned public dataset.
public.info()

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)

# Instantiate the annotation dataset over the annotations directory.
annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations(
    "/workspace/resources/annotations"
)

In [None]:
from aymurai.utils.misc import get_element


def text_normalize(text: str):
    """Replace every escaped slash (a backslash followed by "/") with "/".

    Args:
        text: Text that may contain backslash-escaped slashes.

    Returns:
        The text with each backslash-slash pair collapsed to a single slash.
    """
    # The backslash is spelled "\\" because a lone backslash before "/" is not
    # a valid escape sequence and makes Python emit a warning at compile time.
    return text.replace("\\/", "/")


# One row per annotated item: its "metadata" and "data" dictionaries merged,
# plus the source path and the CoNLL annotation fetched with get_element.
metadata = pd.DataFrame(
    [
        item["metadata"]
        | item["data"]
        | {
            "original_path": item["path"],
            "conll": get_element(item, levels=["annotations", "conll"]),
        }
        for item in annotations.data
    ]
)
metadata.dropna(subset=["tomo"], inplace=True)
# Keep rows whose "tomo" is purely numeric. The explicit copy makes the
# filtered frame independent, so the column assignments below do not write to
# a slice of the unfiltered frame (avoids SettingWithCopyWarning).
metadata = metadata[metadata["tomo"].str.isnumeric()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# Keep a single row per (tomo, nro_registro) pair.
metadata.drop_duplicates(["tomo", "nro_registro"], inplace=True)
metadata

In [None]:
# Left join: keep every row of the public dataset and attach the annotation
# metadata wherever (nro_registro, tomo) matches.
x = public.merge(metadata, how="left", on=["nro_registro", "tomo"])
x.drop_duplicates(subset=["tomo", "nro_registro"], inplace=True)
x.info()

In [None]:
# Show five random rows of the merged frame.
x.sample(5)

In [None]:
# Number of rows that have a value in the "link" column.
x["link"].notna().sum()

In [None]:
# Count of non-null "link" values grouped by the year of fecha_resolucion.
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["link"].count()

In [None]:
# Count of rows that have an original_path, grouped by the year of
# fecha_resolucion.
x.dropna(subset=["original_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))[
    "original_path"
].count()

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

# Register tqdm's progress integration with pandas.
tqdm.pandas()

# Directory where get_file looks for cached documents; created if missing.
outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


def get_file(url: str):
    """Return the local cache path for ``url`` when that file is on disk.

    The cache file name is the hex BLAKE2b digest (15-byte digest size) of
    the URL, placed inside the module-level ``outdir``. Downloading is
    disabled, so this function never fetches anything.

    Args:
        url: Document link; non-string values (such as NaN) are ignored.

    Returns:
        The cached file path, or ``None`` when ``url`` is not a string or no
        cached file exists for it.
    """
    if not isinstance(url, str):
        return None

    fname = f"{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}"

    # The gdown download step was switched off because some files require
    # access permissions and others are corrupt. Without a download, a path
    # is only meaningful if the file is already cached, so a missing file is
    # reported as None rather than as a path that does not exist.
    return fname if os.path.exists(fname) else None


# Call get_file for every value of the "link" column using 50 threads.
parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
path = parallel(get_file_(url) for url in tqdm(x["link"]))

In [None]:
# Store the get_file results as a new column.
x["anonymized_path"] = path

In [None]:
# Show five random rows, now including anonymized_path.
x.sample(5)

In [None]:
# (rows with an anonymized_path, rows without one)
x["anonymized_path"].notna().sum(), x["anonymized_path"].isna().sum()

In [None]:
# Count of rows that have an anonymized_path, grouped by the year of
# fecha_resolucion.
x.dropna(subset=["anonymized_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))[
    "anonymized_path"
].count()

In [None]:
# (rows with an original_path, rows without one)
x["original_path"].notna().sum(), x["original_path"].isna().sum()

In [None]:
# Count of rows that have an original_path, grouped by the year of
# fecha_resolucion.
x.dropna(subset=["original_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))[
    "original_path"
].count()

In [None]:
# 2021 - 2022, with an anonymized version and without an original version
resolution_year = x["fecha_resolucion"].dt.to_period("Y")
mask = (
    (resolution_year >= "2021")
    & (resolution_year <= "2022")
    & (x["anonymized_path"].notna())
    & (x["original_path"].isna())
)
# Take an explicit copy: new columns are assigned to x further down, and
# assigning to a boolean-indexed slice triggers SettingWithCopyWarning.
x = x.loc[mask].copy()
x

In [None]:
# Count of non-null anonymized_path values per year of fecha_resolucion.
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["anonymized_path"].count()

In [None]:
import textract

# Cut-off date: ia2() returns False for rows dated before it.
IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Extract the text of the document at ``path`` using textract.

    The file is always processed with the "odt" extension, whatever its name.

    Args:
        path: Location of the document on disk.

    Returns:
        The extracted content decoded as UTF-8.
    """
    return textract.process(path, extension="odt").decode("utf-8")


def ia2(row):
    """Tell whether a row's anonymized document contains no "XX" marker.

    Args:
        row: Mapping with the keys "anonymized_path" and "fecha_resolucion".

    Returns:
        False when the resolution date is earlier than IA2_START_DATE, when
        the document cannot be loaded, or when its text contains "XX";
        True otherwise.
    """
    doc_path, resolved_on = row["anonymized_path"], row["fecha_resolucion"]

    if resolved_on < IA2_START_DATE:
        return False

    try:
        text = load_doc(doc_path)
    except Exception:
        # Any loading failure counts as a negative result.
        return False

    has_marker = "XX" in text
    return not has_marker

In [None]:
# Evaluate ia2 on every row (iterrows yields (index, row) pairs) and store
# the booleans in a new column.
x["ia2"] = [ia2(row) for i, row in tqdm(x.iterrows())]

In [None]:
# Distribution of the resolution dates among rows where ia2 is True.
x.query("ia2")["fecha_resolucion"].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)

In [None]:
# Filter for resolutions without an original version but with an anonymized
# version. The mask is built from the current frame so that it shares the
# frame's index instead of relying on alignment with an earlier mask.
mask = x["original_path"].isna() & x["ia2"]
# Explicit copy: a column is assigned to x further down, and assigning to a
# boolean-indexed slice triggers SettingWithCopyWarning.
x = x.loc[mask].copy()
x.info()

In [None]:
# Summary statistics of the resolution dates of the rows selected by mask.
x.loc[mask]["fecha_resolucion"].describe()

In [None]:
# Number of non-null anonymized_path values.
x["anonymized_path"].count()

In [None]:
# List the columns of x.
x.columns

In [None]:
# Number of rows of x without an anonymized_path.
x["anonymized_path"].isna().sum()

In [None]:
# Extract the text of every remaining anonymized document with load_doc.
x["anonymized_text"] = x["anonymized_path"].map(load_doc)

In [None]:
# Show one random row's path next to its extracted text.
x[["anonymized_path", "anonymized_text"]].sample()

In [None]:
# Summary of the extracted-text column.
x["anonymized_text"].info()

In [None]:
from aymurai.text.normalize import document_normalize

In [None]:
# Apply text_normalize and then document_normalize to each anonymized text.
x["anonymized_text"] = x["anonymized_text"].map(text_normalize).map(document_normalize)

In [None]:
from glob import glob

# Recursively collect the .doc files, then the .docx files, found under the
# restricted resolutions folder; the .doc matches come first in the list.
legacy_word_files = glob(
    "/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO/**/**.doc",
    recursive=True,
)
modern_word_files = glob(
    "/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO/**/**.docx",
    recursive=True,
)
originals = legacy_word_files + modern_word_files

len(originals)

In [None]:
# Last five matched paths.
originals[-5:]

In [None]:
# Distinct paths of the annotated documents after rewriting the
# "/resources/" prefix and the "RESOLUCIONES DEL JUZGADO - DOCS" folder name.
# NOTE(review): presumably this makes them comparable with the glob results
# held in `originals` — confirm.
original_paths = (
    metadata["original_path"]
    .str.replace("/resources/", "/resources/data/")
    .str.replace("/RESOLUCIONES DEL JUZGADO - DOCS/", "/RESOLUCIONES DEL JUZGADO/")
    .unique()
)

In [None]:
# Keep only the files whose path is not among the annotated documents' paths.
# A set gives constant-time membership tests; checking against the array
# directly scans every annotated path for each file.
annotated_paths = set(original_paths)
originals = [original for original in originals if original not in annotated_paths]
len(originals)

In [None]:
def load_word_doc(path: str):
    """Extract the text of a Word document using textract.

    Paths ending in ".doc" are processed with the "doc" extension; every
    other path is processed as "docx".

    Args:
        path: Location of the document on disk.

    Returns:
        The extracted content decoded as UTF-8, or ``None`` when extraction
        fails.
    """
    try:
        extension = "doc" if path.endswith(".doc") else "docx"
        raw = textract.process(path, extension=extension)
    except Exception:
        # Best effort: files that cannot be read are skipped. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt
        # stop a long batch run.
        return None

    return raw.decode("utf-8")

In [None]:
# Extract the text of each remaining original file; an entry is None where
# load_word_doc could not read the file.
original_docs = list(map(load_word_doc, originals))

In [None]:
# Drop entries that are None or empty.
original_docs = [doc for doc in original_docs if doc]

In [None]:
# Inspect the first extracted document.
original_docs[0]

In [None]:
# Apply text_normalize and then document_normalize to every original
# document, the same two steps applied to the anonymized texts.
original_docs = list(map(text_normalize, original_docs))
original_docs = list(map(document_normalize, original_docs))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Fit a TF-IDF vectorizer on the anonymized texts and vectorize them.
tfidf = TfidfVectorizer()
anon_tfidf = tfidf.fit_transform(x["anonymized_text"])
anon_tfidf

In [None]:
# Vectorize the original documents with the vectorizer fitted on the
# anonymized texts.
orig_tfidf = tfidf.transform(original_docs)
orig_tfidf

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the anonymized vectors (first argument)
# and the original-document vectors (second argument).
cosine_similarities = cosine_similarity(anon_tfidf, orig_tfidf)
cosine_similarities.shape

In [None]:
import numpy as np

In [None]:
# For each row of the similarity matrix, the column index of its maximum.
np.argmax(cosine_similarities, axis=1)

In [None]:
# First anonymized text, for manual inspection.
x["anonymized_text"].iloc[0]

In [None]:
# Original document at index 1674.
# NOTE(review): hard-coded index, presumably copied from the argmax output
# for the first anonymized text — confirm before relying on it.
original_docs[1674]