In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()

In [None]:
# Read the public dataset, discard rows lacking any of the three key fields,
# and lower-case every column name before casting `tomo` to int.
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20230630/set_de_datos_con_perspectiva_de_genero.csv"
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
)

public["tomo"] = public["tomo"].astype(int)


def fix_date(text: str):
    """Parse an underscore-separated date string into a pandas Timestamp.

    The three ``_``-separated components are converted to integers,
    zero-padded to two digits, joined with ``/`` and passed to
    ``pd.to_datetime``.

    Args:
        text: Raw date string made of three numeric parts joined by ``_``.

    Returns:
        The parsed ``pd.Timestamp``, or ``None`` when ``text`` is not a string,
        does not have exactly three parts, contains a non-numeric part, or
        cannot be parsed as a date.
    """
    if not isinstance(text, str):
        return None

    parts = text.split("_")
    if len(parts) != 3:
        return None

    try:
        padded = [f"{int(part):02d}" for part in parts]
        # NOTE(review): no `format`/`dayfirst` is given, so an ambiguous value
        # such as 01/02/2021 is read month-first by pandas. Confirm this is the
        # component order used by the CSV.
        return pd.to_datetime("/".join(padded))
    except (ValueError, OverflowError):
        # Non-numeric component or a string pandas cannot turn into a date.
        return None


# Convert every raw date string with fix_date, showing a progress bar.
public["fecha_resolucion"] = list(map(fix_date, tqdm(public["fecha_resolucion"])))

public

In [None]:
# Column dtypes and non-null counts of `public` after the cleaning above.
public.info()

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)

# Instantiate the project's annotation loader on the local annotations directory.
# NOTE(review): judging by the class name this wraps Label Studio exports — confirm.
annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations(
    "/workspace/resources/annotations"
)

In [None]:
from aymurai.utils.misc import get_element


def text_normalize(text: str):
    """Replace every escaped slash (a backslash followed by ``/``) with a plain ``/``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each backslash-slash pair collapsed to a single slash.
    """
    # Raw string: a backslash before "/" is not a valid escape sequence in a
    # normal literal and triggers a warning on recent Python versions.
    return text.replace(r"\/", "/")


# One row per annotated item: its metadata and data dicts merged, plus the
# source path and the CoNLL annotation looked up with get_element.
metadata = pd.DataFrame(
    [
        item["metadata"]
        | item["data"]
        | {
            "original_path": item["path"],
            "conll": get_element(item, levels=["annotations", "conll"]),
        }
        for item in annotations.data
    ]
)
metadata.dropna(subset=["tomo"], inplace=True)
# .copy(): the boolean filter returns a slice of the previous frame; copying it
# keeps the column assignments below from raising SettingWithCopyWarning.
# NOTE(review): assumes every remaining `tomo` value is a string — confirm.
metadata = metadata[metadata["tomo"].str.isnumeric()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# Keep a single row per (tomo, nro_registro) pair; this is the merge key used later.
metadata.drop_duplicates(["tomo", "nro_registro"], inplace=True)
metadata

In [None]:
# Left join: keep every public row and attach annotation metadata where the
# (nro_registro, tomo) key matches.
x = public.merge(metadata, how="left", on=["nro_registro", "tomo"])
x.drop_duplicates(subset=["tomo", "nro_registro"], inplace=True)
x.info()

In [None]:
# Five random rows of the merged frame.
x.sample(5)

In [None]:
# Number of rows with a non-null `link`.
x["link"].notna().sum()

In [None]:
# Non-null `link` values per resolution year.
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["link"].count()

In [None]:
# Rows that have an `original_path`, counted per resolution year.
resolution_year = x["fecha_resolucion"].dt.to_period("Y")
has_original = x.dropna(subset=["original_path"])
has_original.groupby(resolution_year)["original_path"].count()

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

tqdm.pandas()

# Directory in which get_file() looks for cached documents; created if missing.
outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


def get_file(url: str):
    """Return the local cache path for ``url``, or ``None`` if it is not cached.

    The cache file lives under the module-level ``outdir`` and is named after
    the BLAKE2b hex digest (15-byte digest) of the URL.

    Downloading is disabled in this notebook: the gdown call had been commented
    out because some files require access permissions and others are corrupt.
    This function therefore only reports files that are already on disk.

    Args:
        url: Document link; any non-string value (e.g. NaN) is treated as missing.

    Returns:
        The cache file path when the file exists, otherwise ``None``.
    """
    if not isinstance(url, str):
        return None

    fname = f"{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}"

    # The previous version fell through a no-op try/except and returned the
    # path even when nothing had been downloaded, so callers could not tell
    # cached documents from missing ones.
    return fname if os.path.exists(fname) else None


# Resolve every link to its cache path on a pool of 50 threads.
# joblib returns results in submission order, so `path` follows the row order of `x`.
parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
path = parallel(get_file_(url) for url in tqdm(x["link"]))

In [None]:
# Positional assignment: `path` holds one entry per row of `x`, in row order.
x["anonymized_path"] = path

In [None]:
# Five random rows, now including `anonymized_path`.
x.sample(5)

In [None]:
# (rows with an anonymized path, rows without one)
x["anonymized_path"].notna().sum(), x["anonymized_path"].isna().sum()

In [None]:
# Rows that have an `anonymized_path`, counted per resolution year.
resolution_year = x["fecha_resolucion"].dt.to_period("Y")
has_anonymized = x.dropna(subset=["anonymized_path"])
has_anonymized.groupby(resolution_year)["anonymized_path"].count()

In [None]:
# (rows with an original path, rows without one)
x["original_path"].notna().sum(), x["original_path"].isna().sum()

In [None]:
# Rows that have an `original_path`, counted per resolution year.
resolution_year = x["fecha_resolucion"].dt.to_period("Y")
has_original = x.dropna(subset=["original_path"])
has_original.groupby(resolution_year)["original_path"].count()

In [None]:
# 2021 - 2022, with an anonymized version and without an original version
resolution_year = x["fecha_resolucion"].dt.to_period("Y")
mask = (
    (resolution_year >= "2021")
    & (resolution_year <= "2022")
    & (x["anonymized_path"].notna())
    & (x["original_path"].isna())
)
# .copy(): later cells add columns to `x`; working on an independent frame
# avoids SettingWithCopyWarning on those assignments.
x = x.loc[mask].copy()
x

In [None]:
# Non-null anonymized paths per resolution year after the filter above.
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["anonymized_path"].count()

In [None]:
import textract

# Cut-off date: ia2() returns False for any resolution dated before it.
IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Extract the text of the file at ``path`` with textract, treating it as ODT.

    Returns the extracted bytes decoded as UTF-8.
    """
    return textract.process(path, extension="odt").decode("utf-8")


def ia2(row):
    """Flag a row whose anonymized document loads and contains no ``"XX"``.

    Returns False when the resolution date is earlier than ``IA2_START_DATE``
    or when the document cannot be loaded; otherwise returns True exactly when
    the substring ``"XX"`` is absent from the document text.

    NOTE(review): presumably ``"XX"`` is the placeholder left by a different
    anonymization process — confirm.
    """
    path, date = row["anonymized_path"], row["fecha_resolucion"]
    if date < IA2_START_DATE:
        return False

    try:
        text = load_doc(path)
    except Exception:
        # Any loading failure counts as "not flagged".
        return False

    return "XX" not in text

In [None]:
# Evaluate ia2() row by row. `total` gives the progress bar a known length,
# because iterrows() is a generator tqdm cannot size on its own.
x["ia2"] = [ia2(row) for _, row in tqdm(x.iterrows(), total=len(x))]

In [None]:
# Date distribution of the resolutions flagged by ia2().
x.query("ia2")["fecha_resolucion"].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)

In [None]:
# Keep resolutions that have no original version but do have an anonymized
# version flagged by ia2().
# The mask is rebuilt from the current `x`: the previous `mask` is indexed on
# the frame as it was before the earlier filter, and every row still in `x`
# already satisfies it, so combining the two only relied on implicit index
# alignment.
mask = (x["original_path"].isna()) & (x["ia2"])
# .copy(): later cells add columns to `x`.
x = x.loc[mask].copy()
x.info()

In [None]:
# Date summary of the retained rows. `x` was already filtered with `mask` in
# the previous cell, so `.loc[mask]` is expected to keep every row.
x.loc[mask]["fecha_resolucion"].describe()

In [None]:
# Number of non-null anonymized paths that remain.
x["anonymized_path"].count()

In [None]:
# Sanity check: rows still missing an anonymized path (the earlier mask required it to be non-null).
x["anonymized_path"].isna().sum()

In [None]:
# Extract the text of every remaining anonymized document with load_doc().
x["anonymized_text"] = x["anonymized_path"].map(load_doc)

In [None]:
# One random (path, extracted text) pair.
x[["anonymized_path", "anonymized_text"]].sample()

In [None]:
# dtype and non-null count of the extracted texts.
x["anonymized_text"].info()

In [None]:
from aymurai.text.normalize import document_normalize

In [None]:
# Apply text_normalize and then the project's document_normalize to each anonymized text.
x["anonymized_text"] = x["anonymized_text"].map(text_normalize).map(document_normalize)

In [None]:
from glob import glob
from aymurai.text.extraction import extract_document

BASEPATH = "/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO"
# Every .doc / .docx file found below BASEPATH.
original = glob(f"{BASEPATH}/**/**.doc", recursive=True)
original += glob(f"{BASEPATH}/**/**.docx", recursive=True)


# Paths already paired through the annotations, rewritten to the directory
# layout used under BASEPATH. regex=False: both patterns are literal
# substrings, and the default value of `regex` differs between pandas versions.
already_matched_files = (
    metadata["original_path"]
    .str.replace("/resources/", "/resources/data/", regex=False)
    .str.replace(
        "/RESOLUCIONES DEL JUZGADO - DOCS/", "/RESOLUCIONES DEL JUZGADO/", regex=False
    )
    .unique()
)

# sorted(): a set of strings has no stable order between kernel runs, which
# would reshuffle the rows of `original` and therefore the match indices.
original = sorted(set(original) - set(already_matched_files))

print("anonimyzed available:", len(x))
print("already matched files:", len(already_matched_files))
print("need to be matched:", len(original))

# convert to dataframe to easier usage
original = pd.DataFrame(dict(path=original))
original["text"] = original["path"].progress_apply(extract_document)
# Drop files for which extract_document returned no text.
original.dropna(subset=["text"], inplace=True)

original["text"] = original["text"].apply(text_normalize)
original["text"] = original["text"].apply(document_normalize)

# Renumber rows 0..n-1 so that positions returned by argmax over the
# similarity matrix can be used as labels with `original.loc[...]`.
original.reset_index(inplace=True)
print("need to be matched and available:", len(original))
original.sample(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Fit the TF-IDF vocabulary and weights on the anonymized texts only.
tfidf = TfidfVectorizer()
anon_tfidf = tfidf.fit_transform(x["anonymized_text"])
anon_tfidf

In [None]:
# Project the candidate originals with the vectorizer fitted on the anonymized texts.
orig_tfidf = tfidf.transform(original["text"])
orig_tfidf

In [None]:
# Pairwise similarity: one row per anonymized document, one column per candidate original.
cosine_similarities = cosine_similarity(anon_tfidf, orig_tfidf)
cosine_similarities.shape

In [None]:
# For each anonymized document, the column position of its most similar original.
match_indices = np.argmax(cosine_similarities, axis=1)
match_indices

In [None]:
import random

# Spot-check one random pair: an anonymized document next to its best TF-IDF match.
idx = random.choice(range(len(match_indices)))
match_idx = match_indices[idx]
anon_row = x.iloc[idx]
rule, double_rule = "-" * 80, "=" * 80

print("anonymized:", anon_row["anonymized_path"])
print(rule)
print(anon_row["anonymized_text"][:600])

print(double_rule)
print("original:", original.loc[match_idx, "path"])
print(rule)
print(original.loc[match_idx, "text"][:600])

In [None]:
# Columns carried into the matching table.
# .copy(): later cells add columns to `matching`; without it those assignments
# target a slice of `x` and raise SettingWithCopyWarning.
matching = x[
    [
        "nro_registro",
        "tomo",
        "fecha_resolucion",
        "n_expte_eje",
        "link",
        "doc.text",
        "original_path",
        "anonymized_path",
        "anonymized_text",
    ]
].copy()
matching.sample(5)

In [None]:
# Attach to each anonymized document its best-scoring original and that score.
match_indices = cosine_similarities.argmax(axis=1)
match_scores = cosine_similarities.max(axis=1)
best_match = original.loc[match_indices, ["path", "text"]]

matching["tfidf_score"] = match_scores
matching["matching_path"] = best_match["path"].values
matching["matching_text"] = best_match["text"].values

matching.sample(5)

In [None]:
# Distribution of the best-match TF-IDF scores.
matching["tfidf_score"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

In [None]:
# Histogram of the best-match TF-IDF scores (30 bins).
matching["tfidf_score"].plot(kind="hist", bins=30)

In [None]:
# Column dtypes and non-null counts of the matching table.
matching.info()

In [None]:
# Read through the matches whose score lies in the (25th, 50th] percentile band.
scores = matching["tfidf_score"]
q1 = scores.quantile(0.01)
q5 = scores.quantile(0.05)
q25 = scores.quantile(0.25)
q50 = scores.quantile(0.5)

wide_rule = "=" * 225
in_band = matching[(scores > q25) & (scores <= q50)]
for i, r in in_band.iterrows():
    print(r["tfidf_score"])
    print(r["matching_text"])
    print(wide_rule)
    print(r["anonymized_text"])
    print(wide_rule)
    print("*" * 225)

```
# Distribución de similitud de los documentos que tenemos emparejados
count    252.000000
mean       0.955130
std        0.045909
min        0.629422
1%         0.794987
5%         0.883193
25%        0.939891
50%        0.969540
75%        0.984278
95%        0.993554
99%        0.996443
max        0.996680
Name: max_similarity, dtype: float64
```

In [None]:
# Keep matches whose similarity exceeds the 1st percentile (about 0.795) of the
# known distribution shown above.
matching.query("tfidf_score > 0.795").describe()

In [None]:
from scipy import stats

# Percentile rank of the 0.795 cut-off within this run's tfidf_score values.
percentile = stats.percentileofscore(matching["tfidf_score"], 0.795)
percentile

In [None]:
# Print every pair scoring above the 0.795 cut-off for manual inspection.
above_cutoff = matching[matching["tfidf_score"] > 0.795]
wide_rule = "=" * 225
for i, r in above_cutoff.iterrows():
    print(r["tfidf_score"])
    print(r["matching_text"])
    print(wide_rule)
    print(r["anonymized_text"])
    print(wide_rule)
    print("*" * 225)

In [None]:
# Display only: view without the columns that contain any missing value (result is not assigned).
matching.dropna(axis=1)

In [None]:
import os
import re
from aymurai.utils import alignment
from collections import Counter


OUTPUT_DIR = "/resources/data/restricted/anonymization"
ALIGNMENT_DIR = f"{OUTPUT_DIR}/alignment"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Created once here instead of on every loop iteration.
os.makedirs(ALIGNMENT_DIR, exist_ok=True)

# Running tally of the normalized <tag> tokens found in differing rows.
labels = Counter()
for i, row in tqdm(matching.iterrows(), total=len(matching)):
    original_path = row["matching_path"]
    anonymized_path = row["anonymized_path"]

    # Align the matched original against the anonymized document.
    # NOTE(review): presumably returns a frame with one row per aligned
    # segment and the two columns named below — confirm in aymurai.utils.alignment.
    mapping = alignment.align_docs(
        original_path,
        anonymized_path,
        columns=("original", "anonymized"),
        target_preprocess=alignment.ia2.ia2_text_preprocess,
    )

    # alignment_score: fraction of aligned rows whose two sides are identical
    diff = mapping["original"] != mapping["anonymized"]
    alignment_score = (~diff).mean()
    matching.loc[i, "alignment_score"] = alignment_score

    # matching score: alignment score weighted by the TF-IDF similarity
    matching_score = alignment_score * row["tfidf_score"]
    matching.loc[i, "matching_score"] = matching_score

    # export alignment
    filename = f"{ALIGNMENT_DIR}/{row['tomo']}_{row['nro_registro']}.csv"
    matching.loc[i, "alignment_path"] = filename
    mapping.to_csv(filename, index=False)

    # Collect the <tag>-style tokens in the anonymized side of the differing rows.
    diff_text = "".join(mapping.loc[diff, "anonymized"])
    labels_ = re.findall(r"<\w+>", diff_text)
    labels_ = [alignment.ia2.normalize(label) for label in labels_]
    # update() counts the items directly, without building a temporary Counter.
    labels.update(labels_)

In [None]:
# Persist the label tally gathered in the alignment loop as a (label, count) table.
IA2_LABELS = pd.DataFrame(labels.items(), columns=["label", "count"])
IA2_LABELS.to_csv(f"{OUTPUT_DIR}/ia2_labels.csv", index=False)

In [None]:
# Drop the full-text columns, then any column that is entirely null.
drop_columns = ["doc.text", "matching_text", "anonymized_text"]
output = matching.drop(drop_columns, axis=1).dropna(axis=1, how="all")

In [None]:
from aymurai.utils.display.pandas import pandas_context
from rich.pretty import pprint

# Display options handed to pandas_context below.
# The previous literal listed "display.max_rows" twice (500, then None); only
# the last value took effect, so the dead entry is removed.
options = {
    "display.max_rows": None,
    "display.max_columns": 500,
    "display.width": 0,
}

# Pick one exported match at random and show its record and its alignment table.
example = output.sample(1).to_dict("records")[0]
mapping = pd.read_csv(example["alignment_path"])
pprint(example)

with pandas_context(**options):
    display(mapping)

In [None]:
# Five random rows of the export table.
output.sample(5)

In [None]:
# Distribution of the combined score (alignment_score * tfidf_score).
output["matching_score"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

In [None]:
# Histogram of the combined score (30 bins).
output["matching_score"].plot(kind="hist", bins=30)

In [None]:
# Order best matches first (sorts `output` in place).
output.sort_values("matching_score", ascending=False, inplace=True)

In [None]:
# Highest-scoring matches.
output.head(5)

In [None]:
# Lowest-scoring matches.
output.tail(5)

In [None]:
# Keep the matches whose combined score is at least 0.75.
data_filtered = output.query("matching_score >= 0.75")
data_filtered.info()

In [None]:
# Score distribution of the retained matches.
data_filtered["matching_score"].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)

In [None]:
# Write the retained matches to matching.csv under OUTPUT_DIR.
data_filtered.to_csv(f"{OUTPUT_DIR}/matching.csv", index=False)

In [None]:
# Final look at the matching table; it is not filtered by the 0.75 cut-off and
# still holds the text columns along with the scores added in the alignment loop.
matching