In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the public dump, keep only rows where the three key columns are all
# present, and lower-case every column name.
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20221027/set_de_datos_con_perspectiva_de_genero-database.csv",
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
)

# Cast "tomo" to int now that rows with a missing value have been dropped.
public = public.assign(tomo=public["tomo"].astype(int))


def fix_date(text: str):
    """Parse an underscore-separated date string into a ``pd.Timestamp``.

    The three ``_``-separated fields are converted to integers, zero-padded
    to at least two digits, joined with ``/`` and handed to
    ``pd.to_datetime``.

    Args:
        text: Raw date string made of three integer fields separated by ``_``.

    Returns:
        The parsed timestamp, or ``None`` when ``text`` is not a string, does
        not have exactly three fields, contains a non-integer field, or cannot
        be parsed as a date.
    """
    if not isinstance(text, str):
        return None

    fields = text.split("_")
    if len(fields) != 3:
        return None

    try:
        # The integer conversion lives inside the try block so that a single
        # malformed value yields None instead of aborting the whole column.
        padded = [f"{int(field):02d}" for field in fields]
        # NOTE(review): the field order is resolved by pandas' default parsing
        # (month-first for ambiguous dates) — confirm this matches the dump,
        # or pass dayfirst=True / an explicit format.
        return pd.to_datetime("/".join(padded))
    except (ValueError, TypeError, OverflowError):
        return None


# Re-parse every raw "fecha_resolucion" value with fix_date, with a progress bar.
parsed_dates = []
for raw_date in tqdm(public["fecha_resolucion"]):
    parsed_dates.append(fix_date(raw_date))
public["fecha_resolucion"] = parsed_dates

public

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)

# Build the annotations dataset object from the "/resources/" directory.
# NOTE(review): the CSV in the first data cell is read from "/workspace/resources/..." —
# confirm both roots point at the same data.
annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations("/resources/")

In [None]:
# Show the first annotation record to inspect its structure.
annotations.data[0]

In [None]:
def text_normalize(text: str):
    """Replace every backslash-escaped slash in ``text`` with a plain slash.

    Args:
        text: String that may contain a backslash immediately followed by "/".

    Returns:
        A copy of ``text`` where each such pair is replaced by "/".
    """
    # "\\/" spells backslash + slash explicitly; writing it as a bare
    # backslash-slash literal relies on an invalid escape sequence, which
    # newer Python versions warn about.
    return text.replace("\\/", "/")


# One row per annotation: its "metadata" and "data" dicts merged together,
# plus the path of the original document.
metadata = pd.DataFrame(
    [
        item["metadata"] | item["data"] | {"original_path": item["path"]}
        for item in annotations.data
    ]
)

# Keep only rows whose "tomo" is present and purely numeric. The explicit
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not trigger SettingWithCopyWarning.
metadata = metadata.dropna(subset=["tomo"])
metadata = metadata[metadata["tomo"].str.isnumeric()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)

# Undo the slash escaping in both the path and the document text.
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# Keep a single row per (tomo, nro_registro) pair.
metadata = metadata.drop_duplicates(["tomo", "nro_registro"])
metadata

In [None]:
# Right join: every annotated document is kept, with the public record that
# shares its (nro_registro, tomo) key attached when there is one.
x = public.merge(metadata, how="right", on=["nro_registro", "tomo"])
x = x.drop_duplicates(["tomo", "nro_registro"])
len(x)

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

# Register tqdm's pandas integration.
tqdm.pandas()

# Directory that receives the downloaded documents; created if it is missing.
outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


def get_file(url: str):
    """Download ``url`` with the ``gdown`` CLI into ``outdir``.

    The target file name is the hex-encoded BLAKE2b digest (15 bytes) of the
    URL, so a given URL always maps to the same file and a file that already
    exists is reused without downloading again.

    Args:
        url: Link to download. Non-string values are skipped.

    Returns:
        The local file path, or ``None`` when ``url`` is not a string, gdown
        reports "Access denied", or no file exists after the download attempt.
    """
    if not isinstance(url, str):
        return None

    fname = f"{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}"

    if os.path.exists(fname):
        return fname

    # Some files require access permissions, others are corrupt.
    # The arguments are passed as a list (no shell involved), so characters
    # such as "&", ";" or spaces in the URL are never interpreted by a shell.
    result = subprocess.run(
        ["gdown", "--fuzzy", "-q", "--continue", "-O", fname, url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so the message check sees it
        text=True,
        check=False,
    )
    if "Access denied" in result.stdout:
        return None
    # A failed download may leave nothing behind; do not hand out a path to a
    # file that does not exist.
    if not os.path.exists(fname):
        return None
    return fname


# Download every link using 50 worker threads.
parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
# NOTE(review): tqdm wraps the generator that feeds Parallel, so the bar
# presumably tracks task submission rather than completion — confirm.
path = parallel(get_file_(url) for url in tqdm(x["link"]))
# Store the local path (or None) returned for each row's link.
x["anonimized_path"] = path

In [None]:
# Rows for which no local document path is available.
x.loc[x["anonimized_path"].isna(), "anonimized_path"]

In [None]:
import textract

# Cutoff date: ia2() returns False for any row dated before it.
IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Return the text of the document at ``path`` as a ``str``.

    The file is handed to textract as an ODT document and the bytes it
    returns are decoded as UTF-8.
    """
    return textract.process(path, extension="odt").decode("utf-8")


def ia2(row):
    """Decide the ``ia2`` flag for one row.

    Args:
        row: Mapping-like row providing "anonimized_path" and
            "fecha_resolucion".

    Returns:
        ``False`` when the row is dated before ``IA2_START_DATE``, when its
        document cannot be loaded, or when the loaded text contains "XX";
        ``True`` otherwise.
    """
    doc_path = row["anonimized_path"]
    resolved_on = row["fecha_resolucion"]

    if resolved_on < IA2_START_DATE:
        return False

    try:
        contents = load_doc(doc_path)
    except Exception:
        # Any loading failure (missing path, unreadable file, ...) means "not flagged".
        flagged = False
    else:
        flagged = "XX" not in contents
    return flagged

In [None]:
# iterrows() has no length, so total= is passed explicitly to give tqdm a
# real progress bar (percentage and ETA) instead of a bare counter.
x["ia2"] = [ia2(row) for _, row in tqdm(x.iterrows(), total=len(x))]

In [None]:
# Print the text and local path of the fourth-from-last flagged row.
flagged_row = x.query("ia2").iloc[-4]
print(flagged_row["doc.text"])
print(flagged_row["anonimized_path"])

In [None]:
# Number of rows flagged by ia2().
x["ia2"].sum()

In [None]:
# Write the original path of every row not flagged by ia2, one per line,
# with neither header nor index.
not_flagged = x.query("not ia2")
not_flagged["original_path"].to_csv("need_preprocess.csv", header=False, index=False)

In [None]:
# Display the merged table, including the "anonimized_path" and "ia2" columns added above.
x

# Ejemplo 1


In [None]:
# Second-to-last row: its annotated text and the path of its downloaded document.
sample = x.iloc[-2]
text, path = sample["doc.text"], sample["anonimized_path"]
print(path)

In [None]:
import textract

# Extract the downloaded document (handled as ODT) and decode it to a UTF-8 string.
doc = textract.process(path, extension="odt", output_encoding="utf-8").decode("utf-8")
print(doc)

In [None]:
import re
from difflib import SequenceMatcher, Differ
import spacy

# Tokenise both versions with a blank Spanish pipeline, then align the two
# token sequences.
nlp = spacy.blank("es")

splitted_text = [token.text for token in nlp(text)]
splitted_doc = [token.text for token in nlp(doc)]

seqmatcher = SequenceMatcher(isjunk=None, a=splitted_text, b=splitted_doc)

In [None]:
matches = seqmatcher.get_matching_blocks()

# Print every matching block followed by the tokens it covers on each side.
for match in matches:
    text_end = match.a + match.size
    doc_end = match.b + match.size
    print(match)
    print(splitted_text[match.a : text_end])
    print(splitted_doc[match.b : doc_end])
    print()

In [None]:
# For each pair of consecutive matching blocks, print the first block's tokens
# and then a token-level diff of the unmatched gap that separates the two.
diff = Differ()
for match1, match2 in zip(matches, matches[1:]):
    text_stop = match1.a + match1.size
    doc_stop = match1.b + match1.size
    print(splitted_text[match1.a : text_stop])
    print(splitted_doc[match1.b : doc_stop])
    gap_diff = diff.compare(
        splitted_text[text_stop : match2.a],
        splitted_doc[doc_stop : match2.b],
    )
    print("\n".join(gap_diff))
    print("----")

# Ejemplo 2


In [None]:
# Row at position 703: its annotated text and the path of its downloaded document.
sample = x.iloc[703]
text, path = sample["doc.text"], sample["anonimized_path"]
print(path)

In [None]:
import textract

# Extract the downloaded document (handled as ODT) and decode it to a UTF-8 string.
doc = textract.process(path, extension="odt", output_encoding="utf-8").decode("utf-8")
print(doc)

In [None]:
import re
from difflib import SequenceMatcher, Differ
import spacy

# Tokenise both versions with a blank Spanish pipeline, then align the two
# token sequences.
nlp = spacy.blank("es")

# The first 180 tokens of the annotated text and the first 72 tokens of the
# extracted document are skipped before alignment.
# NOTE(review): presumably these offsets drop a header specific to this sample — confirm.
text_tokens = nlp(text)[180:]
doc_tokens = nlp(doc)[72:]
splitted_text = [token.text for token in text_tokens]
splitted_doc = [token.text for token in doc_tokens]

seqmatcher = SequenceMatcher(isjunk=None, a=splitted_text, b=splitted_doc)

In [None]:
matches = seqmatcher.get_matching_blocks()

# Print every matching block followed by the tokens it covers on each side.
for match in matches:
    text_end = match.a + match.size
    doc_end = match.b + match.size
    print(match)
    print(splitted_text[match.a : text_end])
    print(splitted_doc[match.b : doc_end])
    print()

In [None]:
# For each pair of consecutive matching blocks, print the first block's tokens
# and then a token-level diff of the unmatched gap that separates the two.
diff = Differ()
for match1, match2 in zip(matches, matches[1:]):
    text_stop = match1.a + match1.size
    doc_stop = match1.b + match1.size
    print(splitted_text[match1.a : text_stop])
    print(splitted_doc[match1.b : doc_stop])
    gap_diff = diff.compare(
        splitted_text[text_stop : match2.a],
        splitted_doc[doc_stop : match2.b],
    )
    print("\n".join(gap_diff))
    print("----")