In [None]:
import srsly
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the public dataset and normalise it in a single chain:
#   1. discard rows lacking registry number, volume or resolution date,
#   2. lower-case every column name,
#   3. store the volume ("tomo") as an integer.
public = (
    pd.read_csv(
        "/workspace/resources/data/dump-20230630/set_de_datos_con_perspectiva_de_genero.csv"
    )
    .dropna(subset=["NRO_REGISTRO", "TOMO", "FECHA_RESOLUCION"])
    .rename(columns=str.lower)
    .astype({"tomo": int})
)


def fix_date(text: str):
    """Parse an underscore-separated date string into a pandas ``Timestamp``.

    The three ``_``-separated fields are converted to integers, zero-padded
    to at least two digits, joined with ``/`` and handed to
    ``pd.to_datetime``.

    Args:
        text: raw date string with exactly three ``_``-separated fields.

    Returns:
        The parsed ``Timestamp``, or ``None`` when the input does not have
        exactly three fields, a field is not an integer, or pandas cannot
        parse the resulting string.
    """
    fields = text.split("_")
    if len(fields) != 3:
        return None

    try:
        # int() lives inside the try so a non-numeric field yields None
        # instead of aborting the conversion of the whole column.
        padded = [f"{int(field):02d}" for field in fields]
        return pd.to_datetime("/".join(padded))
    except (ValueError, TypeError, OverflowError):
        return None


public["fecha_resolucion"] = [
    fix_date(date) for date in tqdm(public["fecha_resolucion"])
]

public

In [None]:
public.info()

In [None]:
import matplotlib.pyplot as plt


plt.figure(figsize=(20, 10))
public.groupby(public["fecha_resolucion"].dt.to_period("M"))["link_csjn"].count().plot(kind="bar")

In [None]:
public.groupby(public["fecha_resolucion"].dt.to_period("Y"))["link_csjn"].count()

In [None]:
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)

annotations = ArgentinaJuzgadoPCyF10LabelStudioAnnotations("/resources/annotations")

In [None]:
annotations.data[0]

In [None]:
def text_normalize(text: str):
    """Replace every backslash-escaped slash (``\\/``) in ``text`` with ``/``.

    Args:
        text: string that may contain ``\\/`` sequences.

    Returns:
        The string with each ``\\/`` replaced by a plain ``/``.
    """
    # The backslash is written explicitly: "\/" is an invalid escape sequence.
    return text.replace("\\/", "/")


# One row per annotated document: its metadata and data dictionaries merged,
# plus the path of the source file.
metadata = pd.DataFrame(
    [
        item["metadata"] | item["data"] | {"original_path": item["path"]}
        for item in annotations.data
    ]
)
metadata = metadata.dropna(subset=["tomo"])
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not act on a slice (SettingWithCopyWarning).
metadata = metadata[metadata["tomo"].str.isnumeric()].copy()
metadata["tomo"] = metadata["tomo"].astype(int)
metadata["original_path"] = metadata["original_path"].apply(text_normalize)
metadata["doc.text"] = metadata["doc.text"].apply(text_normalize)

# Keep a single annotation per (tomo, nro_registro) pair, the join key used later.
metadata = metadata.drop_duplicates(["tomo", "nro_registro"])
metadata

In [None]:
# Inner join
pd.merge(public, metadata, on=["nro_registro", "tomo"], how="inner").info()

In [None]:
# Inner join - drop duplicates
pd.merge(public, metadata, on=["nro_registro", "tomo"], how="inner").drop_duplicates(["tomo", "nro_registro"], inplace=False).info()

In [None]:
# Left join
x = pd.merge(public, metadata, on=["nro_registro", "tomo"], how="left")
x.drop_duplicates(["tomo", "nro_registro"], inplace=True)
x.info()

In [None]:
x.sample(5)

In [None]:
x["link"].notna().sum()

In [None]:
x.groupby(x["fecha_resolucion"].dt.to_period("Y"))["link"].count()

In [None]:
x.dropna(subset=["original_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))["original_path"].count()

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

tqdm.pandas()

outdir = "/resources/data/documents"
os.makedirs(outdir, exist_ok=True)


def get_file(url: str):
    """Download ``url`` into ``outdir`` unless it is already there.

    The local file name is the hex BLAKE2b digest (15 bytes) of the URL, so
    a URL whose file already exists on disk is returned without downloading.

    Args:
        url: link to fetch; anything that is not a ``str`` (e.g. a missing
            value) is ignored.

    Returns:
        The local path if the file already exists, otherwise whatever
        ``gdown.download`` returns; ``None`` when ``url`` is not a string or
        the download raises.
    """
    if not isinstance(url, str):
        return None

    fname = f"{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}"
    if os.path.exists(fname):
        return fname

    # Some files require access permissions and others are corrupt, so a
    # failed download is reported as a missing path rather than an error.
    # `Exception` (not a bare except) keeps Ctrl-C able to stop the batch.
    try:
        return gdown.download(url, fname, quiet=False, fuzzy=True, resume=True)
    except Exception:
        return None


parallel = Parallel(n_jobs=50, backend="threading")
get_file_ = delayed(get_file)
path = parallel(get_file_(url) for url in tqdm(x["link"]))

In [None]:
x["anonymized_path"] = path

In [None]:
x.sample(5)

In [None]:
x["anonymized_path"].notna().sum(), x["anonymized_path"].isna().sum()

In [None]:
x.dropna(subset=["original_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))["original_path"].count()

In [None]:
x.dropna(subset=["original_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))["original_path"].count()

In [None]:
x.dropna(subset=["anonymized_path"]).groupby(x["fecha_resolucion"].dt.to_period("Y"))["anonymized_path"].count()

In [None]:
# 2021 en adelante, con versión anonimizada
mask = (x["fecha_resolucion"].dt.to_period("Y") >= "2021") & (x["anonymized_path"].notna())
x.loc[mask]

In [None]:
import textract

IA2_START_DATE = pd.to_datetime("01/01/2021")


def load_doc(path: str):
    """Return the text textract extracts from ``path`` (processed as ODT), decoded as UTF-8."""
    return textract.process(path, extension="odt").decode("utf-8")


def ia2(row):
    """Flag a row whose anonymized document is dated and shaped like IA2 output.

    Args:
        row: mapping with ``anonymized_path`` and ``fecha_resolucion`` entries.

    Returns:
        ``True`` only when the resolution date is not earlier than
        ``IA2_START_DATE``, the document can be loaded, and its text does not
        contain ``"XX"``; ``False`` otherwise.
    """
    doc_path, resolved_on = row["anonymized_path"], row["fecha_resolucion"]

    if resolved_on < IA2_START_DATE:
        return False

    try:
        content = load_doc(doc_path)
    except Exception:
        # Missing or unreadable document: it cannot be classified as IA2.
        return False

    # NOTE(review): "XX" is presumably the placeholder left by the earlier
    # anonymization scheme — confirm.
    return "XX" not in content

In [None]:
# iterrows() is a generator with no length, so tqdm needs an explicit total
# to display a percentage and an ETA.
x["ia2"] = [ia2(row) for _, row in tqdm(x.iterrows(), total=len(x))]

In [None]:
x.query("ia2")["fecha_resolucion"].describe(percentiles=[.01, .05, .25, .5, .75, 0.95, .99])

In [None]:
x.loc[mask]["original_path"].isna().sum(), x.loc[mask]["original_path"].notna().sum()

In [None]:
# Filtro de resoluciones sin versión original pero con versión anonimizada
mask &= (x["original_path"].isna()) & (x["ia2"])
x.loc[mask].info()

In [None]:
x.loc[mask]["fecha_resolucion"].describe()

In [None]:
# Desde IA2, sin versión original
faltantes = x.loc[mask]
faltantes.drop(["doc.text", "original_path", "anonymized_path", "ia2"], axis=1).to_excel(
    "resos_originales_faltantes.xlsx", index=False
)

In [None]:
faltantes.info()

In [None]:
faltantes.tail()

In [None]:
# Antes de IA2
x.query("not ia2")["fecha_resolucion"].describe(percentiles=[.01, .05, .25, .5, .75, 0.95, .99])

In [None]:
x.query("not ia2")["original_path"].to_csv(
    "need_preprocess.csv", index=False, header=False
)

# Ejemplo 1


In [None]:
sample = x.loc[(x["doc.text"].notna()) & (x["anonymized_path"].notna())].sample().iloc[0]

text = sample["doc.text"]
path = sample["anonymized_path"]
print(path)

In [None]:
import textract

doc = textract.process(path, extension="odt", output_encoding="utf-8")
doc = doc.decode("utf-8")
print(doc)

In [None]:
import re
from difflib import SequenceMatcher, Differ
import spacy

from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex


def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` from the defaults of ``nlp`` minus the angle-bracket rules.

    ``"<"`` is removed from the default prefix rules and ``">"`` from the
    default suffix rules; no infix rules and no ``token_match`` are set.
    NOTE(review): presumably this keeps placeholders such as ``<LABEL>`` in
    one piece — confirm against the tokenizer output.

    Args:
        nlp: spaCy language object providing ``Defaults`` and ``vocab``.

    Returns:
        A ``Tokenizer`` bound to ``nlp.vocab``.
    """
    prefix_rules = [*nlp.Defaults.prefixes]
    prefix_rules.remove("<")

    suffix_rules = [*nlp.Defaults.suffixes]
    suffix_rules.remove(">")

    find_prefix = compile_prefix_regex(prefix_rules).search
    find_suffix = compile_suffix_regex(suffix_rules).search

    return Tokenizer(
        nlp.vocab,
        prefix_search=find_prefix,
        suffix_search=find_suffix,
        token_match=None,
    )


nlp = spacy.blank("es")
nlp.tokenizer = custom_tokenizer(nlp)

# splitted_text =  re.split('\s+', text)
# splitted_doc = re.split('\s+', text)
splitted_text = [t.text for t in nlp(text)]
splitted_doc = [t.text for t in nlp(doc)]

seqmatcher = SequenceMatcher(None, splitted_text, splitted_doc)

In [None]:
matches = seqmatcher.get_matching_blocks()

for match in matches:
    print(match)
    print(splitted_text[match.a : match.a + match.size])
    print(splitted_doc[match.b : match.b + match.size])
    print()

In [None]:
for match1, match2 in zip(matches, matches[1:]):
    print(splitted_text[match1.a : match1.a + match1.size])
    print(splitted_doc[match1.b : match1.b + match1.size])
    # print("----")
    diff = Differ()
    print(
        "\n".join(
            diff.compare(
                splitted_text[match1.a + match1.size : match2.a],
                splitted_doc[match1.b + match1.size : match2.b],
            )
        )
    )
    print("----")

# Ejemplo 2


In [None]:
sample = x.iloc[703]

text = sample["doc.text"]
path = sample["anonymized_path"]
print(path)

In [None]:
import textract

doc = textract.process(path, extension="odt", output_encoding="utf-8")
doc = doc.decode("utf-8")
print(doc)

In [None]:
import re
from difflib import SequenceMatcher, Differ
import spacy

nlp = spacy.blank("es")

# Tokenize BOTH texts of this example. Leaving splitted_text untouched would
# compare the tokens of the previous example against this document.
# Requires sample["doc.text"] to be a string (i.e. the row has an original).
splitted_text = [t.text for t in nlp(text)]
splitted_doc = [t.text for t in nlp(doc)]

seqmatcher = SequenceMatcher(None, splitted_text, splitted_doc)

In [None]:
matches = seqmatcher.get_matching_blocks()

for match in matches:
    print(match)
    print(splitted_text[match.a : match.a + match.size])
    print(splitted_doc[match.b : match.b + match.size])
    print()

In [None]:
for match1, match2 in zip(matches, matches[1:]):
    print(splitted_text[match1.a : match1.a + match1.size])
    print(splitted_doc[match1.b : match1.b + match1.size])
    # print("----")
    diff = Differ()
    print(
        "\n".join(
            diff.compare(
                splitted_text[match1.a + match1.size : match2.a],
                splitted_doc[match1.b + match1.size : match2.b],
            )
        )
    )
    print("----")

# Batch process

In [None]:
# Rows flagged as IA2 that also have the original text available. Built as a
# single expression so the result is a new frame instead of a query() result
# modified in place (which triggers SettingWithCopyWarning).
ia2_data = x.query("ia2").dropna(subset=["doc.text"])
ia2_data

In [None]:
example = ia2_data.iloc[1]

original = example['doc.text']
anonymized = textract.process(example['anonymized_path'], extension="odt", output_encoding="utf-8")
anonymized = anonymized.decode("utf-8")

In [None]:
original.splitlines()

In [None]:
anonymized.splitlines()

In [None]:
a = anonymized.splitlines()
b = original.splitlines()
seqmatcher = SequenceMatcher(None, a, b)
matches = seqmatcher.get_matching_blocks()

# First run of lines shared by both texts; what precedes it is skipped.
offset_lines = matches[0]

# Measure the skipped prefix on lines that keep their terminators, so the
# character offsets stay exact even when a line break is not a single "\n"
# (splitlines() also splits on "\r\n", form feeds, etc.).
anon_offset = sum(
    len(line) for line in anonymized.splitlines(keepends=True)[: offset_lines.a]
)
orig_offset = sum(
    len(line) for line in original.splitlines(keepends=True)[: offset_lines.b]
)

aa = [t.text for t in nlp(anonymized[anon_offset:])]
bb = [t.text for t in nlp(original[orig_offset:])]

seqmatcher = SequenceMatcher(None, aa, bb)
matches = seqmatcher.get_matching_blocks()

for match in matches:
    print(match)
    print(aa[match.a : match.a + match.size])
    print(bb[match.b : match.b + match.size])
    print()

In [None]:
!wget -c https://raw.githubusercontent.com/instituciones-abiertas/ia2-cli/main/NER.md

In [None]:
import re 
from unidecode import unidecode

# Collect the upper-case label names mentioned in NER.md.
labels = set()
with open('NER.md', 'r') as file:
    for line in file:
        # Each line is transliterated to ASCII before matching [A-Z]+.
        labels.update(re.findall('^| ([A-Z]+) .*', unidecode(line)))

# The `^` alternative of the pattern matches every line with an empty group.
# discard() drops that artefact and, unlike remove(), tolerates an empty file.
# NOTE(review): the `|` is unescaped, so this is an alternation rather than a
# literal table pipe; any upper-case word between spaces is collected.
# Confirm whether r'^\| ([A-Z]+) ' was intended.
labels.discard('')

# Sorted so the resulting list is identical from run to run.
labels = [f'<{label}>' for label in sorted(labels)]

labels

In [None]:
import re
from difflib import SequenceMatcher, Differ
import pandas as pd

import textract
from more_itertools import zip_offset



def tokenize(text):
    """Split ``text`` into whitespace-delimited tokens, line by line.

    Args:
        text: string to tokenize.

    Returns:
        A flat list of tokens in document order; blank lines contribute
        nothing.
    """
    return [token for line in text.splitlines() for token in line.split()]


def aligner(anonymized: str, original: str):
    """Align an anonymized text with its original and label the replaced tokens.

    Steps:
      1. A line-level match finds the first run of lines both texts share;
         whatever precedes it in each text is treated as a header.
      2. Header tokens are paired right-aligned (last with last), padding the
         shorter side with ``""``; they carry no label.
      3. The remaining text is tokenized and matched token by token. Matching
         tokens carry no label. In a gap between two matches that contains
         exactly one anonymized-only token, that token is taken as the
         placeholder: every original-only token of the gap gets it as
         ``ia2_label`` and a ``B-``/``I-`` tag built from it (``<`` and ``>``
         stripped). Gaps with zero or several anonymized-only tokens are
         skipped, so their original tokens are not emitted.

    Args:
        anonymized: text of the anonymized document.
        original: text of the original document.

    Returns:
        A ``DataFrame`` with columns ``index``, ``original``, ``anonymized``,
        ``ia2_label`` and ``conll_label`` (``"O"`` for unlabelled tokens).
    """
    columns = ["original", "anonymized", "ia2_label", "conll_label"]

    def char_offset(text, n_lines):
        # Length of the first n_lines measured WITH their terminators, so the
        # offset is exact even when a line break is not a single "\n"
        # (splitlines() also splits on "\r\n", form feeds, ...).
        return sum(len(line) for line in text.splitlines(keepends=True)[:n_lines])

    # get_matching_blocks() always ends with a zero-size sentinel, so the
    # first element exists even when the texts share no line.
    first_shared = SequenceMatcher(
        None, anonymized.splitlines(), original.splitlines()
    ).get_matching_blocks()[0]
    anon_offset = char_offset(anonymized, first_shared.a)
    orig_offset = char_offset(original, first_shared.b)

    # Header: pair tokens from the end backwards, then restore reading order.
    records = [
        {
            "original": orig_token or "",
            "anonymized": anon_token or "",
            "ia2_label": None,
            "conll_label": None,
        }
        for orig_token, anon_token in zip_offset(
            reversed(tokenize(original[:orig_offset])),
            reversed(tokenize(anonymized[:anon_offset])),
            offsets=(0, 0),
            longest=True,
        )
    ]
    records.reverse()

    anon_tokens = tokenize(anonymized[anon_offset:])
    orig_tokens = tokenize(original[orig_offset:])
    matches = SequenceMatcher(None, anon_tokens, orig_tokens).get_matching_blocks()

    # Rows are accumulated in a list and turned into a frame once at the end;
    # concatenating frames inside the loop is quadratic in the token count.
    for block, next_block in zip(matches, matches[1:]):
        anon_end = block.a + block.size
        orig_end = block.b + block.size

        # Tokens present in both texts.
        records.extend(
            {
                "original": orig_token,
                "anonymized": anon_token,
                "ia2_label": None,
                "conll_label": None,
            }
            for orig_token, anon_token in zip(
                orig_tokens[block.b : orig_end], anon_tokens[block.a : anon_end]
            )
        )

        # Gap up to the next match: "-" entries exist only in the anonymized
        # text, "+" entries only in the original.
        diff = list(
            Differ().compare(
                anon_tokens[anon_end : next_block.a],
                orig_tokens[orig_end : next_block.b],
            )
        )
        placeholders = [entry[2:].strip() for entry in diff if entry.startswith("-")]
        replaced = [entry[2:].strip() for entry in diff if entry.startswith("+")]

        # Only an unambiguous one-placeholder gap is labelled.
        if len(placeholders) != 1:
            continue

        label = placeholders[0]
        tag = re.sub("<|>", "", label)
        records.extend(
            {
                "original": token,
                "anonymized": label,
                "ia2_label": label,
                "conll_label": f"{'B' if position == 0 else 'I'}-{tag}",
            }
            for position, token in enumerate(replaced)
        )

    # The explicit column list guarantees "conll_label" exists even when no
    # header and no placeholder were found.
    mapping = pd.DataFrame(records, columns=columns)
    mapping["conll_label"] = mapping["conll_label"].fillna("O")
    return mapping.reset_index()


def process(item):
    """Align the anonymized document of ``item`` with its original text.

    Args:
        item: mapping with ``doc.text`` (original text) and
            ``anonymized_path`` (file processed by textract as ODT).

    Returns:
        The result of ``aligner`` for the two texts.
    """
    original_text, odt_path = item["doc.text"], item["anonymized_path"]
    raw_bytes = textract.process(odt_path, extension="odt", output_encoding="utf-8")
    return aligner(raw_bytes.decode("utf-8"), original_text)

In [None]:
process(ia2_data.iloc[1])

In [None]:
import os

def mapping2conll(df, filename):
    """Write a token/label mapping to ``filename`` in a CoNLL-like column format.

    The file starts with a ``-DOCSTART- -X- O`` line, followed by one
    ``<token> -X- _ <label>`` line per row of ``df``.

    Args:
        df: frame with ``original`` (token) and ``conll_label`` columns.
        filename: output path; missing parent directories are created.
    """
    parent = os.path.dirname(filename)
    # dirname() is "" for a bare file name, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Explicit encoding: the tokens contain non-ASCII text and the platform
    # default is not guaranteed to be UTF-8.
    with open(filename, 'w', encoding='utf-8') as file:
        # NOTE(review): the header has three fields while token lines have
        # four — confirm this is what the downstream reader expects.
        print('-DOCSTART- -X- O', file=file)
        for text, label in zip(df['original'], df['conll_label']):
            print(f'{text} -X- _ {label}', file=file)


In [None]:
for _, row in tqdm(ia2_data.iterrows(), total=len(ia2_data)):
    filename = f"output/{row['tomo']}_{row['nro_registro']}.conll"
    mapping = process(row)
    mapping2conll(mapping, filename)