In [None]:
import os, re
import numpy as np
import pandas as pd
from glob import glob, iglob


In [None]:
# Load the preprocessed CSV and discard its 'path' column
# (a 'path' column is brought back by the merge with `docs` further down).
data = pd.read_csv('/resources/data/preprocessed.csv').drop(columns='path')


def get_registro(path: str) -> dict:
    """Derive document metadata from a file path.

    Args:
        path: Path of the document file. It is only parsed, never opened.

    Returns:
        A dict with the keys, in this order:
          - "path": the input path, unchanged.
          - "doc_name": base file name without its extension.
          - "doc_extension": the extension, including the leading dot.
          - "year": first 4-digit path component enclosed by "/" on both
            sides in the directory part, as a string, or None. The directory
            that directly contains the file has no trailing "/" in
            ``os.path.dirname`` and is therefore not matched.
          - "nro_registro": first "_"-separated token of the normalised name
            (whitespace replaced by "_", a leading ``<LETTER><digits>_``
            prefix stripped).
          - "tomo": first run of digits in the second token, as a string, or
            None when there is no second token or it has no digits.
    """
    item = {"path": path}

    filename, ext = os.path.splitext(os.path.basename(path))
    item["doc_name"] = filename
    item["doc_extension"] = ext

    year_matches = re.findall(r"/(\d{4})/", os.path.dirname(path))
    item["year"] = year_matches[0] if year_matches else None

    # Normalise the name so that both "3658_39 CAUSA" and "A1_3658_39" style
    # names split into <nro_registro>_<tomo>_...
    normalised = re.sub(r"\s", "_", filename)
    normalised = re.sub(r"^[A-Z]\d+_", "", normalised)
    tokens = normalised.split("_")
    item["nro_registro"] = tokens[0]

    # A name without any "_" has no tomo token: report None instead of
    # raising IndexError, so the caller can drop the row with dropna().
    tomo_matches = re.findall(r"\d+", tokens[1]) if len(tokens) > 1 else []
    item["tomo"] = tomo_matches[0] if tomo_matches else None

    return item


# Every regular file matched by a recursive "*.*" glob under the document
# root taken from the environment, in sorted order.
paths = sorted(
    filter(
        os.path.isfile,
        iglob(
            f"{os.environ['AYMURAI_RESTRICTED_DOCUMENT_PATH']}/**/*.*",
            recursive=True,
        ),
    )
)

# One metadata row per file. Rows missing any of the three parsed fields are
# dropped before the columns are cast to their final dtypes.
docs = pd.DataFrame(list(map(get_registro, paths)))
docs = docs.dropna(subset=["nro_registro", "tomo", "year"]).astype(
    {"nro_registro": str, "year": int, "tomo": int}
)

# Attach the file metadata to the annotation rows (inner join on the keys).
data = pd.merge(data, docs, on=["nro_registro", "tomo"])
paths = data["path"]

# Keep an untouched copy of the merged table; later cells restart from it.
annotations = data.copy()
annotations


In [None]:
# Plotly: express API for the figures below, and "notebook" selected as the
# default renderer.
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"


In [None]:
# Histogram over the 'date' column, colored by 'materia'.
px.histogram(
    data_frame=data,
    x="date",
    color="materia",
)

In [None]:
# Columns whose value "si" marks the presence of a kind of violence.
violence_cats = [
    "violencia_de_genero",
    "v_fisica",
    "v_psic",
    "v_econ",
    "v_sex",
    "v_soc",
    "v_amb",
    "v_simb",
    "v_polit",
]
# Name of the boolean companion column for each category.
bool_violence_cats = ["have:" + cat for cat in violence_cats]

# A flag is True exactly where the source column equals "si".
for cat, flag_column in zip(violence_cats, bool_violence_cats):
    data[flag_column] = data[cat] == "si"

# Number of flags set on each row.
data["have:violence"] = data[bool_violence_cats].sum(axis=1)


In [None]:
# Distribution of the per-row flag count, colored by the
# "have:violencia_de_genero" flag.
px.histogram(
    data_frame=data,
    x="have:violence",
    color="have:violencia_de_genero",
    title="nro de formas de violencia presentes en cada caso",
)


In [None]:
# Every boolean flag column except "have:violencia_de_genero".
# Filtering the list (instead of a set difference) keeps the declared column
# order, so the frame fed to the pie is the same on every kernel restart;
# set ordering of strings is hash-randomised between interpreter runs.
cats = [cat for cat in bool_violence_cats if cat != "have:violencia_de_genero"]

# Total number of True values per flag column; after reset_index the flag
# name is in column "index" and the total in column 0.
data_ = data[cats].sum()
data_ = pd.DataFrame(data_).reset_index()
px.pie(
    data_,
    values=0,
    names="index",
    title="porcentage de formas de violencia presentes en total de datos",
)


## check

In [None]:
# Check whether the same phrase appears in different registros: count the
# distinct nro_registro values per phrase, most widely shared phrase first.
registros_per_phrase = data.groupby(["frases_agresion"]).agg(
    {"nro_registro": "nunique"}
)
duplicated = registros_per_phrase.sort_values("nro_registro", ascending=False)
duplicated


In [None]:
# Rows whose 'frases_agresion' is exactly this phrase.
phrase = "hija de puta me estas cagando vas a ver lo que te va a pasar no tengo nada que perder nadie te va a poder salvar"
caso = data[data["frases_agresion"] == phrase]
caso

In [None]:

# Per-expediente summary. Rows without a date are excluded from every
# aggregate below, including the two counts.
data_ = data.dropna(subset=["date"])
expedientes = (
    data_.groupby("n_expte_eje")
    .agg(
        {
            "nro_registro": "nunique",  # distinct registros in the expediente
            "have:violence": "sum",  # flag counts summed over all its rows
        }
    )
    .rename(
        columns={
            "nro_registro": "nro_registros",
            "have:violence": "violence",
        }
    )
)

# Parse the dates once for the whole column (the original re-parsed them per
# group, twice) and group the parsed values by expediente.
dates_by_expediente = data_.assign(date=pd.to_datetime(data_["date"])).groupby(
    "n_expte_eje"
)["date"]

# Span between the earliest and the latest date of each expediente.
expedientes["tiempo_entre_registros_maxmin"] = (
    dates_by_expediente.max() - dates_by_expediente.min()
)

# Mean gap between consecutive dates. The dates must be sorted first:
# diff() on rows in file order can produce negative gaps and makes the mean
# depend on row order. Expedientes with a single row yield NaT.
# NOTE(review): several rows of the same registro presumably share one date
# and contribute zero-length gaps — confirm whether rows should be
# de-duplicated per registro before averaging.
expedientes["tiempo_entre_registros_mean"] = dates_by_expediente.agg(
    lambda dates: dates.sort_values().diff().mean()
)
expedientes


In [None]:
# Registros per expediente, colored by whether the summed violence count is
# non-zero.
has_violence = expedientes["violence"].astype(bool)
px.histogram(
    data_frame=expedientes,
    x="nro_registros",
    color=has_violence,
    title="numero de resoluciones por caso",
)


# registros repetidos

In [None]:
# Restart from the untouched merged table (this discards the "have:*"
# columns added to `data` by earlier cells).
data = annotations.copy()

# Number of distinct values per column for each document path, ordered by
# the distinct count of column 'n'.
# A first grouping by ['nro_registro', 'n_expte_eje', 'link'] used to be
# computed here and immediately overwritten; that dead computation was removed.
duplicated = data.groupby(['path']).nunique().sort_values(by=['n'], ascending=False)
duplicated

In [None]:
# Blank out every count equal to 1, then keep only the columns in which at
# least one path has a count other than 1.
counts_without_ones = duplicated.replace(1, np.nan)
counts_without_ones.dropna(axis=1, how='all')

In [None]:
# Paths associated with more than one distinct n_expte_eje.
duplicated[duplicated["n_expte_eje"] > 1]

# filtros

In [None]:
# Show cell contents without truncation. None is the supported "no limit"
# value; the former sentinel -1 has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# All rows that belong to this one document.
data.query('path == "/resources/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/2021/TOMO 39_JULIO _21/3658_39 CAUSA 134381_21.pdf"')

In [None]:
# Show cell contents without truncation. None is the supported "no limit"
# value; the former sentinel -1 has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# All rows that belong to this one document.
data.query('path == "/resources/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf/2022/TOMO_49_MAYO_2022/4435_49_10_05_2022_189bis  CAUSA 133492_2021.pdf"')

In [None]:
# Back to pandas' default row limit for displayed frames.
pd.reset_option('display.max_rows')


def get_unique_values(serie):
    """Return all elements of ``serie`` as a list, in iteration order.

    Despite the name, nothing is de-duplicated or filtered: repeated values
    are kept, so the length of the result equals the number of elements.
    """
    return [value for value in serie]

# One row per (nro_registro, tomo, link); every cell starts out as the list
# of the values found in that group.
simplified = data.groupby(['nro_registro', 'tomo', 'link']).agg(get_unique_values)

# Columns whose lists never hold more than one element are unwrapped to the
# bare value (None for an empty list); the remaining columns are recorded.
multivalue_columns = []
for column in simplified.columns:
    max_len = simplified[column].map(len).max()
    if not max_len <= 1:
        multivalue_columns.append(column)
        continue
    simplified[column] = simplified[column].apply(
        lambda values: values[0] if values else None
    )

multivalue_columns

In [None]:
def _len(value):
    """Size of a cell: the length of a list, 1 for any other value."""
    return len(value) if isinstance(value, list) else 1


# Same shape as `simplified`, holding the size of each cell.
value_sizes = simplified.applymap(_len)

In [None]:
# Lift the row limit so the printed series are not truncated.
pd.set_option('display.max_rows', None)
simplified_ = simplified.reset_index()

# For each multi-valued column, print its name, the cells that hold more
# than one value, and a separator line.
for column in multivalue_columns:
    mask = value_sizes[column] > 1
    print(column, simplified.loc[mask, column], '----', sep='\n')

    


In [None]:
# Build one record per document, i.e. per (nro_registro, tomo, path) group,
# holding its expediente number and all of its annotation rows.
database = []
for (nro_registro, tomo, path), group in annotations.groupby(['nro_registro', 'tomo', 'path']):
    # Skip documents whose rows disagree on n_expte_eje.
    # nunique(dropna=False) treats all missing values as ONE value, whereas
    # len(set(...)) counts each NaN of a float column as a distinct value and
    # would wrongly drop documents whose n_expte_eje is simply missing.
    if group['n_expte_eje'].nunique(dropna=False) > 1:
        print(f'droping document {path} (multiple n_expte_eje)')
        continue
    n_expte_eje = group['n_expte_eje'].values[0]
    d = {
        'path': path,
        'nro_registro': nro_registro,
        'tomo': tomo,
        'n_expte_eje': n_expte_eje,
        # number of annotation rows in the group
        'nro_registros': len(group),
        'annotations': group.to_dict('records'),
    }
    database.append(d)



In [None]:
# The n_expte_eje values that are not missing.
annotations.loc[annotations['n_expte_eje'].notna(), 'n_expte_eje']

In [None]:
# Display the whole `database` list of per-document records.
# NOTE(review): this prints every annotation of every document; the source
# paths point at a "restricted" folder — confirm the output may be saved/shared.
database

In [None]:
# Cells of one column that hold more than one value.
column = 'genero_acusado/a'
mask = value_sizes[column].gt(1)
simplified.loc[mask, column]

In [None]:
# Presumably the columns expected to hold a single value per document.
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook — confirm it is still needed.
ASSUME_UNIQUE = ['nro_registro', 'tomo', 'n_expte_eje', 'firma']

In [None]:
# Lift the row limit so the full result below is displayed.
pd.set_option('display.max_rows', None)

# `_len` is the helper defined earlier in this notebook (length of a list,
# 1 for anything else). The identical second definition that used to live in
# this cell was removed so there is a single source of truth.
value_sizes = simplified.applymap(_len)

# Largest cell size per column; show the columns that never exceed one value.
value_sizes_max = value_sizes.max()
value_sizes_max[value_sizes_max == 1]

In [None]:
# Pull one entry out of `simplified` by label for manual inspection.
# NOTE(review): 1147 is a hard-coded label; `simplified` is indexed by
# (nro_registro, tomo, link), so this presumably selects nro_registro 1147 —
# confirm that index level is not string-typed, or the int key will not match.
case = simplified.loc[1147]
case

# Casos con violencia

In [None]:
# Expedientes with more than one registro and a violence count above 1.
multiple_registros = expedientes["nro_registros"] > 1
violence_above_one = expedientes["violence"] > 1
expedientes_ = expedientes[multiple_registros & violence_above_one]

# Whole days between the earliest and the latest date of each expediente.
span_days = expedientes_["tiempo_entre_registros_maxmin"].dt.days
px.histogram(
    data_frame=expedientes_,
    x=span_days,
    color="nro_registros",
    title="dias entre primer y ultimo registro del caso",
)


In [None]:
# Expedientes with more than one registro and a violence count above 1.
multiple_registros = expedientes["nro_registros"] > 1
violence_above_one = expedientes["violence"] > 1
expedientes_ = expedientes[multiple_registros & violence_above_one]

# Whole days of the mean gap stored in "tiempo_entre_registros_mean".
mean_gap_days = expedientes_["tiempo_entre_registros_mean"].dt.days
px.histogram(
    data_frame=expedientes_,
    x=mean_gap_days,
    color="nro_registros",
    title="tiempo medio entre 2 registros consecutivos",
)


## modalidades

In [None]:
# Distinct values present in the 'modalidad_de_la_violencia' column.
set(data['modalidad_de_la_violencia'].values)

In [None]:
# Take the row at position 300 among those whose modalidad is "domestica"
# (raises IndexError when fewer than 301 such rows exist), print its
# document path and display the row.
is_domestica = data["modalidad_de_la_violencia"] == "domestica"
registro = data[is_domestica].iloc[300]
print(registro['path'])
registro