In [None]:
import numpy as np
import pandas as pd
from glob import glob

In [None]:
# Load the spreadsheet and normalise it in a single chain:
#   1. drop rows where any of the three key columns is missing,
#   2. lower-case every column name,
#   3. cast the (now lower-cased) `tomo` column to int.
target = (
    pd.read_excel(
        '../../resources/data/set_de_datos_con_perspectiva_de_genero - set_de_datos_con_perspectiva_de_genero.ods',
    )
    .dropna(subset=['NRO_REGISTRO', 'TOMO', 'FECHA_RESOLUCION'])
    .rename(columns=lambda name: name.lower())
    .assign(tomo=lambda frame: frame['tomo'].astype(int))
)

target

# Download data

In [None]:
import os
import gdown
import subprocess
from hashlib import blake2b
from tqdm.auto import tqdm
from joblib import Parallel, delayed

# Destination directory for the downloaded documents; created up front so the
# download workers can write into it straight away.
outdir = '/resources/data/documents'
os.makedirs(outdir, exist_ok=True)

# Register tqdm's pandas integration.
tqdm.pandas()

def get_file(url: str):
    """Download the document behind ``url`` into ``outdir`` and return its path.

    The local file name is the hex BLAKE2b digest (15 bytes) of the URL, so a
    given URL always maps to the same file and an already-downloaded file is
    reused without contacting the network.

    Args:
        url: Link to the document. Non-string values (e.g. NaN from an empty
            spreadsheet cell) are skipped.

    Returns:
        The local file path, or ``None`` when ``url`` is not a string, when
        gdown reports "Access denied", or when no file exists on disk after
        the download attempt.

    Raises:
        OSError: If the ``gdown`` executable cannot be launched.
    """
    if not isinstance(url, str):
        return None

    fname = f'{outdir}/{blake2b(url.encode(), digest_size=15).hexdigest()}'

    if os.path.exists(fname):
        return fname

    # Some files require access permissions, others are corrupt.
    # The command is passed as an argument list (no shell) so that characters
    # such as '&', ';' or spaces inside the URL reach gdown untouched instead
    # of being interpreted by the shell.
    cmd = ['gdown', '--fuzzy', '-q', '--continue', '-O', fname, url]
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so the message check sees everything
        text=True,
        errors='replace',
        check=False,
    )
    if 'Access denied' in result.stdout:
        return None

    # A failed download leaves nothing at `fname`; do not hand back a path
    # that points to a file which does not exist.
    if not os.path.exists(fname):
        return None
    return fname


# Fan the downloads out over a pool of 50 threads; tqdm wraps the link column
# so that progress is shown as the jobs are dispatched.
get_file_ = delayed(get_file)
parallel = Parallel(backend='threading', n_jobs=50)
download_jobs = (get_file_(link) for link in tqdm(target['link']))
path = parallel(download_jobs)

# Store the per-row download results next to the originating link.
target['path'] = path


# Set date

In [None]:
def to_datetime(value):
    """Parse a ``day_month_year`` string into a pandas ``Timestamp``.

    The year may be written with two digits (parsed with ``%y``) or any other
    length (parsed with ``%Y``).

    Args:
        value: Raw cell value. Only strings are parsed.

    Returns:
        A ``Timestamp`` on success; ``pd.NaT`` when the fields pass the range
        guard but do not form a date that matches the format (e.g. 31 February);
        ``None`` when ``value`` is not a string, does not consist of exactly
        three ``_``-separated fields, has a non-numeric day or month, or has
        a day above 31 or a month above 12.
    """
    if not isinstance(value, str):
        return None

    parts = value.split('_')
    # Anything that is not exactly day_month_year cannot be interpreted;
    # reject it instead of letting tuple unpacking raise ValueError.
    if len(parts) != 3:
        return None
    day, month, year = parts

    try:
        out_of_range = int(day) > 31 or int(month) > 12
    except ValueError:
        # Non-numeric day or month.
        return None
    if out_of_range:
        return None

    year_format = '%y' if len(year) == 2 else '%Y'
    # An explicit format is supplied, so the deprecated
    # `infer_datetime_format` flag is unnecessary and has been dropped.
    return pd.to_datetime(value, format=f'%d_%m_{year_format}', errors='coerce')
    
# Convert every raw resolution date with `to_datetime` and keep the result in
# a new `date` column.
parsed_dates = target['fecha_resolucion'].apply(to_datetime)
target['date'] = parsed_dates
target

In [None]:
# Persist the preprocessed table as CSV, without writing the row index.
target.to_csv(
    path_or_buf='/resources/data/preprocessed.csv',
    index=False,
)