In [None]:
import numpy as np
import pandas as pd
from glob import glob

import locale 

# Switch the whole process locale to Argentine Spanish.
# Presumably required so the '%B' date formats used further down can parse
# Spanish month names such as "noviembre" — TODO confirm.
# Raises locale.Error if 'es_AR.UTF-8' is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')

In [None]:
# Load the preprocessed table. Later cells read its 'path', 'date' and
# 'fecha_resolucion' columns.
data = pd.read_csv('/resources/data/preprocessed.csv')
data


# extract full text

In [None]:
import os
import textract
from zipfile import BadZipFile

def get_fulltext(path: str) -> str:
    """Extract the text of the document stored at ``path``.

    The file is always handed to textract as an ``odt`` document and the
    resulting bytes are decoded as UTF-8.

    Args:
        path: Filesystem path of the document. Non-string values (for
            example a missing value in the table) are accepted.

    Returns:
        The decoded text, or one of two sentinel strings:
        ``"missing"`` when ``path`` is not a string or does not exist, and
        ``"corrupted"`` when extraction raises ``BadZipFile`` or ``KeyError``.
    """
    path_is_usable = isinstance(path, str) and os.path.exists(path)
    if not path_is_usable:
        return "missing"

    try:
        raw_bytes = textract.process(path, extension='odt')
        return raw_bytes.decode('utf-8')
    except (BadZipFile, KeyError):
        return "corrupted"


In [None]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Extract the text of every file listed in data['path'] using 10 joblib jobs.
# tqdm wraps the iterable of paths, so the bar follows tasks being handed to
# joblib (presumably not their completion — TODO confirm).
parallel = Parallel(n_jobs=10)
get_fulltext_ = delayed(get_fulltext)
data['fulltext'] = parallel(get_fulltext_(path) for path in tqdm(data['path']))

## mark corrupt or missing files

In [None]:
# A file is valid unless its extracted text is one of the two sentinel strings.
data['valid_file'] = ~data['fulltext'].isin(['corrupted', 'missing'])
# Number of valid files.
data['valid_file'].sum()

## Filter out invalid data

In [None]:
# Keep only the rows whose document could be read. `data` is rebound to the
# filtered frame instead of being filtered with inplace=True, so the frame
# displayed by earlier cells is not mutated behind the reader's back.
data = data.query('valid_file')
# Independent copy that receives the predicted dates, so data['date'] stays
# untouched for the comparison further down.
predict = data.copy()
data

In [None]:
# Scratch check of the parse-then-isna pattern: the string is a well-formed
# d/m/Y date, so this evaluates to False. With errors="coerce" an unparseable
# string becomes NaT instead of raising, which would give True.
pd.isna(pd.to_datetime('1/2/2009', format="%d/%m/%Y", errors="coerce"))

In [None]:
import unicodedata

def text_normalize(text: str) -> str:
    """Canonicalise ``text`` and strip characters that get in the way of matching.

    The text is NFC-normalised, then every character in one of these Unicode
    general categories is dropped:

    * ``Lo`` (other letters, e.g. the ordinal indicator in "1º"),
    * ``So`` (other symbols),
    * ``Po`` (other punctuation, e.g. "," and "."),
    * any ``C*`` category (control, format, ...), except whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    dropped_categories = {'Lo', 'So', 'Po'}

    def keep(char: str) -> bool:
        category = unicodedata.category(char)
        if category in dropped_categories:
            return False
        # General categories are two-letter codes ('Cc', 'Cf', ...), so a
        # comparison against a bare 'C' never matches; test the major class.
        # Whitespace controls (newline, tab) are kept: removing them would glue
        # neighbouring words together.
        return not category.startswith('C') or char.isspace()

    return ''.join(char for char in unicodedata.normalize('NFC', text) if keep(char))

In [None]:
import re
import pandas as pd
import datetime

# Same locale call as in the notebook's first cell, repeated here — presumably
# so this cell also works when run on its own; otherwise redundant (TODO confirm).
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')


def compile_regex_list(regex: list[str]):
    """Compile several patterns into one regex that matches any of them.

    Args:
        regex: Pattern strings; they are joined with ``|`` in the given order.

    Returns:
        The compiled alternation.
    """
    alternation = '|'.join(regex)
    return re.compile(alternation)

# Matches "<day> de <month> de <year>": a 1-2 digit day, a single-word month
# and a 2-4 digit year, exposed as the named groups day / month / year.
REGEX_DATE = compile_regex_list([
    r"(?P<day>\d\d?)\s+de\s+(?P<month>\w+)\s+de\s+(?P<year>\d\d\d?\d?)",
])

def to_datetime(str_date: str, formats: list[str]):
    """Parse ``str_date`` with the first format that accepts it.

    Args:
        str_date: Date string to parse.
        formats: strftime-style formats, tried in order.

    Returns:
        The parsed timestamp from the first matching format, or ``pd.NaT``
        when no format matches (including when ``formats`` is empty).
    """
    # Lazy generator: formats after the first successful one are never tried.
    attempts = (
        pd.to_datetime(str_date, format=fmt, errors='coerce')
        for fmt in formats
    )
    return next((parsed for parsed in attempts if not pd.isna(parsed)), pd.NaT)
        


def date_getter(text: str):
    """Return the first parseable "<day> de <month> de <year>" date in ``text``.

    The text is cleaned with ``text_normalize`` and scanned with ``REGEX_DATE``.
    Each hit is joined as "day/month/year" and parsed with '%d/%B/%Y', then
    '%d/%B/%y'. '%B' is a full month name, so parsing depends on the active
    locale (this notebook sets es_AR.UTF-8).

    Args:
        text: Full text of a document.

    Returns:
        The timestamp of the first hit that parses; ``None`` when the regex
        finds nothing; ``pd.NaT`` when hits exist but none of them parses.
    """
    matches = REGEX_DATE.findall(text_normalize(text))
    if not matches:
        return
    # Take the first hit that parses, not blindly the first hit: a spurious
    # phrase that merely looks like a date must not hide a real date after it.
    for match in matches:
        date = to_datetime('/'.join(match), formats=['%d/%B/%Y', '%d/%B/%y'])
        if not pd.isna(date):
            return date
    return pd.NaT
    

# Predicted date for every document, stored on `predict` so that data['date']
# remains available for the comparison in the next section.
predict['date'] = data['fulltext'].apply(date_getter)

### Check errors: compare labelled dates against predictions

In [None]:
# Rows where the labelled date and the regex prediction differ.
# NOTE(review): `data` is loaded with pd.read_csv without parse_dates in this
# notebook — confirm data['date'] is datetime-typed, otherwise no row can match.
# NOTE(review): rows without a prediction presumably count as mismatches, since
# missing values do not compare equal — confirm.
mask = data['date'] == predict['date']
not_matched = data.loc[~mask]
# Raises ZeroDivisionError if `data` is empty.
print(f'not matched: {len(not_matched)} from {len(data)} ({100*len(not_matched)/len(data):.2f}%)')

In [None]:
# Debug cell: walk a sample header containing an ordinal day ("1º") through the
# extraction steps one at a time.
text = """Buenos Aires,   1º de noviembre de 2017.
Para resolver en la causa Nº 20221/15, en trámite por ante este Juzgado en lo Penal, Contravencional y de Faltas N° 10, a mi cargo, caratulada “Legajo de juicio en autos R. E. J. Cesar s/ inf. art. 149 bis - CP”.
Antecedentes del caso
ANTECEDENTES:
"""
print(unicodedata.normalize('NFKD', text))
# str.replace returns a new string, so the result must be assigned; otherwise
# the "º" stays in place and REGEX_DATE cannot match "1º de noviembre".
text = text.replace('º', '')
matches = REGEX_DATE.findall(text)
print(matches)
str_dates = ['/'.join(match) for match in matches]
print(str_dates)
dates = [to_datetime(str_date, formats=['%d/%B/%Y', '%d/%B/%y']) for str_date in str_dates]
print(dates)


In [None]:
# Manual inspection of one mismatched row.
#
# Indices with human errors:
# mistyped date (the prediction is correct):
# >> most are off by only a few days; could it be the date the resolution was uploaded?
# 372, 689, 690, 691, 692, 699, 757, 774, 790, 5241
#
# day not typed: 674, 765, 767, 772, 780, 799, 800, 802, 808, 809, 818, 989
# no date: 984
# anonymized date: 5199
idx = -80  # position within not_matched; negative values count from the end
row = not_matched.iloc[idx]
row = pd.DataFrame(row).T  # turn the row Series back into a one-row DataFrame
print(row.index.values)
print('date', row['date'].values)
print('fecha resolucion:', row['fecha_resolucion'].values)
print('prediction', date_getter(row['fulltext'].iloc[0]))
print()
print(row['fulltext'].iloc[0])
# print(text_normalize(row['fulltext'].iloc[0]))

In [None]:
# Scratch check: raw regex hits on the document at position 3. The text is not
# passed through text_normalize here, unlike in date_getter.
date = REGEX_DATE.findall(data['fulltext'].iloc[3])
date

In [None]:
# Full text of the document at position 3, the one the regex scratch check uses.
data['fulltext'].iloc[3]