In [None]:
%load_ext autoreload
%autoreload 1
%aimport aymurai.pattern.fuzzymatch
%aimport aymurai.pattern.fuzzytagging

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import spacy
from spacy import displacy

import locale 

# Switch every locale category of this process to Argentine Spanish.
# NOTE(review): presumably needed so the `%B` month-name directive used by the
# date patterns later in the notebook matches Spanish month names — confirm.
# Raises locale.Error when 'es_AR.UTF-8' is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')

In [None]:
# Load the preprocessed annotation table and display it. Later cells read the
# violence category columns, `frases_agresion` and `path` from this frame.
data = pd.read_csv('/resources/data/preprocessed.csv')
data


In [None]:
# Annotation columns describing the kinds of violence recorded for a document.
violence_cats = [
    "violencia_de_genero",
    "v_fisica",
    "v_psic",
    "v_econ",
    "v_sex",
    "v_soc",
    "v_amb",
    "v_simb",
    "v_polit",
]
bool_violence_cats = ["have:" + cat for cat in violence_cats]

# One boolean `have:<category>` column per category: True only where the raw
# annotation is exactly the string "si" (missing values compare as False).
for cat, flag_col in zip(violence_cats, bool_violence_cats):
    data[flag_col] = data[cat] == "si"

# Despite the `have:` prefix this is a per-row count of flagged categories,
# not a boolean.
data["have:violence"] = data[bool_violence_cats].sum(axis=1)

# Placeholder values meaning "no aggression phrase recorded". Anything else,
# including a missing value, is flagged as having a phrase.
frases_categories = ['no_corresponde', 'no corresponde', 'sin frases', 's/d']
data['have:frase'] = ~data['frases_agresion'].isin(frases_categories)


# extract full text

In [None]:
import os
import textract
from zipfile import BadZipFile
import unicodedata

def get_fulltext(path: str) -> str:
    """Extract the plain text of an ODT document.

    Args:
        path: Filesystem path of the document. Non-string values (for example
            the NaN pandas uses for an empty cell) are tolerated.

    Returns:
        The document text decoded as UTF-8 and NFKD-normalized, or one of two
        sentinel strings: ``"missing"`` when ``path`` is not a string or does
        not point to a regular file, and ``"corrupted"`` when the extractor
        raises ``BadZipFile`` or ``KeyError``.
    """
    # isfile rather than exists: a directory path must be reported as missing
    # instead of being handed to the extractor, where it would raise an error
    # that is not caught below.
    if not isinstance(path, str) or not os.path.isfile(path):
        return "missing"
    try:
        raw = textract.process(path, extension='odt')
    except (BadZipFile, KeyError):
        # Presumably: not a valid zip container, or an expected archive member
        # is absent — both are treated as an unreadable document.
        return "corrupted"
    # NFKD applies compatibility decomposition, so accented letters become a
    # base letter followed by a combining mark.
    return unicodedata.normalize('NFKD', raw.decode('utf-8'))


In [None]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Extract the text of every document with a pool of 10 workers; tqdm shows
# progress as the paths are handed out.
get_fulltext_ = delayed(get_fulltext)
parallel = Parallel(n_jobs=10)
extraction_jobs = (get_fulltext_(path) for path in tqdm(data['path']))
data['fulltext'] = parallel(extraction_jobs)

## mark corrupt or missing files

In [None]:
# A row is valid unless text extraction returned one of the two sentinel strings.
data['valid_file'] = ~data['fulltext'].isin(['corrupted', 'missing'])
# Number of usable documents.
data['valid_file'].sum()

## Filter out invalid data

In [None]:
# Drop, in place, every row whose `valid_file` flag is False.
data.query('valid_file', inplace=True)
# Independent copy of the filtered table; not used in the cells visible here.
predict = data.copy()
data

In [None]:
# Pick one document to experiment with. `.loc` is label-based, so 696 is an
# index label of the table, not a positional row number.
register = data.loc[696]
register

In [None]:
from aymurai.pattern.fuzzytagging import FuzzyEntityTagger, FuzzyDateEntityTagger

# Blank Spanish pipeline; used below to build the Doc the fuzzy taggers run on.
es = spacy.blank('es')
# Pretrained Spanish pipeline; used below to split the aggression quotes into
# sentences and to render their dependency parse.
nlp = spacy.load('es_dep_news_trf')

In [None]:
# Annotated aggression phrases of the selected document.
text = register['frases_agresion']
# text = text_normalize(text)
quotes = nlp(text)

# Show the sentence segmentation, then the dependency parse of the quotes.
print(list(quotes.sents))
displacy.render(quotes, 'dep', options={'collapse_punct': False, 'compact': True})

In [None]:
# Fuzzy entity tagger loaded with the values annotated for the selected document.
tagger = FuzzyEntityTagger()
# Each sentence of the annotated aggression phrases is registered as a pattern.
tagger.add('agression_quote', [str(quote) for quote in quotes.sents])
# Underscores in the stored case number are swapped for slashes before matching.
tagger.add('n_expte_eje', [register['n_expte_eje'].replace('_', '/')])
# The annotated signature plus one hard-coded name.
# NOTE(review): the '_' -> '/' replacement on the signature looks copied from
# the case-number line above — confirm it is intended.
tagger.add('firma', [register['firma'].replace('_', '/'), 'Pablo C. Casas'])
tagger.add('materia', ['contravencional'])

In [None]:
date_tagger = FuzzyDateEntityTagger()

# Resolution date, written with or without a zero-padded day.
# NOTE(review): `reference` / `dd_max` presumably restrict matches to dates
# within 5 days of the annotated one — confirm in FuzzyDateEntityTagger.
date_tagger.add(
    'fecha_resolucion',
    [r'%-d de %B de %Y', r'%d de %B de %Y'],
    reference=pd.to_datetime(register['date']),
    dd_max=pd.Timedelta(5, unit='D'),
)

# Start and closing hours share the same pattern and a 5-minute tolerance; the
# annotated time is anchored on an arbitrary fixed day to build a Timestamp.
for hora_field in ('hora_de_inicio', 'hora_de_cierre'):
    date_tagger.add(
        hora_field,
        [r'%H[\.:]%M horas'],
        reference=pd.Timestamp(f"1900/01/01 {register[hora_field]}"),
        dd_max=pd.Timedelta(5, unit='m'),
    )
# date_tagger.add('hora_inicio', [r'%H[\.:]%M horas'])
# date_tagger.add('date', [r'%d de %B de %Y'], pd.Timestamp('11/03/2017'))

In [None]:
# Tokenize the full text with the blank pipeline, apply the entity tagger and
# then the date tagger, and render the resulting entities.
doc = date_tagger(tagger(es(register['fulltext'])))
displacy.render(doc, 'ent')

In [None]:
# Number of tokens in the tagged document.
len(doc)

In [None]:
# Rows of the filtered table whose case number is 20751_2017.
data.query('n_expte_eje == "20751_2017"')

In [None]:
# Run the same taggers on another row (index label 2141) and render the result.
# NOTE(review): the taggers carry patterns built from `register`; presumably
# this row belongs to the same case — confirm.
a = data.loc[2141]
doc = date_tagger(tagger(es(a['fulltext'])))
displacy.render(doc, 'ent')