In [None]:
import numpy as np
import pandas as pd
from glob import glob
from detoxify import Detoxify

import locale 

# Switch every locale category (LC_ALL) of the process to 'es_AR.UTF-8'.
# NOTE(review): setlocale raises locale.Error when that locale is not installed on the host,
# and none of the visible cells obviously depends on it - confirm it is still required.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')

In [None]:
# Load the preprocessed dataset from an absolute path and display it.
# Later cells read the annotation columns listed in `violence_cats`, plus
# 'frases_agresion' and 'path', from this frame.
data = pd.read_csv('/resources/data/preprocessed.csv')
data


In [None]:
# Annotation columns that are compared against "si" to build the boolean flags below.
violence_cats = [
    "violencia_de_genero",
    "v_fisica",
    "v_psic",
    "v_econ",
    "v_sex",
    "v_soc",
    "v_amb",
    "v_simb",
    "v_polit",
]
bool_violence_cats = ["have:" + cat for cat in violence_cats]

# One boolean column per category: True exactly where the annotation equals "si".
for cat, flag_col in zip(violence_cats, bool_violence_cats):
    data[flag_col] = data[cat] == "si"

# Number of categories flagged on each row.
data["have:violence"] = data[bool_violence_cats].sum(axis=1)

# Values of 'frases_agresion' that are treated as "no phrase" for the row.
frases_categories = ['no_corresponde', 'no corresponde', 'sin frases', 's/d']
data['have:frase'] = ~data['frases_agresion'].isin(frases_categories)


# Extract full text

In [None]:
import os
import textract
from zipfile import BadZipFile
import unicodedata

def get_fulltext(path: str) -> str:
    """Extract the text of the document stored at ``path``.

    Args:
        path: Filesystem path of the document; it is handed to textract
            with ``extension='odt'``.

    Returns:
        The UTF-8 decoded, NFKD-normalised text of the document, or one of
        two sentinel strings: ``"missing"`` when ``path`` is not a string or
        does not exist on disk, and ``"corrupted"`` when extraction raises
        ``BadZipFile`` or ``KeyError``.
    """
    path_is_usable = isinstance(path, str) and os.path.exists(path)
    if not path_is_usable:
        return "missing"

    try:
        raw_bytes = textract.process(path, extension='odt')
        return unicodedata.normalize('NFKD', raw_bytes.decode('utf-8'))
    except (BadZipFile, KeyError):
        return "corrupted"


In [None]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Extract the text of every document with a pool of 10 joblib workers.
# The tqdm bar advances as paths are consumed by the pool.
get_fulltext_ = delayed(get_fulltext)
parallel = Parallel(n_jobs=10)
extraction_jobs = (get_fulltext_(doc_path) for doc_path in tqdm(data['path']))
data['fulltext'] = parallel(extraction_jobs)

## Mark corrupt or missing files

In [None]:
# A file is valid unless its extracted text is one of get_fulltext's sentinel strings.
invalid_markers = ['corrupted', 'missing']
data['valid_file'] = ~data['fulltext'].isin(invalid_markers)
# Number of valid files.
data['valid_file'].sum()

## Filter out invalid data

In [None]:
# Keep only the rows whose file was read successfully, then snapshot the filtered frame.
data = data.query('valid_file')
predict = data.copy()
data

# Categorize documents with toxicity metrics

In [None]:
import torch

# Load the multilingual Detoxify model once; the scoring cells below reuse this instance.
# Prefer the GPU, but fall back to the CPU so the notebook still runs on hosts without CUDA.
detoxify = Detoxify('multilingual', device='cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Score a handful of short phrases to eyeball how the model behaves.
input_text = [
    'estupido',
    'te amo',
    'sos un pelotudo',
    'eres una pelotuda',
    "hijo de puta",
    'que boludo',
    'jajaja me mata lo boludo que sos, es re gracioso',
]
results = detoxify.predict(input_text)

# One row per phrase, one column per score; the cell displays the rows as records.
sample_scores = pd.DataFrame(results, index=input_text)
sample_scores.to_dict('records')

In [None]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from more_itertools import chunked
import re

def doc_max_toxicity(text: str):
    """Summarise the Detoxify scores of every utterance in a document.

    The text is split on newlines and periods, blank fragments are dropped,
    and the remaining utterances are scored in batches of 10 with the
    module-level ``detoxify`` model. Despite the function's name, the full
    ``describe()`` summary is returned, not only the maximum.

    Args:
        text: Full text of one document.

    Returns:
        ``DataFrame.describe().to_dict()`` of the per-utterance scores, i.e.
        ``{score_name: {statistic: value}}``. An empty dict is returned when
        the document contains no non-blank utterance.
    """
    utterances = [piece for piece in re.split(r'\n|\.', text) if piece.strip()]
    if not utterances:
        # pd.concat raises ValueError on an empty list, so a document without
        # any usable text is reported as "no scores" instead of aborting the run.
        return {}

    batch_scores = [pd.DataFrame(detoxify.predict(batch)) for batch in chunked(utterances, 10)]
    return pd.concat(batch_scores).describe().to_dict()

# Score every document's full text; yields one summary dict per row of `data`, in row order.
toxicity = list(map(doc_max_toxicity, tqdm(data['fulltext'])))
    

In [None]:
import re
# Show the annotated phrase and the line-split full text of one row flagged by 'have:frase'.
idx = 5
mask = data['have:frase']
rows_with_phrase = data.loc[mask]
print(rows_with_phrase['frases_agresion'].iloc[idx])
print('=' * 80)
rows_with_phrase['fulltext'].iloc[idx].split('\n')

In [None]:
# Score a single long passage on its own to see what the model returns for it.
sample_statement = 'Fiscal: la presente causa se inicia con la denuncia el 18 de octubre del corriente en la Comisaría 45, de la señora L. M. R. R., quien manifiesta que el día anterior tomó conocimiento por parte de su hija E. A. S., de 17 años de edad, que dicho día se encontraba sola en el interior de un aula de la Escuela Técnica No, sita en de esta ciudad, finalizando un examen, y que su profesor de taller de nombre G. le preguntó “¿Ya tuviste relaciones sexuales?”, “¿Tenes novio?”, “¿Te parezco fachero?”; para luego referirle que le quería enseñar algo, tomar su mano y apoyarla sobre su pantalón, más precisamente sobre sus genitales, en momentos en que estaría teniendo una erección, tras lo cual la menor salió corriendo del aula.'
detoxify.predict([sample_statement])

In [None]:
import json

# Map each document path to its toxicity summary and persist the mapping as JSON.
json_ = dict(zip(data['path'], toxicity))

with open('detoxify-docs-out.json', 'w') as out_file:
    json.dump(json_, out_file)

In [None]:
import json

# Reload the per-document toxicity summaries written by the previous cell.
with open('detoxify-docs-out.json', 'r') as in_file:
    json_ = json.loads(in_file.read())

# Analyse toxicity

In [None]:
def get_stat(path: str, stat: str = 'max'):
    """Return one summary statistic of a document's toxicity scores as a one-row frame.

    The scores are looked up in the module-level ``json_`` mapping, whose
    values are ``{score_name: {statistic: value}}`` dicts.

    Args:
        path: Key of the document in ``json_``.
        stat: Row label of the statistic to select (``'max'`` by default).

    Returns:
        A one-row DataFrame indexed by ``stat`` with one column per score
        plus a ``'path'`` column, or ``None`` when ``json_`` holds no scores
        for ``path``.

    Raises:
        KeyError: If ``stat`` is not among the statistics stored for ``path``.
    """
    df = pd.DataFrame(json_.get(path, []))
    if df.empty:
        return None
    # Select the row as a frame rather than a Series: appending the string path
    # to a numeric Series would turn every score into an object-dtype value.
    return df.loc[[stat], :].assign(path=path)

# One single-row frame (or None) per document stored in json_, using get_stat's default statistic.
max_doc_scores = [get_stat(doc_path) for doc_path in tqdm(json_)]

In [None]:
# Stack the per-document rows and attach the annotation columns of `data` through 'path'.
toxicity = pd.concat(max_doc_scores, ignore_index=True).merge(data, on='path')
toxicity

In [None]:
# Values of 'frases_agresion' that are treated as "no phrase" for the row.
# Each marker needs its own comma: Python concatenates adjacent string literals,
# which would silently merge two markers into a single one that matches nothing.
aux = ['no corresponde', 'no_corresponde', 'sin frases', 's/d']
toxicity['have:frase'] = ~toxicity['frases_agresion'].isin(aux)

In [None]:
import plotly.io as pio
import plotly.express as px
# Make "notebook" plotly's default renderer for the figures shown in the cells below.
pio.renderers.default = "notebook"

In [None]:
# Score columns analysed in the rest of the notebook.
toxic_categories = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
    'sexual_explicit',
]

# Row-wise aggregates across all score columns.
category_scores = toxicity[toxic_categories]
toxicity_mean = category_scores.mean(axis=1)
toxicity_max = category_scores.max(axis=1)

# Distribution of the row-wise maximum score, split by the 'have:frase' flag.
px.violin(toxicity, x='have:frase', y=toxicity_max, box=True)

In [None]:
# Pairwise scatter plots of every score column, coloured by the 'have:frase' flag.
px.scatter_matrix(toxicity, dimensions=toxic_categories, color='have:frase')

In [None]:
from umap import UMAP
import plotly.express as px

features = toxicity[toxic_categories]

# Both projections share the same initialisation and seed; only the target dimensionality differs.
umap_settings = {'init': 'random', 'random_state': 0}
umap_2d = UMAP(n_components=2, **umap_settings)
umap_3d = UMAP(n_components=3, **umap_settings)

proj_2d = umap_2d.fit_transform(features)
proj_3d = umap_3d.fit_transform(features)

# Colour every projected point by the 'have:frase' flag of its row.
colouring = {'color': toxicity['have:frase'], 'labels': {'color': 'have:frase'}}
fig_2d = px.scatter(proj_2d, x=0, y=1, **colouring)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, **colouring)
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()
