In [50]:
import json
import os
import shutil
import tempfile
import zipfile

import category_encoders as ce
import nltk
import numpy as np
import pandas as pd
import requests
from sklearn.preprocessing import OneHotEncoder

# Tratamento de Dados
-------
## Objetivos
```
Fazer o download dos datasets escolhidos e fazer o tratamento para desenvolvimento dos experimentos
```
## Etapas do tratamento
* `Download dos datasets`
* `Agrupar e filtrar as categorias de discurso de ódio escolhidas`
* `Separar todas as palavras de cada categoria`
* `Análise qualitativa das palavras mais importantes`
## Classes de discurso de ódio analisadas
| Classe    |
|-----------|
| Racismo   |
| Sexismo   | 
| Homofobia |

Também serão consideradas as amostras sem discurso de ódio

## Dataset: Fortuna et al., 2019
### A hierarchically-labeled portuguese hate speech dataset
```
FORTUNA, Paula et al. A hierarchically-labeled portuguese hate speech dataset. In: Workshop on Abusive Language Online, 3º, 2019, Florence. Proceedings of the Third Workshop on Abusive Language Online. Florence: ACL | ALW | WS, 2019. p. 94-104.
```

In [3]:
# Raw GitHub locations of the Fortuna et al. (2019) annotated corpus and of the
# CSV describing its class hierarchy. The long literals are split with implicit
# string concatenation; the resulting values are unchanged.
dataset_url = (
    'https://raw.githubusercontent.com/paulafortuna/'
    'Portuguese-Hate-Speech-Dataset/master/'
    '2019-05-28_portuguese_hate_speech_hierarchical_classification.csv'
)
classes_url = (
    'https://raw.githubusercontent.com/paulafortuna/'
    'Portuguese-Hate-Speech-Dataset/master/'
    'graph_hierarchical_classes.csv'
)

In [63]:
# Load the annotated corpus and the class-hierarchy table.
dataset = pd.read_csv(dataset_url)
classes = pd.read_csv(classes_url)
target_classes = ['Sexism', 'Homophobia', 'Racism']

# Keep a sample when it is not hate speech, or when it is labelled with at
# least one of the target classes.
# A single boolean mask selects each row at most once. Concatenating one slice
# per class instead repeats every sample that carries several target labels
# (e.g. both Sexism and Racism), which inflates the per-class statistics
# computed from this frame. Rows keep the order of the source file.
is_not_hate = dataset['Hate.speech'] == 0
has_target_class = dataset[target_classes].eq(1).any(axis='columns')

# Keep only the text and the label columns; .copy() detaches the result from
# `dataset` so later edits cannot trigger chained-assignment warnings.
final_dataset = dataset.loc[
    is_not_hate | has_target_class,
    ['text', 'Hate.speech', *target_classes],
].copy()

In [64]:
# Save the filtered dataset to disk.
fortuna_dir = os.path.join('data', 'fortuna')
# to_csv does not create missing parent folders, so make sure the output
# directory exists (a fresh checkout may not have it yet).
os.makedirs(fortuna_dir, exist_ok=True)
final_dataset.to_csv(os.path.join(fortuna_dir, 'fortuna.csv'))

In [74]:
 # get just words
# Maps each label column to the flat list of tokens of every sample selected
# for that label.
fortuna_words = {}

for _class in ['Hate.speech', *target_classes]:
    # 'Hate.speech' gathers the samples without hate speech (flag == 0);
    # every target class gathers its positive samples (flag == 1).
    wanted_flag = 0 if _class == 'Hate.speech' else 1
    class_texts = final_dataset.loc[final_dataset[_class] == wanted_flag, 'text']

    # Register the label up front so it is present in the result even when no
    # sample matches it.
    class_words = fortuna_words.setdefault(_class, [])

    # Plain iteration instead of DataFrame.apply: the loop exists only for its
    # side effect on `class_words`, and apply would also build a throw-away
    # Series of None values.
    for text in class_texts:
        # The corpus is Portuguese, so split sentences with the Portuguese
        # Punkt model rather than the default English one.
        class_words.extend(nltk.word_tokenize(text, language='portuguese'))

In [78]:
# Write the per-class word lists to disk as JSON.
words_path = os.path.join('data', 'fortuna', 'words.json')
# open() fails when the parent folder is missing, so create it first.
os.makedirs(os.path.dirname(words_path), exist_ok=True)
# Explicit encoding keeps the file identical across platforms; json.dump keeps
# its default ASCII escaping, so existing readers are unaffected.
with open(words_path, 'w', encoding='utf-8') as f:
    json.dump(fortuna_words, f, indent=4)

## Dataset: Ousidhoum et al., 2019
### Multilingual and multi-aspect hate speech analysis
```
OUSIDHOUM, Nedjma et al. Multilingual and multi-aspect hate speech analysis. arXiv preprint arXiv:1908.11049, 2019.
```

In [27]:
# Download the MLMA archive from GitHub and load its English split.
url = "https://github.com/HKUST-KnowComp/MLMA_hate_speech/blob/master/hate_speech_mlma.zip?raw=true"

# Work inside a throw-away directory: the archive and its extracted files are
# removed even when a step fails, and nothing in the working directory (such as
# an existing 'hate_speech_mlma' folder) is touched or deleted.
with tempfile.TemporaryDirectory() as tmp_dir:
    archive_path = os.path.join(tmp_dir, 'temp.zip')

    # Without a timeout requests waits indefinitely on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(archive_path, 'wb') as f:
            # Stream to disk in chunks instead of holding the archive in memory.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # `archive` rather than `zip`, to avoid shadowing the builtin.
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(tmp_dir)

    # The archive unpacks into a 'hate_speech_mlma' folder; the frame is fully
    # read into memory before the temporary directory is cleaned up.
    dataset = pd.read_csv(
        os.path.join(tmp_dir, "hate_speech_mlma", "en_dataset.csv")
    )

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   HITId                5647 non-null   int64 
 1   tweet                5647 non-null   object
 2   sentiment            5647 non-null   object
 3   directness           5647 non-null   object
 4   annotator_sentiment  5647 non-null   object
 5   target               5647 non-null   object
 6   group                5647 non-null   object
dtypes: int64(1), object(6)
memory usage: 308.9+ KB

{
        'sexual_orientation': 'Homophobia',
        'gender': 'Sexism',
        'origin': 'Racism'
    },
```

In [56]:
# Restrict the dataset to the tweets aimed at one of the target classes,
# keeping only the text and its target label.
target_classes = ['sexual_orientation', 'gender', 'origin']
in_target = dataset['target'].isin(target_classes)
final_dataset = dataset.loc[in_target, ['tweet', 'target']]

In [57]:
# One-hot encoder from category_encoders, created with its default settings.
encoder = ce.OneHotEncoder()

In [58]:
# Fit the encoder on the 'target' column and encode it in the same call.
# NOTE(review): `oh` presumably holds one indicator column per target value -
# confirm the generated column names before relying on them.
oh = encoder.fit_transform(final_dataset[['target']])

In [62]:
# Display the 'target' column (as a one-column DataFrame) to inspect the labels
# passed to the encoder.
final_dataset[['target']]

Unnamed: 0,target
0,origin
3,origin
4,gender
5,origin
6,origin
...,...
5639,origin
5640,origin
5641,origin
5643,origin
