## Import libraries

In [None]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

from utilities.data_preprocessors import simple_preprocess, preprocess, re_encode_targets
from utilities.data_loaders import load_data
from utilities.data_visualizers import view_sentence

import nltk
# fetch the NLTK 'stopwords' and 'wordnet' corpora; presumably required by the
# stop-word removal and lemmatization done in the preprocessing helpers — TODO confirm
nltk.download('stopwords')
nltk.download('wordnet')


## Load datasets: hate-offensive-speech, slur-corpus, ethos, reddit

In [None]:
# read the tweet and slur corpora into dataframes
tweets_csv = './data/hate-speech-and-offensive-language/hate-offensive-speech.csv'
slurs_csv = './data/slur-corpus/kurrek.2020.slur-corpus.csv'

tweets = load_data(tweets_csv)
# the second argument names the dataset for the project loader
slurs = load_data(slurs_csv, 'slur-corpus')

# display the tweet dataframe as the cell output
tweets

In [None]:
# inspect the dtype of the tweet label column
tweets['class'].dtype

In [None]:
# display the slur dataframe as loaded
slurs

## Explore dataset


In [None]:
# list the distinct label values of each dataset (tweets first, then slurs)
for label_column in (tweets['class'], slurs['gold_label']):
    print(label_column.unique())

In [None]:
# show how many rows each label has in each dataset (tweets first, then slurs)
for label_column in (tweets['class'], slurs['gold_label']):
    print(label_column.value_counts())

## Clean data with nans

In [None]:
# positional indices of the slur rows whose gold_label is missing
missing_label = slurs['gold_label'].isnull()
nan_idxs = np.flatnonzero(missing_label)
nan_idxs

In [None]:
# view the rows with a missing gold_label (nan_idxs are positions, hence iloc)
slurs.iloc[nan_idxs]

#### some rows with nans in gold_label have significant slur words attached to the comment so replace gold_label with DEG

In [None]:
# cast the comment and label columns to str
for text_column in ('body', 'gold_label'):
    slurs[text_column] = slurs[text_column].astype('str')

# manually set the gold_label of the relevant comments (valid classes are
# DEG, NDG, HOM, CMP, and APR); these index labels were picked by inspection
rows_to_mark_deg = [5733, 31815]
slurs.loc[rows_to_mark_deg, 'gold_label'] = 'DEG'

In [None]:
# spot-check the Python type of one row's body after the str cast
type(slurs.loc[5734, 'body'])

In [None]:
# spot-check the Python type of one row's gold_label after the str cast
type(slurs.loc[5734, 'gold_label'])

In [None]:
# re-inspect the previously missing-label rows after the cast and manual relabel
slurs.iloc[nan_idxs]

#### since all rows with nan in gold_label that contain significant slurs have been relabeled, drop the remaining rows that have nan in the comment or in gold_label

In [None]:
# identify rows with nan in either the comment or the label.
# After the earlier astype('str') a missing value is normally the literal
# string 'nan'; isnull() is tested as well so that real missing values are
# still caught if the cast preserved them.
missing_label = slurs['gold_label'].isnull() | (slurs['gold_label'] == 'nan')
missing_body = slurs['body'].isnull() | (slurs['body'] == 'nan')
# np.where yields positional indices (not index labels)
nan_rows = np.where(missing_label | missing_body)[0]
nan_rows

In [None]:
# drop rows with nan in comment or label.
# nan_rows holds positions (it comes from np.where) while DataFrame.drop
# expects index labels, so translate positions to labels first; this stays
# correct even if the index is not a clean 0..n-1 range.
slurs.drop(slurs.index[nan_rows], inplace=True)
# renumber the index so it is contiguous again after the removal
slurs.reset_index(drop=True, inplace=True)

In [None]:
# verify that no missing labels remain. gold_label was cast to str earlier,
# so a missing label shows up as the string 'nan' and isnull() alone could
# never flag it; test both representations.
nan_idxs_new = np.where(
    slurs['gold_label'].isnull() | (slurs['gold_label'] == 'nan')
)[0]
# expected to display an empty dataframe
slurs.iloc[nan_idxs_new]

In [None]:
# distinct labels and their row counts after cleaning
cleaned_labels = slurs['gold_label']
print(cleaned_labels.unique())
print(cleaned_labels.value_counts())

In [None]:
# display the slur dataframe after dropping the nan rows
slurs

## Preprocess text
- remove trailing whitespaces
- remove non-alphanumeric characters
- lower sentences
- tokenize
- remove stop words
- lemmatize or stem word
- encode target column to numbers

<u>or use gensim.utils.simple_preprocess as callback of self.apply()</u>

In [None]:
# run the project's preprocess helper on every tweet and store the result back
# (presumably performs the cleaning steps listed above — TODO confirm in utilities)
tweets['tweet'] = tweets['tweet'].apply(preprocess)


In [None]:
# run the project's simple_preprocess helper on every tweet
# NOTE(review): this runs on the output of the preprocess cell above if both
# cells are executed — confirm the two helpers are meant to be chained
tweets['tweet'] = tweets['tweet'].apply(simple_preprocess)

In [None]:
# run the project's preprocess helper on every slur-corpus comment
slurs['body'] = slurs['body'].apply(preprocess)

In [None]:
# inspect the preprocessed comments with the project viewer
# (limit=50 presumably caps how many are shown — TODO confirm)
view_sentence(slurs['body'], limit=50)

In [None]:
# run the project's simple_preprocess helper on every slur-corpus comment
# NOTE(review): this runs on the output of the preprocess cell above if both
# cells are executed — confirm the two helpers are meant to be chained
slurs['body'] = slurs['body'].apply(simple_preprocess)

## Encode target/y labels/outputs/columns of the slur dataset
- recall that it has the following labels and their respective counts
- ['DEG' 'NDG' 'HOM' 'CMP' 'APR']
- DEG    20532
- NDG    16727
- HOM     1998
- APR      553
- CMP      189

In [None]:
# map the string labels of the slur dataset to integer codes
encoder = LabelEncoder()
encoder.fit(slurs['gold_label'])
slurs['gold_label'] = encoder.transform(slurs['gold_label'])

# show the resulting codes and how many rows carry each one
encoded_labels = slurs['gold_label']
print(encoded_labels.unique())
print(encoded_labels.value_counts())

In [None]:
# display the slur dataframe with integer-encoded labels
slurs

summary:
- DEG has label 2, NDG has label 4, HOM has label 3, APR has label 0, and CMP has label 1
- encode noise labels to just non-derogatory label instead which is 4
- or encode non-derogatory label to 1 and just think of 1 as now the non-derogatory label instead of noise

In [None]:
# fold the non-derogatory label (4) into label 1, so 1 now stands for
# non-derogatory. Assign to the gold_label column only: indexing the
# dataframe with just the boolean mask would overwrite every column of the
# matching rows (including the comment body) with 1.
slurs.loc[slurs['gold_label'] == 4, 'gold_label'] = 1
print(slurs['gold_label'].unique())
print(slurs['gold_label'].value_counts())

## re-encode labels of hate tweet dataset
- hate class currently 0 can be lumped in with derogatory class of slur dataset, so encode to 2
- offensive class currently 1 can be lumped in with appropriative class of slur dataset, so encode to 0
- neither class currently 2 can be lumped in with non derogatory of slur dataset, so encode to 4

In [None]:
# re-map every tweet class label with the project's re_encode_targets helper
# NOTE(review): the exact mapping lives in the helper — confirm it matches the
# label scheme used for the slur dataset
tweets['class'] = tweets['class'].apply(re_encode_targets)
# display the tweet dataframe with the re-encoded labels
tweets

In [None]:
# spot-check the preprocessed content of the first tweet
tweets.loc[0, 'tweet']

## rename columns of both slur and tweet datasets and combine

## Split input/independent and output/dependent columns/features