# Unified dataset analysis

In [33]:
import sys
import pandas as pd
import numpy as np
import os.path
import re
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
from collections import Counter
from string import punctuation
from itertools import chain
import emoji

## Loading dataset

In [34]:
# Constants 
DATASET_FULLNAMES = {
    'conan': 'Multi-Target Counter Narrative Dataset to Fight Online Hate Speech (2021, Fanton et al.)',
    'mlma': 'Multilingual and Multi-Aspect Hate Speech Analysis (2019, Ousidhoum et al.)',
    'mmhs150k': 'Exploring Hate Speech Detection in Multimodal Publications (2019, Gomez et al.)',
    'waseem_hoovy': 'Are You a Racist or Am I Seeing Things? (2016, Waseem)',
    'waseem': 'Hateful Symbols or Hateful People? (2016, Waseem and Hovy)',
    'data_society': 'Data Society Twitter User Gender Classification',
    'twitter_kaggle': 'Twitter Sentiment Analysis (Analytics Vidhya)'
}

DATASET_SHORTNAMES = {
    'conan': 'Fanton et al. (2021)',
    'mlma': 'Ousidhoum et al. (2019)',
    'mmhs150k': 'Gomez et al. (2019)',
    'waseem_hoovy': 'Waseem and Hovy (2016)',
    'waseem': 'Waseem (2016)',
    'data_society': 'Kaggle (2016)',
    'twitter_kaggle': 'Analytics Vidhya (2019)'
}

In [35]:
# Load dataset
DATASET_PATH = '../datasets/unified_dataset.csv'

df_unified = pd.read_csv(DATASET_PATH, encoding='utf-8')

# Change original dataset names 
# df_unified['dataset_name'] = df_unified.dataset_name.apply(lambda x: DATASET_SHORTNAMES[x])

df_unified.head()

Unnamed: 0,text,target,dataset_name,original_target
0,‘Juice (Jews) are worse that nukes.’ Caption t...,religion,Fanton et al. (2021),JEWS
1,Foreigners on UK benefits leaps 41% in 5 years...,origin,Fanton et al. (2021),MIGRANTS
2,"Jews cannot be patriots, since their allegianc...",religion,Fanton et al. (2021),JEWS
3,"Israel, Zionism and Jewish Supremacist control...",religion,Fanton et al. (2021),JEWS
4,"Women are basically childlike, they remain thi...",gender,Fanton et al. (2021),WOMEN


In [36]:
df_unified.describe()

Unnamed: 0,text,target,dataset_name,original_target
count,90871,90871,90871,90871
unique,90871,6,7,24
top,‘Juice (Jews) are worse that nukes.’ Caption t...,non hate,Analytics Vidhya (2019),0
freq,1,56796,27491,27491


## Hate target (category) distribution

In [37]:
df_groupby = df_unified.groupby(['target'])['text'].count()

labels = df_groupby.keys()
values = df_groupby.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_layout(title_text="Hate target (category) distribution")
fig.show()

## Dataset composition (by original dataset)

In [38]:
df_groupby = df_unified.groupby(['dataset_name'])['text'].count()

labels = df_groupby.keys()
values = df_groupby.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
# fig.update_layout(title_text="Dataset composition (by original dataset) [Pie Chart]")
fig.show()

In [39]:
df_unified_grouped = df_unified.groupby(['dataset_name', 'target', 'original_target']).count()
df_unified_grouped_table = df_unified_grouped.add_suffix('_Count').reset_index()

d_colors = {
    'deleted': '#EBEBEB', 
    'disability': '#08A4B1', 
    'gender': '#F5CBDD', 
    'non hate': '#CAE5C3', 
    'origin': '#FFB770', 
    'other': '#C3C7C9', 
    'religion': '#A5CDE8', 
    'sexual_orientation': '#EF517F'
}

d_name = dict([(y,x) for x,y in enumerate(sorted(set(df_unified_grouped_table['dataset_name'])))])
df_unified_grouped_table['dn'] = [d_name[key] for key in df_unified_grouped_table['dataset_name']]
d_target = dict([(y,x + max(df_unified_grouped_table['dn']) + 1) 
                 for x,y in enumerate(sorted(set(df_unified_grouped_table['target'])))])
df_unified_grouped_table['t'] = [d_target[key] for key in df_unified_grouped_table['target']]
df_unified_grouped_table['colors'] = [d_colors[key] for key in df_unified_grouped_table['target']]

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 30,
      thickness = 20,
      label =  [*d_name.keys(), *d_target.keys()],
      color = "#5985D0"
    ),
    link = dict(
      source =  df_unified_grouped_table.dn,
      target =  df_unified_grouped_table.t,
      value =  df_unified_grouped_table['text_Count'],
      color =  df_unified_grouped_table['colors']
        
))])

fig.update_layout(title_text="Dataset composition (by original dataset) [Sankey diagram]")
fig.show()

## Dataset composition by category (including vs. excluding 'non hate')

In [40]:
df_unified_grouped = df_unified.groupby(['dataset_name', 'target', 'original_target']).count()
df_unified_grouped_table = df_unified_grouped.add_suffix('_Count').reset_index()
df_unified_grouped_table.original_target = [" " + ot for ot in df_unified_grouped_table.original_target]

labels = [*sorted(df_unified_grouped_table.target.unique()), *df_unified_grouped_table.original_target]
parents = [*["dataset" for x in range(len(df_unified_grouped_table.target.unique()))], *df_unified_grouped_table.target]
t = pd.array(df_unified_grouped_table.groupby(['target'])['text_Count'].sum())
values = [*t, *df_unified_grouped_table.text_Count]

sunburst_df = pd.DataFrame({'labels': labels, 'parents': parents, 'amount': values})

fig = go.Figure()

fig.add_trace(go.Sunburst(
    labels=sunburst_df.labels,
    parents=sunburst_df.parents,
    values=sunburst_df.amount,
    branchvalues="total",
    domain=dict(column=0)
))

# Dropping "non hate" for better visibility
sunburst_df = sunburst_df.drop( sunburst_df[(sunburst_df.parents == 'non hate') | (sunburst_df.labels == 'non hate')].index)

fig.add_trace(go.Sunburst(
    labels=sunburst_df.labels,
    parents=sunburst_df.parents,
    values=sunburst_df.amount,
    branchvalues="total",
    domain=dict(column=1)
))

fig.update_layout(
    title_text="Dataset composition by category (including vs. excluding 'non hate')",
    grid= dict(columns=2, rows=1),
    margin = dict(t=100, l=0, r=0, b=0),
)

fig.show()

## Preprocessing

### Removing handles

We removed handles specific for tweets such as retweet marks (`RT`) and user tags (`@user`). These handles, especially user tags could be used, to some extend, as hints/features indicating hate, however presence of these tags is not significant and consistent across individual original datasets, large portion is also anonymised (e.g. `@user`) and so on.

From results showing 10 most common handles, we are able to see possible connection of hate speech to controversial Twitter accounts or accounts representing some specific topic, e.g. political figures or religions.

In [41]:
def removeHandles(text):
    pattern = "RT|@[\w]+"
    return re.sub(pattern, "", text)

all_handles = []
for index, row in df_unified.iterrows():
    handles = re.findall("RT|@[\w]+", row.text)
    all_handles.extend(handles)

df_unified['clean_text'] = df_unified.text.apply(lambda x: removeHandles(x))
Counter(all_handles).most_common(10)

[('@user', 19180),
 ('RT', 3918),
 ('@URL', 1908),
 ('@realDonaldTrump', 297),
 ('@MaxBlumenthal', 259),
 ('@mykitchenrules', 204),
 ('@freebsdgirl', 185),
 ('@greenlinerzjm', 165),
 ('@IsraeliRegime', 144),
 ('@onedirection', 140)]

### Remove URLs

We removed URLs from texts since majority of them was anonymized (e.g. URLs were redirected through Twitter URL shortener which hides original URL). 

Twitter subdomain URLs like `t.co` could be retweets, replies or external link - we can't accurately tell the difference. They also mostly come from `Gomez et al. (2019)` dataset probably due to crawling technique or it was not preprocessed so we can't count links as some kind of relevant feature for hate indication. 

In [42]:
def find_urls(text):
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    urls = re.findall(pattern, text)
    return len(urls)

def remove_urls(text):
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return " ".join(re.sub(pattern, "", text).split())

df_unified['urls'] = df_unified['clean_text'].apply(lambda x: find_urls(x))
df_unified['clean_text'] = df_unified['clean_text'].apply(lambda x: remove_urls(x))

print(df_unified.groupby(['target'])['urls'].sum())
df_unified = df_unified.drop(columns=['urls'])

target
disability                0
gender                 6828
non hate              10321
origin                15025
religion                388
sexual_orientation        0
Name: urls, dtype: int64


## Punctuation

We counted punctuation in text, especially exclamation marks and question marks, which are believed to be used more extensively in hate speech. We can notice some differences across categories. We keep punctuation because it can help with context in models like BERT. 

In [43]:
df_unified['exclamationm_count'] = df_unified.clean_text.apply(lambda x: x.count('!'))
df_unified['questionm_count'] = df_unified.clean_text.apply(lambda x: x.count('?'))

df_sum_exclm = df_unified.groupby(['target'])['exclamationm_count'].apply(lambda x: x[x > 0].count())
df_sum_questm = df_unified.groupby(['target'])['questionm_count'].apply(lambda x: x[x > 0].count())

df_count = df_unified.groupby(['target']).count()

# Remove punctuation
# table = str.maketrans('', '', punctuation)
# df_unified.clean_text = df_unified.clean_text.apply(lambda x: x.translate(table))
df_unified = df_unified.drop(columns=['questionm_count', 'exclamationm_count'])

pd.DataFrame(data={
    '% with "!"': df_sum_exclm*100/df_count.text,
    '% with "?"': df_sum_questm*100/df_count.text
})

Unnamed: 0_level_0,"% with ""!""","% with ""?"""
target,Unnamed: 1_level_1,Unnamed: 2_level_1
disability,4.03481,7.753165
gender,9.484161,9.742559
non hate,18.488978,8.150222
origin,8.836758,8.744761
religion,5.668685,5.998899
sexual_orientation,5.209397,4.392237


## Splitting to words/tokens

We performed intial text preprocessing - lowercase, removing stopwords and non-alphabetical characters and split text to individual words/tokens.

In [44]:
# Lowercase
df_unified['clean_text'] = df_unified.clean_text.apply(lambda x: x.lower())

# Tokens
df_unified['words'] = df_unified.clean_text.apply(lambda x: word_tokenize(x))

# Remove non-alphabetic characters (, . ' # ...)
df_unified['words'] = df_unified.words.apply(lambda x: [word for word in x if word.isalpha()])

# Removing stop words
stop_words = stopwords.words('english') 
df_unified['words'] = df_unified.words.apply(lambda x: [word for word in x if not word in stop_words])

df_unified.head()

Unnamed: 0,text,target,dataset_name,original_target,clean_text,words
0,‘Juice (Jews) are worse that nukes.’ Caption t...,religion,Fanton et al. (2021),JEWS,‘juice (jews) are worse that nukes.’ caption t...,"[juice, jews, worse, caption, meme, showing, h..."
1,Foreigners on UK benefits leaps 41% in 5 years...,origin,Fanton et al. (2021),MIGRANTS,foreigners on uk benefits leaps 41% in 5 years...,"[foreigners, uk, benefits, leaps, years, hande..."
2,"Jews cannot be patriots, since their allegianc...",religion,Fanton et al. (2021),JEWS,"jews cannot be patriots, since their allegianc...","[jews, patriots, since, allegiance, always, st..."
3,"Israel, Zionism and Jewish Supremacist control...",religion,Fanton et al. (2021),JEWS,"israel, zionism and jewish supremacist control...","[israel, zionism, jewish, supremacist, control..."
4,"Women are basically childlike, they remain thi...",gender,Fanton et al. (2021),WOMEN,"women are basically childlike, they remain thi...","[women, basically, childlike, remain, way, liv..."


## Avg. number of words and text length 

Average number of words and text length per category didn't reveal any significant differences. We also noticed that number of words averages (performed on words cleaned from stopwords and non-alphabetic symbols) proportionally differ from raw text length averages (unprocessed original text) e.g. `non hate` have lowest avg. num. of words but almost highest avg. raw text length. Possible explanation is that in `non hate` posts, users use less individual words and e.g. more emojis, links and so on or they may also use longer words.

In [45]:
df_unified['word_count'] = df_unified.words.apply(lambda x: len(x))
word_count_mean = df_unified.groupby(['target']).word_count.mean()

df_unified['text_length'] = df_unified.text.apply(lambda x: len(x))
text_length_mean = df_unified.groupby(['target']).text_length.mean()

fig = go.Figure(go.Table(
        header=dict(values=['Target', 'Avg. num. of words', 'Avg. raw text length']),
        cells=dict(values=[
            word_count_mean.keys(), 
            round(word_count_mean, 2),
            round(text_length_mean, 2),
        ])
    ))

fig.update_layout(
    title_text="Average number of words and text length"
)

fig.show()

## Word frequency

We counted most common words (on already preprocessed words/token). Results showed that hate speech, as expected, contains wide variety of swear, racist and homophobic words targeting individual people or groups. However, we also observed that e.g. racial slurs may have not been always used in hateful context since frequency of these words is larger than whole `origin` subcategory.  

In [46]:
all_words = pd.Series(list(chain.from_iterable(df_unified['words'])))

word_frequency_table = pd.DataFrame(Counter(all_words).most_common(20),
                             columns=['word', 'frequency'])

In [47]:
fig1 = go.Figure(data=[go.Table(header=dict(values=['Word', 'Frequency']),
                 cells=dict(values=[word_frequency_table.word, word_frequency_table.frequency]))
                     ])
fig1.update_layout(
    title_text="Word frequency table"
)

fig2 = go.Figure(go.Bar(
            x=word_frequency_table.frequency.sort_index(ascending=False),
            y=word_frequency_table.word.sort_index(ascending=False),
            orientation='h'))
fig2.update_layout(
    title_text="Word frequency barchart"
)

fig1.show()
fig2.show()

## Word frequency per category

As we see on results, word frequency by category seems like a good indicator of hate speech targeted on certain category. We see that most common words in respective hate targets/categories are indeed being frequently used in these types of hate posts.

In [48]:
categories = df_unified.target.unique()

for category in categories:
    df_category = df_unified[df_unified['target'] == category]
    category_words = pd.Series(list(chain.from_iterable(df_category['words'])))
    word_frequency_table = pd.DataFrame(Counter(category_words).most_common(10),
                                columns=['word', 'frequency'])

    fig = go.Figure(go.Bar(
                x=word_frequency_table.frequency.sort_index(ascending=False),
                y=word_frequency_table.word.sort_index(ascending=False),
                orientation='h'))    
    fig.update_layout(
        title_text=f"'{category}' word frequency",
    )
    fig.show()

## N-grams

We performed term frequency using bigrams and trigrams. Results shows that it seems like a very good indicator in identification of hate speech targets (or hate speech in general) since bigrams and trigrams also consider context of swear words used (as oppose to basic keyword spotting) - one or two words behind or after some swear word which helps to identify if word was used deliberately in a harmful way or e.g. as a part of slang communication.

### Bi-grams

In [49]:
df_unified['bigrams'] = df_unified.words.apply(lambda x: list(ngrams(x, 2)))
all_bigrams = pd.Series(list(chain.from_iterable(df_unified['bigrams'])))

bigrams_frequency_table = pd.DataFrame(Counter(all_bigrams).most_common(20),
                             columns=['bigram', 'frequency'])
bigrams_frequency_table['bigram'] = bigrams_frequency_table.bigram.apply(lambda x: " ".join(x))

In [50]:
fig1 = go.Figure(data=[go.Table(header=dict(values=['Bigram', 'Frequency']),
                 cells=dict(values=[bigrams_frequency_table.bigram, bigrams_frequency_table.frequency]))
                     ])
fig1.update_layout(
    title_text="Bigram frequency table",
)

fig2 = go.Figure(go.Bar(
            x=bigrams_frequency_table.frequency.sort_index(ascending=False),
            y=bigrams_frequency_table.bigram.sort_index(ascending=False),
            orientation='h'))
fig2.update_layout(
    title_text="Bigram frequency barchart",
)

fig1.show()
fig2.show()

### Bi-grams per category

In [51]:
categories = df_unified.target.unique()

for category in categories:
    df_category = df_unified[df_unified['target'] == category]
    category_bigrams = pd.Series(list(chain.from_iterable(df_category['bigrams'])))
    bigrams_frequency_table = pd.DataFrame(Counter(category_bigrams).most_common(10),
                             columns=['bigram', 'frequency'])
    bigrams_frequency_table['bigram'] = bigrams_frequency_table.bigram.apply(lambda x: " ".join(x))

    fig = go.Figure(go.Bar(
                x=bigrams_frequency_table.frequency.sort_index(ascending=False),
                y=bigrams_frequency_table.bigram.sort_index(ascending=False),
                orientation='h'))    
    fig.update_layout(
        title_text=f"'{category}' bigram frequency",
    )
    fig.show()

### Tri-grams

In [52]:
df_unified['trigrams'] = df_unified.words.apply(lambda x: list(ngrams(x, 3)))
all_trigrams = pd.Series(list(chain.from_iterable(df_unified['trigrams'])))

trigrams_frequency_table = pd.DataFrame(Counter(all_trigrams).most_common(20),
                             columns=['trigram', 'frequency'])
trigrams_frequency_table['trigram'] = trigrams_frequency_table.trigram.apply(lambda x: " ".join(x))

In [53]:
fig1 = go.Figure(data=[go.Table(header=dict(values=['Trigram', 'Frequency']),
                 cells=dict(values=[trigrams_frequency_table.trigram, trigrams_frequency_table.frequency]))
                     ])
fig1.update_layout(
    title_text="Trigram frequency table",
)

fig2 = go.Figure(go.Bar(
            x=trigrams_frequency_table.frequency.sort_index(ascending=False),
            y=trigrams_frequency_table.trigram.sort_index(ascending=False),
            orientation='h'))
fig2.update_layout(
    title_text="Trigrams frequency barchart",
)

fig1.show()
fig2.show()

### Tri-grams per category

In [54]:
categories = df_unified.target.unique()

for category in categories:
    df_category = df_unified[df_unified['target'] == category]
    category_trigrams = pd.Series(list(chain.from_iterable(df_category['trigrams'])))
    bigrams_frequency_table = pd.DataFrame(Counter(category_trigrams).most_common(10),
                             columns=['trigram', 'frequency'])
    bigrams_frequency_table['trigram'] = bigrams_frequency_table.trigram.apply(lambda x: " ".join(x))

    fig = go.Figure(go.Bar(
                x=bigrams_frequency_table.frequency.sort_index(ascending=False),
                y=bigrams_frequency_table.trigram.sort_index(ascending=False),
                orientation='h'))    
    fig.update_layout(
        title_text=f"'{category}' trigram frequency",
    )
    fig.show()

## Stemmer

We also performed frequency on word stems which improved frequency precision of certain words, especially adjectives. We compared stem vs. word frequency in a table.

In [55]:
porterStemmer = PorterStemmer()
df_unified['stemmed'] = df_unified.words.apply(lambda x: [porterStemmer.stem(word) for word in x])

all_stems = pd.Series(list(chain.from_iterable(df_unified['stemmed'])))

stem_frequency_table = pd.DataFrame(Counter(all_stems).most_common(10),
                             columns=['stem', 'frequency'])

In [56]:
fig = go.Figure(go.Table(
        header=dict(values=['Stem', 'Frequency', 'Word', 'Frequency']),
        cells=dict(values=[
            stem_frequency_table.stem, 
            stem_frequency_table.frequency,
            word_frequency_table.word,
            word_frequency_table.frequency
        ])
    ))
fig.update_layout(
    title_text="Word vs. word stem frequency comparison",
)

fig.show()

### Stems per category

In [57]:
categories = df_unified.target.unique()

for category in categories:
    df_category = df_unified[df_unified['target'] == category]
    porterStemmer = PorterStemmer()

    category_stems = pd.Series(list(chain.from_iterable(df_category['stemmed'])))
    stem_frequency_table = pd.DataFrame(Counter(category_stems).most_common(10),
                             columns=['stem', 'frequency'])

    fig = go.Figure(go.Bar(
                x=stem_frequency_table.frequency.sort_index(ascending=False),
                y=stem_frequency_table.stem.sort_index(ascending=False),
                orientation='h'))    
    fig.update_layout(
        title_text=f"'{category}' stem frequency",
    )
    fig.show()

## Hashtags

Hashtags also seems like a good indication of targeted hate speech, however a lot of them are connected to some currently ongoing event which will likely disapear in the future so usage of such feature in model would benefit only for limited period of time. Since hashtags change frequently model would have to be retrained regurarly, which is probably also neccesary for regular text however people language changes a lot slower than hashtags.

In [58]:
hashtag_regex = re.compile(r"#(\w+)")
df_unified['hashtags'] = df_unified.text.apply(lambda x: hashtag_regex.findall(x))

all_hashtags = pd.Series(list(chain.from_iterable(df_unified['hashtags'])))

hashtag_frequency_table = pd.DataFrame(Counter(all_hashtags).most_common(20),
                             columns=['hashtag', 'frequency'])

In [59]:
fig = go.Figure(go.Bar(
            x=hashtag_frequency_table.frequency.sort_index(ascending=False),
            y=hashtag_frequency_table.hashtag.sort_index(ascending=False),
            orientation='h'))    
fig.show()

### Hashtags per category

In [60]:
categories = df_unified.target.unique()

for category in categories:
    df_category = df_unified[df_unified['target'] == category]
    category_hashtags = pd.Series(list(chain.from_iterable(df_category['hashtags'])))
    hashtag_frequency_table = pd.DataFrame(Counter(category_hashtags).most_common(10),
                             columns=['hashtag', 'frequency'])

    fig = go.Figure(go.Bar(
                x=hashtag_frequency_table.frequency.sort_index(ascending=False),
                y=hashtag_frequency_table.hashtag.sort_index(ascending=False),
                orientation='h'))    
    fig.update_layout(
        title_text=f"'{category}' hashtag frequency",
    )
    fig.show()

### Emojis

In [61]:
all_emojis = []
all_emojis_text = []

def parseEmojis(text):
    for word in text.split(' '):
        if emoji.is_emoji(word):
            all_emojis.append(word)
            emoji_text = emoji.demojize(word)
            all_emojis_text.append(emoji_text)
        else:
            return None

df_unified.clean_text.apply(lambda x: parseEmojis(x))

# Substitute emojis with textual form
df_unified['clean_text'] = df_unified.clean_text.apply(lambda x: emoji.demojize(x))

emoji_frequency_table = pd.DataFrame(Counter(all_emojis).most_common(10),
                         columns=['emoji', 'frequency'])
fig = go.Figure(go.Bar(
            x=emoji_frequency_table.frequency.sort_index(ascending=False),
            y=emoji_frequency_table.emoji.sort_index(ascending=False),
            orientation='h'))    
fig.update_layout(title_text="emoji frequency")
fig.show()

In [62]:
# Making more cleaned 'clean_text_stems' column
df_unified['clean_text_stems'] = df_unified.stemmed.apply(lambda x: ' '.join(x))

# Dropping unnecessary columns
df_unified = df_unified.drop(
    ['words', 'word_count', 'text_length', 'bigrams', 'trigrams', 'stemmed', 'hashtags', 'original_target'],
    axis=1)

In [63]:
df_unified.isnull().sum(axis = 0)


text                0
target              0
dataset_name        0
clean_text          0
clean_text_stems    0
dtype: int64

In [64]:
df_unified.to_csv('../datasets/unified_dataset_preprocessed.csv', index=False)