In [13]:
import pandas as pd
import numpy as np
import os.path
import plotly.graph_objects as go
from collections import Counter
from nltk.corpus import stopwords

In [14]:
# Directory that holds one tab-separated '<key>.csv' file per dataset
SPLIT_DATASETS_PATH = '../datasets/split/'

# Loaded frames keyed by dataset id; every slot starts as None until the CSVs are read
dataset = dict.fromkeys((
    'conan',
    'mlma',
    'mmhs150k',
    'waseem_hoovy',
    'waseem',
    'data_society',
    'twitter_kaggle',
))

# Full titles (with citation) used in figure titles
DATASET_FULLNAMES = dict(
    conan='Multi-Target Counter Narrative Dataset to Fight Online Hate Speech (2021, Fanton et al.)',
    mlma='Multilingual and Multi-Aspect Hate Speech Analysis (2019, Ousidhoum et al.)',
    mmhs150k='Exploring Hate Speech Detection in Multimodal Publications (2019, Gomez et al.)',
    waseem='Are You a Racist or Am I Seeing Things? (2016, Waseem)',
    waseem_hoovy='Hateful Symbols or Hateful People? (2016, Waseem and Hovy)',
    data_society='Data Society Twitter User Gender Classification',
    twitter_kaggle='Twitter Sentiment Analysis (Analytics Vidhya)',
)

# Short citation-style labels for each dataset
DATASET_SHORTNAMES = dict(
    conan='Fanton et al. (2021)',
    mlma='Ousidhoum et al. (2019)',
    mmhs150k='Gomez et al. (2019)',
    waseem='Waseem (2016)',
    waseem_hoovy='Waseem and Hovy (2016)',
    data_society='Kaggle (2016)',
    twitter_kaggle='Analytics Vidhya (2019)',
)

# Loading datasets from csv: every file is tab-separated; its own header row is
# consumed (header=0) and the columns are renamed to 'text' and 'target'
for key in dataset:
    csv_path = f'{SPLIT_DATASETS_PATH}{key}.csv'
    dataset[key] = pd.read_csv(csv_path, sep="\t", header=0, quotechar='"', names=['text', 'target'])

In [15]:
# Initial cleanup: for every dataset drop incomplete rows, rows whose text is
# empty after trimming, and rows that are exact duplicates of an earlier row.
for key in dataset.keys():
    raw = dataset[key]

    # Removing rows that contain NaN/Null in any column
    without_nan = raw.dropna()
    nan_count = len(raw) - len(without_nan)
    if nan_count > 0: print(f'NaNs in {key}: {nan_count}')

    # Trimming whitespaces in 'text' column.
    # assign() builds a new frame instead of writing into the frame returned by
    # dropna(); astype(str) keeps a purely numeric text value from breaking the strip.
    stripped = without_nan.assign(text=without_nan['text'].astype(str).str.strip())

    # Removing rows with an empty 'text'
    is_empty = stripped['text'] == ''
    empty_count = int(is_empty.sum())
    if empty_count > 0: print(f'Empty texts in {key}: {empty_count}')
    non_empty = stripped[~is_empty]

    # Removing duplicates (only if entire row is the same), keeping the first occurrence
    duplicates = non_empty[non_empty.duplicated(keep='first')]
    if len(duplicates) > 0: print(f'Duplicates in {key}: {len(duplicates)}\n')
    dataset[key] = non_empty.drop_duplicates()

Duplicates in conan: 1282

Duplicates in mmhs150k: 78

NaNs in data_society: 216
Duplicates in data_society: 1574

Duplicates in twitter_kaggle: 2461



In [16]:
# Describing each used dataset (see related work for more details)

# English stop words, stored as a set so that each `word not in stop_words` check
# is a hash lookup instead of a scan over the whole word list
stop_words = set(stopwords.words('english'))

def mostCommonWords(df, top_n=100, ignored_words=None):
    """Return the most frequent words of df['text'] as (word, count) pairs.

    Texts are lower-cased and split on whitespace. Ignored words are removed
    BEFORE ranking, so they cannot crowd real words out of the top `top_n`
    (filtering after truncation could return far fewer entries than requested).

    Parameters:
        df: DataFrame with a 'text' column; missing values are skipped.
        top_n: maximum number of (word, count) pairs to return.
        ignored_words: iterable of lower-case words to exclude; when None the
            notebook-level `stop_words` collection is used.

    Returns:
        List of (word, count) tuples ordered by descending count; ties keep
        the order in which the words first appear.
    """
    if ignored_words is None:
        ignored_words = stop_words
    ignored = set(ignored_words)

    # Count row by row rather than joining the whole corpus into one huge string
    counter = Counter()
    for text in df['text'].dropna().astype(str).str.lower():
        counter.update(word for word in text.split() if word not in ignored)
    return counter.most_common(top_n)
        
# For every dataset: pie chart of the label distribution, describe() summary and
# the ten most frequent words. `idx` counts the processed datasets; nothing in
# this cell reads it.
idx = 0
for key, df in dataset.items():
    df_groupby = df.groupby(['target'])['text'].count()
    labels = df_groupby.index
    values = df_groupby.values

    pie = go.Pie(labels=labels, values=values, textinfo='label+percent')
    fig = go.Figure(data=[pie])
    fig.update_layout(title_text=f'{DATASET_FULLNAMES[key]} [{key}]')
    idx += 1
    fig.show()

    print(df.describe())

    print('\n=== Most common words: \n')
    top_ten = mostCommonWords(df)[:10]
    for word, occurrences in top_ten:
        print(word, occurrences)

                                                     text   target
count                                                3721     3721
unique                                               3718        8
top     Migrants are not welcome here, they are not ev...  MUSLIMS
freq                                                    2      984

=== Most common words: 

muslims 432
women 410
islam 354
people 317
jews 315
want 261
migrants 210
immigrants 188
society. 181
country. 177


                                                     text  target
count                                                5647    5647
unique                                               5647       6
top     If America had another 8 years of Obama's ideo...  origin
freq                                                    1    2448

=== Most common words: 

@user 4418
@url 2326
retarded 514
retard 493
faggot 453
like 382
cunt 378
shithole 356
ching 333
fucking 319


                                                     text   target
count                                              149012   149012
unique                                             148973        6
top     Arab Spring's Legacy:  Islamist Gang Terror  #...  NotHate
freq                                                    4   116144

=== Most common words: 

nigga 69411
like 10656
cunt 10410
ass 5956
dyke 5946
twat 5307
got 5100
i’m 5051
fuck 4940
get 4612


                                                     text target
count                                               16200  16200
unique                                              16200      3
top     @truaemusic The followers of the religion give...   none
freq                                                    1  11114

=== Most common words: 

#mkr 4181
rt 3302
i'm 1259
like 920
kat 809
women 723
sexist 716
&amp; 615
get 612
people 572


                                                     text   target
count                                                6667     6667
unique                                               6667        4
top     @TVBachelor @NewJerzeyBoy what about the femin...  neither
freq                                                    1     5729

=== Most common words: 

rt 793
i'm 556
feminazi 425
like 354
#mkr 326
people 296
get 263
- 210
i've 202
one 197


                                                     text  target
count                                               18369   18369
unique                                              18305       4
top     @onedirection Favorite Duo or Group Pop/Rock a...  female
freq                                                    4    6581

=== Most common words: 

- 1023
i'm 972
get 955
like 952
one 709
new 659
love 641
go 531
_��� 522
people 494


            target
count  29499.00000
mean       0.06807
std        0.25187
min        0.00000
25%        0.00000
50%        0.00000
75%        0.00000
max        1.00000

=== Most common words: 

@user 16920
&amp; 1475
day 1400
#love 1317
happy 1280
- 1114
i'm 941
new 891
. 865
like 864


## Unified dataset

In [17]:
# Order in which the datasets are stacked. It is spelled out explicitly because it
# differs from the key order of `dataset` ('waseem' comes before 'waseem_hoovy' here)
# and the later drop_duplicates(['text'], keep='first') keeps whichever row comes first.
# NOTE: a 'silva' dataset was once labelled here as well; it is not loaded in this notebook.
UNIFIED_DATASET_ORDER = [
    'conan',
    'mlma',
    'mmhs150k',
    'waseem',
    'waseem_hoovy',
    'data_society',
    'twitter_kaggle',
]

# Preserving dataset name: reuse the labels from DATASET_SHORTNAMES instead of
# repeating the literals, and rebind via assign() rather than writing a column
# into a frame that was derived from another frame.
for key in UNIFIED_DATASET_ORDER:
    dataset[key] = dataset[key].assign(dataset_name=DATASET_SHORTNAMES[key])

# Combining datasets into one
df_unified = pd.concat([dataset[key] for key in UNIFIED_DATASET_ORDER], ignore_index=True)

# Preserving original target category
df_unified['original_target'] = df_unified['target']

# (unified category, original labels) pairs, listed in the order in which the
# original labels should appear as keys of TRANSFORM_DICT. "deleted" marks labels
# whose rows are dropped from the unified dataset further down.
_TARGET_GROUPS = [
    # disability
    ("disability", ["DISABLED", "disability"]),
    # other (deleted)
    ("deleted", ["Homophobe", "OtherHate", "other", "neither"]),
    # religion
    ("religion", ["JEWS", "MUSLIMS", "Religion", "religion"]),
    # sexual_orientation
    ("sexual_orientation", ["LGBT+", "sexual_orientation"]),
    # gender
    ("gender", ["Sexist", "WOMEN", "gender", "sexism"]),
    # origin
    ("origin", ["MIGRANTS", "POC", "Racist", "both", "origin", "racism"]),
    # non hate
    ("non hate", ["Non Hate"]),
    ("deleted", ["NotHate"]),  # (deleted)
    ("non hate", ["none"]),
    # non hate (true)
    ("non hate", ["brand", "female", "male", "unknown", "", 0]),
    ("deleted", [1]),  # (deleted - sexism/racism)
]

# Flat lookup: original label -> unified category
TRANSFORM_DICT = {
    original: unified
    for unified, originals in _TARGET_GROUPS
    for original in originals
}

# Remap original target categories to new ones
df_unified = df_unified.replace({'target': TRANSFORM_DICT})

# replace() leaves any label without a TRANSFORM_DICT entry unchanged, so report
# such labels here instead of letting them slip through unnoticed
unmapped_targets = set(df_unified['target'].unique()) - set(TRANSFORM_DICT.values())
if unmapped_targets: print(f'Unmapped target categories: {sorted(unmapped_targets, key=str)}')

df_unified.describe()

Unnamed: 0,text,target,dataset_name,original_target
count,229115,229115,229115,229115
unique,224947,7,7,30
top,Arab Spring's Legacy: Islamist Gang Terror #...,deleted,Gomez et al. (2019),NotHate
freq,4,138006,149012,116144


In [18]:
# Mapping: number of rows for every (dataset, unified target, original target) combination
grouping_columns = ['dataset_name', 'target', 'original_target']
df_unified_grouped = df_unified.groupby(grouping_columns).count()
df_unified_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
dataset_name,target,original_target,Unnamed: 3_level_1
Analytics Vidhya (2019),deleted,1,2008
Analytics Vidhya (2019),non hate,0,27491
Fanton et al. (2021),deleted,other,181
Fanton et al. (2021),disability,DISABLED,175
Fanton et al. (2021),gender,WOMEN,560
Fanton et al. (2021),origin,MIGRANTS,637
Fanton et al. (2021),origin,POC,301
Fanton et al. (2021),religion,JEWS,418
Fanton et al. (2021),religion,MUSLIMS,984
Fanton et al. (2021),sexual_orientation,LGBT+,465


In [19]:
# One row per (dataset_name, target, original_target); the row count ends up in 'text_Count'
df_unified_grouped_table = df_unified_grouped.add_suffix('_Count').reset_index()

# Sankey nodes are numbered in three consecutive ranges:
#   [dataset names] [one original-target node per table row] [unified targets]
d_name = {name: idx for idx, name in enumerate(sorted(set(df_unified_grouped_table['dataset_name'])))}
df_unified_grouped_table['dn'] = [d_name[key] for key in df_unified_grouped_table['dataset_name']]

# Original-target nodes start right after the dataset nodes, one node per row
first_otn = len(d_name)
row_count = len(df_unified_grouped_table)
df_unified_grouped_table['otn'] = list(range(first_otn, first_otn + row_count))

# Unified-target nodes start right after the last original-target node
first_t = first_otn + row_count
d_target = {name: idx + first_t
            for idx, name in enumerate(sorted(set(df_unified_grouped_table['target'])))}
df_unified_grouped_table['t'] = [d_target[key] for key in df_unified_grouped_table['target']]

d_colors = {
    'deleted': '#EBEBEB',
    'disability': '#08A4B1',
    'gender': '#F5CBDD',
    'non hate': '#CAE5C3',
    'origin': '#FFB770',
    'other': '#C3C7C9',
    'religion': '#A5CDE8',
    'sexual_orientation': '#EF517F'
}

# Node labels in node-index order (a plain list; no trailing comma, which would
# silently wrap the list in a 1-tuple)
sankey_labels = [*d_name.keys(), *df_unified_grouped_table['original_target'], *d_target.keys()]
# Two link layers: dataset -> original target, then original target -> unified target
sankey_source = [*df_unified_grouped_table.dn, *df_unified_grouped_table.otn]
sankey_target = [*df_unified_grouped_table.otn, *df_unified_grouped_table.t]

# Links are colored by the unified target of their row
df_unified_grouped_table['colors'] = [d_colors[key] for key in df_unified_grouped_table['target']]

# Per-node color: original labels take the color of the category they map to,
# unified categories take their own color, everything else (dataset names) is grey.
# NOTE(review): sankey_colors is not passed to the figure below, which uses one
# fixed node color; confirm whether per-node colors were intended.
sankey_colors = []
for key in sankey_labels:
    if key in TRANSFORM_DICT:
        sankey_colors.append(d_colors[TRANSFORM_DICT[key]])
    elif key in d_colors:
        sankey_colors.append(d_colors[key])
    else:
        sankey_colors.append("grey")

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      label =  sankey_labels,
      color = "#5985D0"
    ),
    link = dict(
      source =  sankey_source,
      target =  sankey_target,
      # Each row's count is used for both of its links
      value =  [*df_unified_grouped_table['text_Count'], *df_unified_grouped_table['text_Count']],
      color =  [*df_unified_grouped_table['colors'], *df_unified_grouped_table['colors']]
))])

fig.update_layout(title_text="Unified dataset category transformation",
                  font_size=15, height=1000)
fig.show()

In [20]:
# Two-layer diagram: every dataset flows straight into the unified targets.
# Node indices: dataset names first, unified targets right after them.
dataset_names_sorted = sorted(set(df_unified_grouped_table['dataset_name']))
d_name = dict(zip(dataset_names_sorted, range(len(dataset_names_sorted))))
df_unified_grouped_table['dn'] = [d_name[name] for name in df_unified_grouped_table['dataset_name']]

target_offset = max(df_unified_grouped_table['dn']) + 1
targets_sorted = sorted(set(df_unified_grouped_table['target']))
d_target = {target: position + target_offset for position, target in enumerate(targets_sorted)}
df_unified_grouped_table['t'] = [d_target[target] for target in df_unified_grouped_table['target']]

# Links are colored by the unified target they end in
df_unified_grouped_table['colors'] = [d_colors[target] for target in df_unified_grouped_table['target']]

sankey_nodes = dict(
    pad=30,
    thickness=20,
    label=[*d_name.keys(), *d_target.keys()],
    color="#5985D0",
)
sankey_links = dict(
    source=df_unified_grouped_table.dn,
    target=df_unified_grouped_table.t,
    value=df_unified_grouped_table['text_Count'],
    color=df_unified_grouped_table['colors'],
)
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# No layout overrides are applied to this figure; the title/font/height call
# stays disabled:
# fig.update_layout(title_text="Unified dataset category transformation",
#                   font_size=15, height=800)
fig.show()

In [21]:
# Dropping every row whose unified target is 'deleted', i.e. the original labels
# 'neither', 'Homophobe', 'OtherHate', 'other', 'NotHate' and 1
is_kept = df_unified['target'].ne('deleted')
df_unified = df_unified.loc[is_kept]

Dropping duplicate records with the same text (keeping the first occurrence) after combining the datasets into one.

In [22]:
# Rows whose text already occurred in an earlier row of the unified frame
is_repeated_text = df_unified.duplicated(subset=['text'], keep='first')

# Report how many repeated rows each dataset contributes
duplicates = df_unified[is_repeated_text].sort_values('text')
dup_groupby = duplicates.groupby(['dataset_name'])['text'].count()
print(dup_groupby)

# Keep only the first occurrence of every text
df_unified = df_unified[~is_repeated_text]

dataset_name
Fanton et al. (2021)        1
Gomez et al. (2019)         1
Kaggle (2016)              64
Waseem (2016)               1
Waseem and Hovy (2016)    171
Name: text, dtype: int64


## Final unified dataset

In [23]:
# Number of texts per unified target category in the final dataset
df_groupby = df_unified.groupby(['target'])['text'].count()
labels, values = df_groupby.index, df_groupby.values

# Pie chart of the category distribution
category_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[category_pie])
fig.show()

df_unified.describe()

Unnamed: 0,text,target,dataset_name,original_target
count,90871,90871,90871,90871
unique,90871,6,7,24
top,‘Juice (Jews) are worse that nukes.’ Caption t...,non hate,Analytics Vidhya (2019),0
freq,1,56796,27491,27491


In [24]:
# Write the final unified dataset to disk without the row index
unified_dataset_path = '../datasets/unified_dataset.csv'
df_unified.to_csv(unified_dataset_path, index=False)