# Wikimedia Toxicity Datasets: Exploring the Data

In [40]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer

## Get the Data
Wulczyn, Ellery; Thain, Nithum; Dixon, Lucas (2016): Wikipedia Detox. figshare. [doi.org/10.6084/m9.figshare.4054689]

In [6]:
# Personal attacks corpora.
attacks_annoted_comms_df = pd.read_csv('../../data/Wikimedia-Toxicity-Personal-Attacks/attack_annotated_comments.tsv', 
                                  sep='\t', header=0)
attacks_annots_df = pd.read_csv('../../data/Wikimedia-Toxicity-Personal-Attacks/attack_annotations.tsv', 
                                  sep='\t', header=0)

# Aggresssion corpora.
aggression_annoted_comms_df = pd.read_csv('../../data/Wikimedia-Toxicity-Aggression/aggression_annotated_comments.tsv', 
                                  sep='\t', header=0)
aggression_annots_df = pd.read_csv('../../data/Wikimedia-Toxicity-Aggression/aggression_annotations.tsv', 
                                  sep='\t', header=0)

# Toxicity corpora.
toxicity__annoted_comms_df = pd.read_csv('../../data/Wikimedia-Toxicity-Toxicity/toxicity_annotated_comments.tsv', 
                                  sep='\t', header=0)
toxicity_annots_df = pd.read_csv('../../data/Wikimedia-Toxicity-Toxicity/toxicity_annotations.tsv', 
                                  sep='\t', header=0)

### Personal Attacks Corpora

In [7]:
# attacks_annoted_comms_df contains the comment text and metadata for comments 
# with attack/aggression/toxicity labels generated by crowd-workers.  
attacks_annoted_comms_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [8]:
# Personal attack labels from crowd-workers for each comment in 
# attack_annotated_comments.tsv. It can be joined with attack_annotated_comments.tsv 
# on rev_id. 
attacks_annots_df.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0


In [9]:
# 10,273 records labeled quoting_attack.
quoting_attacks = attacks_annots_df['quoting_attack'] == 1.0
attacks_annots_df[quoting_attacks].head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
229,290598,979,1.0,0.0,0.0,0.0,1.0
411,398923,1622,1.0,0.0,0.0,0.0,1.0
502,554005,306,1.0,0.0,0.0,0.0,1.0
515,569192,3784,1.0,1.0,1.0,0.0,1.0
852,849920,605,1.0,0.0,0.0,0.0,1.0


In [10]:
# 152607 records labeled recipient_attack.
recipient_attack = attacks_annots_df['recipient_attack'] == 1.0
attacks_annots_df[recipient_attack].head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
33,89320,3341,0.0,1.0,0.0,0.0,1.0
35,89320,3338,0.0,1.0,0.0,0.0,1.0
180,249432,3593,0.0,1.0,0.0,0.0,1.0
181,249432,2145,0.0,1.0,0.0,0.0,1.0
233,290598,2206,0.0,1.0,0.0,0.0,1.0


In [11]:
# 44,571 records labeled third_party_attack
third_party_attacks = attacks_annots_df['third_party_attack'] == 1.0
attacks_annots_df[third_party_attacks].head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
130,155243,449,0.0,0.0,1.0,0.0,1.0
231,290598,1082,0.0,0.0,1.0,0.0,1.0
515,569192,3784,1.0,1.0,1.0,0.0,1.0
590,622095,353,0.0,0.0,1.0,0.0,1.0
596,622095,1123,0.0,0.0,1.0,0.0,1.0


In [12]:
# 44,121 records labeled other_attack
other_attacks = attacks_annots_df['other_attack'] == 1.0
attacks_annots_df[other_attacks].head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
36,89320,2101,0.0,0.0,0.0,1.0,1.0
37,89320,673,0.0,0.0,0.0,1.0,1.0
127,155243,214,0.0,0.0,0.0,1.0,1.0
149,192579,2073,0.0,0.0,0.0,1.0,1.0
353,375422,269,0.0,0.0,0.0,1.0,1.0


In [13]:
# 227936 total records with an attack.
attacks = attacks_annots_df['attack'] == 1.0
attacks_annots_df[attacks].count()

rev_id                227936
worker_id             227936
quoting_attack        227936
recipient_attack      227936
third_party_attack    227936
other_attack          227936
attack                227936
dtype: int64

In [14]:
# 1137281 total records without an attack.
not_attacks = attacks_annots_df['attack'] == 0.0
attacks_annots_df[not_attacks].count()

rev_id                1137281
worker_id             1137281
quoting_attack        1137281
recipient_attack      1137281
third_party_attack    1137281
other_attack          1137281
attack                1137281
dtype: int64

In [15]:
# Balance of set:
attacks_annots_df[attacks].count() / attacks_annots_df[not_attacks].count()

rev_id                0.200422
worker_id             0.200422
quoting_attack        0.200422
recipient_attack      0.200422
third_party_attack    0.200422
other_attack          0.200422
attack                0.200422
dtype: float64

### Aggression Corpora

In [16]:
aggression_annoted_comms_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [17]:
aggression_annots_df.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score
0,37675,1362,1.0,-1.0
1,37675,2408,0.0,1.0
2,37675,1493,0.0,0.0
3,37675,1439,0.0,0.0
4,37675,170,0.0,0.0


In [18]:
# The range of values for aggression_score is -3.0 to 3.0 by 1.0.
aggression_annots_df.aggression_score.unique()

array([-1.,  1.,  0.,  2., -2.,  3., -3.])

### Toxicity Corpora

In [19]:
toxicity__annoted_comms_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,2232.0,This:NEWLINE_TOKEN:One can make an analogy in ...,2002,True,article,random,train
1,4216.0,`NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...,2002,True,user,random,train
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test
3,26547.0,`This is such a fun entry. DevotchkaNEWLINE_...,2002,True,article,random,train
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test


In [20]:
toxicity_annots_df.head()

Unnamed: 0,rev_id,worker_id,toxicity,toxicity_score
0,2232.0,723,0,0.0
1,2232.0,4000,0,0.0
2,2232.0,3989,0,1.0
3,2232.0,3341,0,0.0
4,2232.0,1574,0,1.0


In [21]:
# The range of values for toxicity_score is -2 to 2 by 1.
toxicity_annots_df.toxicity_score.unique()

array([ 0.,  1., -1.,  2., -2.])

# Prepare Attacks Data

In [22]:
# Merge the data frames on the rev_id.
attacks_merged = pd.merge(attacks_annoted_comms_df, attacks_annots_df, on='rev_id')

In [23]:
attacks_merged.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1362,0.0,0.0,0.0,0.0,0.0
1,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,2408,0.0,0.0,0.0,0.0,0.0
2,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1493,0.0,0.0,0.0,0.0,0.0
3,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1439,0.0,0.0,0.0,0.0,0.0
4,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,170,0.0,0.0,0.0,0.0,0.0


In [24]:
attacks_merged[attacks_merged['rev_id'] == 37675]

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1362,0.0,0.0,0.0,0.0,0.0
1,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,2408,0.0,0.0,0.0,0.0,0.0
2,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1493,0.0,0.0,0.0,0.0,0.0
3,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1439,0.0,0.0,0.0,0.0,0.0
4,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,170,0.0,0.0,0.0,0.0,0.0
5,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,176,0.0,0.0,0.0,0.0,0.0
6,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,481,0.0,0.0,0.0,0.0,0.0
7,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,487,0.0,0.0,0.0,0.0,0.0
8,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,578,0.0,0.0,0.0,0.0,0.0
9,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1127,0.0,0.0,0.0,0.0,0.0


In [25]:
# Each comment has been annotated 10x by 10 workers.
attacks_merged.worker_id.unique()

array([1362, 2408, 1493, ..., 4049, 1178, 4050])

In [26]:
# Group by rev_id and sum attack column.
# Since the presence of an attack is a 1, the annotations
# by the workers can be treated as votes, so a sum of the
# attack column greater than 5 is more than half of the 
# workers thought the comment contained a personal attack.
attacks_merged_summed = attacks_merged.groupby('rev_id').sum()

In [27]:
# Build set of rev_ids that contain personal attacks as labels.
attacks = attacks_merged_summed.loc[attacks_merged_summed['attack'] > 5].copy()
attacks.reset_index(level=0, inplace=True)
attacks['attack'] = 1
attacks.drop(['year', 'logged_in', 'worker_id', 'quoting_attack', 'recipient_attack', 
             'third_party_attack', 'other_attack'], axis=1, inplace=True)

In [28]:
# Build set of rev_ids that do not contain attacks.
no_attacks = attacks_merged_summed.loc[attacks_merged_summed['attack'] <= 5].copy()
no_attacks.reset_index(level=0, inplace=True)
no_attacks['attack'] = 0
no_attacks.drop(['year', 'logged_in', 'worker_id', 'quoting_attack', 'recipient_attack', 
             'third_party_attack', 'other_attack'], axis=1, inplace=True)

In [29]:
# Combine the the two sets and sort.
labels = attacks.append(no_attacks)
labels.sort_values(by=['rev_id'], inplace=True)
labels.reset_index(level=0, drop=True, inplace=True)

In [30]:
# Create features.
# groupby the rev_id, get only first of each group.
features = attacks_merged.groupby('rev_id').first().copy()

# Reset index, saving rev_id as column.
features.reset_index(level=0, inplace=True)

# Drop everything except for 'rev_id' and 'comment'.
features.drop(['year', 'logged_in', 'ns', 'sample', 'split', 'worker_id',
       'quoting_attack', 'recipient_attack', 'third_party_attack',
       'other_attack', 'attack'], axis=1, inplace=True)

features.head()

Unnamed: 0,rev_id,comment
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s..."
3,89320,"Next, maybe you could work on being less cond..."
4,93890,This page will need disambiguation.


In [31]:
# Merge with labels for complete set labeled data.
features = pd.merge(features, labels, on='rev_id').copy()
features.head()

Unnamed: 0,rev_id,comment,attack
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,0
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,0
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",0
3,89320,"Next, maybe you could work on being less cond...",0
4,93890,This page will need disambiguation.,0


In [32]:
# Number of comments with and without attacks.
num_attacks = len(features[features['attack'] == 1].index)
num_not_attacks = len(features[features['attack'] == 0].index)

print('Num of comments containing an attack: ', num_attacks)
print('Num of comments not containing an attack: ', num_not_attacks)

Num of comments containing an attack:  14205
Num of comments not containing an attack:  101659


In [33]:
# Dataset is pretty unbalanced, probably have to take steps to mitigate 
# problems stemming from an unbalanced set of data.
print('Balance of labels:', (num_attacks / num_not_attacks)*100, 'percent labeled as attack.')

Balance of labels: 13.973184863120824 percent labeled as attack.


In [34]:
# Find length of longest comment (in chars).
field_length = features.comment.map(len).max()
print(field_length)

18460


In [51]:
# TODO Strip out newline tokens, etc., before tokenizing.

# Find length of longest comment in words.
tokenized_comments = pd.DataFrame(features.comment.map(kpt.text_to_word_sequence))
tokenized_comments.head()

Unnamed: 0,comment
0,"[newline, tokenthis, is, not, creative, those,..."
1,"[newline, tokennewline, token, the, term, stan..."
2,"[newline, tokennewline, tokentrue, or, false, ..."
3,"[next, maybe, you, could, work, on, being, les..."
4,"[this, page, will, need, disambiguation]"


In [55]:
longest_comment_in_words = tokenized_comments.comment.map(len).max()
print('Longest comment (number of words):', longest_comment_in_words)

Longest comment (number of words): 2835
