# Data Analysis of Wikimedia Personal Attacks

In [1]:
# Wikimedia Toxicity Personal Attacks Multilabel Data Prep (including rev_id).
import sys
import os
from pathlib import Path
import getopt
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import msgpack
import re
import csv
from gensim.models import Word2Vec
from io import BytesIO
from tensorflow.python.lib.io import file_io

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Notebook configuration: where the data lives, where results go, and the
# vote threshold used for labelling.

# Input directory. The empty default makes every data path relative to the
# current working directory.
data_dir = ''
data_path = Path(data_dir)

# Directory that receives generated output.
output_dir = Path('output')

# Default number of votes needed to decide a label, based on the annotations
# of the ten workers who annotated the dataset.
min_num_votes = 6

In [3]:
# Locate the two tab-separated source files inside the data directory ...
attacks_comments_path = data_path / 'attack_annotated_comments.tsv'
attacks_labels_path = data_path / 'attack_annotations.tsv'

# ... and load each one into its own dataframe (first row holds the column names).
attacks_comments_df = pd.read_csv(attacks_comments_path, header=0, sep='\t')
attacks_labels_df = pd.read_csv(attacks_labels_path, header=0, sep='\t')

In [4]:
# Preview the first rows of the comments dataframe to check the columns loaded as expected.
attacks_comments_df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [5]:
# Number of rows in the comments dataframe.
len(attacks_comments_df)

115864

In [6]:
# Join the comments to their annotations on the shared rev_id key.
# A comment annotated by several workers appears once per annotation row.
attacks_merged = attacks_comments_df.merge(attacks_labels_df, on='rev_id')
attacks_merged.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1362,0.0,0.0,0.0,0.0,0.0
1,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,2408,0.0,0.0,0.0,0.0,0.0
2,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1493,0.0,0.0,0.0,0.0,0.0
3,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,1439,0.0,0.0,0.0,0.0,0.0
4,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,170,0.0,0.0,0.0,0.0,0.0


In [7]:
# Group by rev_id and sum each label's votes over all annotation rows.
#
# The label columns are selected explicitly before summing. Relying on
# groupby().sum() to discard the non-numeric columns (comment, ns, sample,
# split) only works on older pandas; from pandas 2.0 on, numeric_only defaults
# to False, so those string columns would be concatenated per group and kept
# in the result. Selecting the columns up front also makes the later drop of
# year, logged_in and worker_id unnecessary.
attacks_merged_summed = attacks_merged.groupby('rev_id')[
    ['quoting_attack', 'recipient_attack', 'third_party_attack',
     'other_attack', 'attack']
].sum()
attacks_merged_summed.head()

Unnamed: 0_level_0,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37675,0.0,0.0,0.0,0.0,0.0
44816,0.0,0.0,0.0,0.0,0.0
49851,0.0,0.0,0.0,0.0,0.0
89320,0.0,2.0,0.0,2.0,4.0
93890,0.0,0.0,0.0,0.0,0.0


In [8]:
# The summed vote counts come out as floats; convert every label column to an
# integer dtype to avoid later processing errors in TF/Keras.
for label in ('quoting_attack', 'recipient_attack', 'third_party_attack',
              'other_attack', 'attack'):
    attacks_merged_summed[label] = pd.to_numeric(attacks_merged_summed[label],
                                                 downcast='integer')
attacks_merged_summed.head()

Unnamed: 0_level_0,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37675,0,0,0,0,0
44816,0,0,0,0,0
49851,0,0,0,0,0
89320,0,2,0,2,4
93890,0,0,0,0,0


In [29]:
# Distribution of labels over a range of vote thresholds.
# A comment counts towards a label when its summed votes are strictly greater
# than the threshold. Threshold 5 is the value used for the binary version of
# this dataset; it leaves very few examples of the less-frequent labels, but it
# is retained for a better comparison between the models.
# NOTE(review): "> 5" presumably corresponds to min_num_votes = 6 - confirm.
thresholds = range(1, 11)

# Rows are collected in a plain list and turned into a dataframe once, after
# the loop: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row copies it on every iteration.
threshold_rows = []
for threshold in thresholds:
    print('Threshhold = ', threshold)
    
    num_quoting_attack = len(attacks_merged_summed[attacks_merged_summed['quoting_attack'] > threshold])
    num_recipient_attack = len(attacks_merged_summed[attacks_merged_summed['recipient_attack'] > threshold])
    num_third_party_attack = len(attacks_merged_summed[attacks_merged_summed['third_party_attack'] > threshold])
    num_other_attack = len(attacks_merged_summed[attacks_merged_summed['other_attack'] > threshold])
    num_attack = len(attacks_merged_summed[attacks_merged_summed['attack'] > threshold])
    
    print('Num quoting_attack:\t', num_quoting_attack)
    print('Num recipient_attack:\t', num_recipient_attack)
    print('Num third_party_attack:\t', num_third_party_attack)
    print('Num other_attack:\t', num_other_attack)
    print('Num attack:\t\t', num_attack)
    print()
    
    row = [num_quoting_attack, num_recipient_attack, 
           num_third_party_attack, num_other_attack,
           num_attack]
    
    threshold_rows.append(row)

# One row per threshold (in loop order), one column per label.
thresholds_df = pd.DataFrame(threshold_rows,
                             columns=['quoting_attack', 'recipient_attack',
                                      'third_party_attack', 'other_attack',
                                      'attack'])

Threshhold =  1
Num quoting_attack:	 937
Num recipient_attack:	 26691
Num third_party_attack:	 8931
Num other_attack:	 9291
Num attack:		 37755

Threshhold =  2
Num quoting_attack:	 154
Num recipient_attack:	 18700
Num third_party_attack:	 4657
Num other_attack:	 3341
Num attack:		 27608

Threshhold =  3
Num quoting_attack:	 49
Num recipient_attack:	 14306
Num third_party_attack:	 2692
Num other_attack:	 1370
Num attack:		 21465

Threshhold =  4
Num quoting_attack:	 17
Num recipient_attack:	 11296
Num third_party_attack:	 1651
Num other_attack:	 599
Num attack:		 17332

Threshhold =  5
Num quoting_attack:	 7
Num recipient_attack:	 9053
Num third_party_attack:	 1042
Num other_attack:	 285
Num attack:		 14205

Threshhold =  6
Num quoting_attack:	 4
Num recipient_attack:	 7096
Num third_party_attack:	 635
Num other_attack:	 152
Num attack:		 11559

Threshhold =  7
Num quoting_attack:	 2
Num recipient_attack:	 5288
Num third_party_attack:	 452
Num other_attack:	 96
Num attack:		 9120

Thre