In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class
import pandas as pd



#  Clean Raw Annotations

### Load raw annotations

In [3]:
"""
# v4_annotated
user_blocked = [
                'annotated_onion_layer_5_rows_0_to_5000_raters_20',     
                'annotated_onion_layer_5_rows_0_to_10000',             
                'annotated_onion_layer_5_rows_0_to_10000_raters_3',          
                'annotated_onion_layer_5_rows_10000_to_50526_raters_10',
                'annotated_onion_layer_10_rows_0_to_1000',              
                'annotated_onion_layer_20_rows_0_to_1000',              
                'annotated_onion_layer_30_rows_0_to_1000',              
]

user_random = [
            'annotated_random_data_rows_0_to_5000_raters_20',
            'annotated_random_data_rows_5000_to_10000',
            'annotated_random_data_rows_5000_to_10000_raters_3',
            'annotated_random_data_rows_10000_to_20000_raters_10',
]

article_blocked = ['article_onion_layer_5_all_rows_raters_10',]
article_random = ['article_random_data_all_rows_raters_10',]
"""

# Base names (without the '.csv' extension) of the raw annotation exports.
# `files` nests them by `ns` ('user' / 'article') and then by `sample`
# ('blocked' / 'random').
user_blocked = [
    'user_blocked',
    'user_blocked_2',
    'user_blocked_3',
    'user_blocked_4',
    'user_blocked_layer_10',
    'user_blocked_layer_20',
    'user_blocked_layer_30',
]

user_random = [
    'user_random',
    'user_random_2',
    'user_random_3',
    'user_random_4',
    'user_random_extra_baselines',
]

article_blocked = [
    'article_blocked',
    'article_blocked_layer_5_extra_baselines',
]

article_random = [
    'article_random',
    'article_random_extra_baselines',
]

files = dict(
    user=dict(blocked=user_blocked, random=user_random),
    article=dict(blocked=article_blocked, random=article_random),
)


# Load every raw CSV listed in `files` and tag each row with where it came from:
#   src    - base name of the CSV file
#   ns     - top-level key of `files` ('user' or 'article')
#   sample - second-level key of `files` ('blocked' or 'random')
# None of the loop variables rebinds `files`, so the mapping is still the
# nested dict after this code has run, and `df` is only ever the combined frame.
dfs = []

for ns, samples in files.items():
    for sample, names in samples.items():
        for name in names:
            part = pd.read_csv('../../data/annotations/raw/%s/%s.csv' % (ns, name))
            part['src'] = name
            part['ns'] = ns
            part['sample'] = sample
            dfs.append(part)

# pd.concat is called without ignore_index, so each file's own row labels are kept.
df = pd.concat(dfs)
print('# annotations: ', df.shape[0])

# annotations:  1524236


### Tidy is_harassment_or_attack column

In [4]:
# tidy_labels comes from the local `baselines` module and is not visible here.
# NOTE(review): judging by the column listing further down, it presumably adds the
# per-label indicator columns and `attack` -- confirm against baselines.tidy_labels.
df = tidy_labels(df)

### Remap aggression score

In [5]:
# Derive `aggression` by passing every raw aggression_score value through
# map_aggression_score_to_2class (imported from the local `baselines` module).
df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)

### Remove answers to test questions

In [6]:
# Keep only rows whose `_golden` flag is False, then report the remaining row count.
# NOTE(review): `_golden` presumably marks answers to test questions -- confirm.
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])

# annotations:  1524236


### Remove annotations where revision could not be read

In [7]:
# remove all annotations for a revision where more than 50% of the annotators for that revision could not read the comment
# NOTE(review): the 50% rule lives inside baselines.remove_na and is not visible here -- confirm
df = remove_na(df)
print('# annotations: ', df.shape[0])

# annotations:  1515797


In [8]:
# remove all annotations where the annotator could not read the comment
# (i.e. keep only rows whose `na` flag is False)
df = df.query('na==False')
print('# annotations: ', df.shape[0])

# annotations:  1501362


### Examine aggression_score or is_harassment_or_attack input

In [9]:
# Distribution of the raw aggression_score values; dropna=False also counts missing scores.
df['aggression_score'].value_counts(dropna=False)

 0.0    1081946
-1.0     144082
 1.0      92396
-3.0      74164
-2.0      66157
 2.0      29830
 3.0      11896
NaN         891
Name: aggression_score, dtype: int64

In [10]:
# Distribution of the raw is_harassment_or_attack strings; dropna=False also counts missing values.
df['is_harassment_or_attack'].value_counts(dropna=False)

not_attack                                            1213638
recipient                                              150883
other                                                   40456
third_party                                             33542
recipient\nthird_party                                  10063
other\nnot_attack                                        9290
recipient\nnot_attack                                    6964
quoting                                                  6585
recipient\nthird_party\nquoting\nother\nnot_attack       5980
recipient\nother                                         4404
recipient\nthird_party\nquoting\nother                   2513
recipient\nthird_party\nnot_attack                       2497
third_party\nother                                       1900
recipient\nthird_party\nother                            1861
quoting\nnot_attack                                      1624
recipient\nthird_party\nquoting                          1611
recipien

### Drop NAs in aggression_score or is_harassment_or_attack input

In [11]:
# Drop rows missing either label input, then report the remaining row count.
df = df.dropna(subset = ['aggression_score', 'is_harassment_or_attack'])
print('# annotations: ', df.shape[0])

# annotations:  1500430


### Remove ambivalent is_harassment_or_attack annotations

An annotation is ambivalent if it was labeled as both an attack and not an attack.

In [12]:
# remove all annotations from users who are ambivalent in 10% or more of revisions
# we consider these users unreliable
def ambivalent(s):
    """Return True when label string `s` contains 'not_attack' but is not exactly 'not_attack'."""
    mentions_not_attack = 'not_attack' in s
    return mentions_not_attack and s != 'not_attack'
# Flag each annotation, then compute per-worker share of flagged annotations.
df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
ambivalence_rate = df.groupby('_worker_id', as_index = False)['ambivalent'].mean()

# Workers below the 10% threshold are kept; the inner merge drops everyone else's rows.
non_ambivalent_workers = ambivalence_rate.query('ambivalent < 0.1')
df = df.merge(
    non_ambivalent_workers[['_worker_id']],
    how = 'inner',
    on = '_worker_id',
)
print('# annotations: ', df.shape[0])

# annotations:  1439009


In [13]:
# remove all other ambivalent annotations
# (i.e. keep only rows whose `ambivalent` flag is False)
df = df.query('ambivalent==False')
print('# annotations: ', df.shape[0])

# annotations:  1434118


### Make sure that each revision was annotated by the same worker only once

In [14]:
# How many (rev_id, _worker_id) pairs occur once, twice, ... in the data.
df.groupby(['rev_id', '_worker_id']).size().value_counts()

1    1431360
2       1379
dtype: int64

In [15]:
# Keep one row per (rev_id, _worker_id) pair; drop_duplicates retains the first occurrence by default.
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])

# annotations:  1432739


### Filter out annotations for revisions with duplicated diff content

In [16]:
# One row per revision (the first annotation row seen for each rev_id);
# the printed value is the number of distinct revisions.
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])

124591


In [17]:
# Among those revisions, one row per distinct clean_diff text (first occurrence kept);
# the printed value is the number of distinct clean_diff values.
u_comments = comments.drop_duplicates(subset = ['clean_diff'])
print(u_comments.shape[0])

121110


In [18]:
# Peek at revisions whose clean_diff repeats that of an earlier row in `comments`.
comments[comments.duplicated(subset = ['clean_diff'])].head(5)

Unnamed: 0,_aggression_score,_channel,_city,_country,_created_at,_golden,_id,_ip,_is_harassment_or_attack,_missed,...,user_id,user_text,not_attack,other,quoting,recipient,third_party,attack,aggression,ambivalent
1915,,elite,Ranaghat,IND,5/25/2016 17:25:12,False,1999559134,202.142.114.58,,,...,20239661.0,Blobr186,1.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3714,,tremorgames,Paris,FRA,5/8/2016 14:10:51,False,1979134762,82.234.241.249,,,...,,194.144.111.210,1.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3884,,tremorgames,Paris,FRA,4/15/2016 13:57:26,False,1960392600,82.234.241.249,,,...,7116643.0,Artimes623,0.0,0.0,0.0,1.0,0.0,1.0,1.0,False
4220,,neodev,Maracay,VEN,6/13/2016 19:32:33,False,2020027603,186.92.252.244,,,...,8774126.0,Sirgreene,0.0,0.0,0.0,1.0,0.0,1.0,1.0,False
4407,,neodev,Barquisimeto,VEN,8/5/2016 02:11:27,False,2075857609,186.93.179.64,,,...,,32.172.151.109,1.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [19]:
# Keep only annotations whose rev_id is present in u_comments
# (inner merge on rev_id), then report the remaining row count.
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])

# annotations:  1395949


### Check that labels are not None

In [20]:
# Value distribution of `recipient`; dropna=False would surface any missing labels as NaN.
df['recipient'].value_counts(dropna=False)

0.0    1240803
1.0     155146
Name: recipient, dtype: int64

In [21]:
# Value distribution of `attack`; dropna=False would surface any missing labels as NaN.
df['attack'].value_counts(dropna=False)

0.0    1163898
1.0     232051
Name: attack, dtype: int64

In [22]:
# Value distribution of `aggression`; dropna=False would surface any missing labels as NaN.
df['aggression'].value_counts(dropna=False)

0.0    1141424
1.0     254525
Name: aggression, dtype: int64

### Remove annotations from all revisions that were annotated fewer than 8 times

In [23]:
# Number of annotations per revision, as a two-column frame (rev_id, n).
# rename_axis + reset_index leaves a plain RangeIndex, so `rev_id` exists only as
# a column. Copying the index into a column instead would leave `rev_id` as both
# the index name and a column on pandas >= 2.0 (where value_counts names the
# result index after the source column), which makes a merge on 'rev_id' ambiguous.
counts = df['rev_id'].value_counts().rename_axis('rev_id').reset_index(name='n')

In [24]:
# (rows, columns) of the per-revision count table: one row per distinct rev_id.
counts.shape

(121110, 2)

In [25]:
# The five most common annotations-per-revision values and how many revisions have each.
counts['n'].value_counts().head()

10    55156
9     29478
8      7856
19     6905
20     6197
Name: n, dtype: int64

In [26]:
# Revisions that have at least 8 annotations.
counts_enough = counts.query("n>=8")

In [27]:
# (rows, columns): number of revisions that pass the n >= 8 filter.
counts_enough.shape

(115737, 2)

In [28]:
# Keep only annotations whose rev_id is listed in counts_enough
# (inner merge on rev_id), then report the remaining row count.
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])

# annotations:  1363735


### Discard nuisance columns

In [29]:
# List every column currently in the frame.
df.columns

Index(['_aggression_score', '_channel', '_city', '_country', '_created_at',
       '_golden', '_id', '_ip', '_is_harassment_or_attack', '_missed', '_na',
       '_region', '_started_at', '_tainted', '_trust', '_unit_id',
       '_worker_id', 'aggression_score', 'aggression_score_gold',
       'aggression_score_gold_reason', 'block_actions', 'block_params',
       'block_reasons', 'block_timestamps', 'clean_diff', 'diff',
       'insert_only', 'is_harassment_or_attack',
       'is_harassment_or_attack_gold', 'is_harassment_or_attack_gold_reason',
       'na', 'na_gold', 'na_gold_reason', 'ns', 'orig__golden', 'page_id',
       'page_title', 'rev_comment', 'rev_id', 'rev_timestamp', 'sample', 'src',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression', 'ambivalent'],
      dtype='object')

In [30]:
# Columns retained in the cleaned output, in output order; all others are dropped.
cols = [
    # identifiers and provenance
    'rev_id', '_worker_id', 'ns', 'sample', 'src',
    # revision content and metadata
    'clean_diff', 'diff', 'insert_only', 'page_id',
    'page_title', 'rev_comment', 'rev_timestamp',
    'user_id', 'user_text',
    # labels
    'not_attack', 'other', 'quoting', 'recipient',
    'third_party', 'attack', 'aggression',
]
df = df[cols]

### Summary Stats

In [31]:
# Number of remaining annotations in each (ns, sample) group.
df.groupby(['ns', 'sample']).size()

ns       sample 
article  blocked    346832
         random     232818
user     blocked    533886
         random     250199
dtype: int64

In [32]:
# Write the cleaned annotations as a tab-separated file, without the row index.
df.to_csv('../../data/annotations/clean/annotations.tsv', index=False, sep='\t')

In [33]:
# Read the file back and show its shape as a sanity check on the write.
pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t').shape

(1363735, 21)