In [44]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
from baselines import load_annotations, remove_na, tidy_labels, map_aggression_score_to_3class
import pandas as pd

# Clean Raw Annotations

### Load raw annotations

In [46]:
# Raw annotation exports, listed by file stem (the name without ".csv").
# Each list belongs to one (ns, sample) combination in the `files` mapping below.
user_blocked = [
    'annotated_onion_layer_5_rows_0_to_5000_raters_20',
    'annotated_onion_layer_5_rows_0_to_10000',
    'annotated_onion_layer_5_rows_0_to_10000_raters_3',
    'annotated_onion_layer_5_rows_10000_to_50526_raters_10',
    'annotated_onion_layer_10_rows_0_to_1000',
    'annotated_onion_layer_20_rows_0_to_1000',
    'annotated_onion_layer_30_rows_0_to_1000',
]

user_random = [
    'annotated_random_data_rows_0_to_5000_raters_20',
    'annotated_random_data_rows_5000_to_10000',
    'annotated_random_data_rows_5000_to_10000_raters_3',
    'annotated_random_data_rows_10000_to_20000_raters_10',
]

article_blocked = ['article_onion_layer_5_all_rows_raters_10']
article_random = ['article_random_data_all_rows_raters_10']

# ns -> sample -> list of file stems; the first-level key is also the name of
# the sub-directory the files are read from.
files = {
    'user': {'blocked': user_blocked, 'random': user_random},
    'article': {'blocked': article_blocked, 'random': article_random},
}

# Read every export and tag its rows with provenance columns.
# The loop variables are named so that they never rebind `files` (the mapping
# being iterated) or `df` (the combined frame built after the loop).
dfs = []
for ns, samples in files.items():
    for sample, stems in samples.items():
        for stem in stems:
            raw = pd.read_csv('../../data/annotations/raw/%s/%s.csv' % (ns, stem))
            raw['src'] = stem      # file the annotation came from
            raw['ns'] = ns
            raw['sample'] = sample
            dfs.append(raw)

# Stack all exports into one frame; the per-file row indices are kept as-is.
df = pd.concat(dfs)
print('# annotations: ', df.shape[0])

# annotations:  1406811


### Remove answers to test questions

In [47]:
# Keep only the rows whose `_golden` flag is False, i.e. drop the answers to
# test questions.
df = df[df['_golden'] == False]
print('# annotations: ', len(df))

# annotations:  1406811


### Remove annotations where revision could not be read

In [48]:
# Drop every annotation of a revision for which more than 50% of that
# revision's annotators could not read the comment.
df = remove_na(df)
print('# annotations: ', len(df))

# annotations:  1398433


In [49]:
# count the remaining annotations by their `na` flag (True = the annotator
# could not read the comment); dropna=False also counts missing values.
# Nothing is removed in this cell.
df['na'].value_counts(dropna=False)

False    1384258
True       14175
Name: na, dtype: int64

In [50]:
# Discard the individual annotations whose `na` flag is set.
df = df[df['na'] == False]
print('# annotations: ', len(df))

# annotations:  1384258


### Remove all annotations with invalid aggression_score or is_harassment_or_attack input

In [51]:
# distribution of the raw aggression_score values; dropna=False makes missing
# scores show up as their own NaN row
df['aggression_score'].value_counts(dropna=False)

 0.0    980872
-1.0    138993
 1.0     87440
-3.0     72645
-2.0     64487
 2.0     27926
 3.0     11009
NaN        886
Name: aggression_score, dtype: int64

In [52]:
# distribution of the raw is_harassment_or_attack answers (missing values
# included); an answer may hold several newline-separated labels
df['is_harassment_or_attack'].value_counts(dropna=False)

not_attack                                            1105010
recipient                                              148379
other                                                   38272
third_party                                             32501
recipient\nthird_party                                   9801
other\nnot_attack                                        8978
recipient\nnot_attack                                    6846
quoting                                                  6160
recipient\nthird_party\nquoting\nother\nnot_attack       5850
recipient\nother                                         4353
recipient\nthird_party\nquoting\nother                   2379
recipient\nthird_party\nnot_attack                       2298
third_party\nother                                       1822
recipient\nthird_party\nother                            1801
quoting\nnot_attack                                      1516
recipient\nthird_party\nquoting                          1515
third_pa

In [53]:
# An annotation is only usable when both label columns are present, so drop
# any row that is missing either of them.
df = df.dropna(
    how='any',
    subset=['aggression_score', 'is_harassment_or_attack'],
)
print('# annotations: ', len(df))

# annotations:  1383327


### Tidy is_harassment_or_attack column

In [54]:
# tidy_labels comes from the project's `baselines` module.
# NOTE(review): judging by the column listing displayed later in this
# notebook, it presumably adds the not_attack / other / quoting / recipient /
# third_party / attack columns - confirm against baselines.
df = tidy_labels(df)

### Remap aggression score

In [55]:
# Derive the `aggression` column by passing every aggression_score through
# map_aggression_score_to_3class.
df = df.assign(
    aggression=df['aggression_score'].apply(map_aggression_score_to_3class)
)

### Discard nuisance columns

In [56]:
# list every column currently in the frame before choosing which ones to keep
df.columns

Index(['_aggression_score', '_channel', '_city', '_country', '_created_at',
       '_golden', '_id', '_ip', '_is_harassment_or_attack', '_missed', '_na',
       '_region', '_started_at', '_tainted', '_trust', '_unit_id',
       '_worker_id', 'aggression_score', 'aggression_score_gold',
       'aggression_score_gold_reason', 'block_actions', 'block_params',
       'block_reasons', 'block_timestamps', 'clean_diff', 'diff',
       'insert_only', 'is_harassment_or_attack',
       'is_harassment_or_attack_gold', 'is_harassment_or_attack_gold_reason',
       'na', 'na_gold', 'na_gold_reason', 'ns', 'orig__golden', 'page_id',
       'page_title', 'rev_comment', 'rev_id', 'rev_timestamp', 'sample', 'src',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression'],
      dtype='object')

In [57]:
# Columns kept in the clean data set; everything else is discarded.
cols = [
    # identifiers and provenance
    'rev_id',
    '_worker_id',
    'ns',
    'sample',
    'src',
    # comment text and revision metadata
    'clean_diff',
    'diff',
    'insert_only',
    'page_id',
    'page_title',
    'rev_comment',
    'rev_timestamp',
    'user_id',
    'user_text',
    # labels
    'not_attack',
    'other',
    'quoting',
    'recipient',
    'third_party',
    'attack',
    'aggression',
]

In [58]:
# Restrict the frame to the columns listed in `cols`, in that order.
df = df.loc[:, cols]

### Make sure that each revision was annotated at most once by the same worker

In [59]:
# number of rows per (rev_id, _worker_id) pair, then how often each number
# occurs; any value above 1 means a worker annotated the same revision twice
df.groupby(['rev_id', '_worker_id']).size().value_counts()

1    1381483
2        922
dtype: int64

In [60]:
# Keep only the first annotation of every (rev_id, _worker_id) pair.
df = df[~df.duplicated(subset=['rev_id', '_worker_id'])]
print('# annotations: ', len(df))

# annotations:  1382405


### Filter out annotations for revisions with duplicated diff content

In [61]:
# One row per revision: the first annotation row seen for each rev_id.
comments = df[~df.duplicated(subset=['rev_id'])]

In [62]:
# (number of distinct revisions, number of columns)
comments.shape

(126318, 21)

In [63]:
# Among revisions sharing the same clean_diff text, keep only the first one.
u_comments = comments[~comments.duplicated(subset=['clean_diff'])]

In [64]:
# (number of revisions left after removing repeated clean_diff text, number of columns)
u_comments.shape

(122797, 21)

In [65]:
# Preview a few of the revisions whose clean_diff repeats an earlier revision.
comments.loc[comments.duplicated(subset=['clean_diff'], keep='first')].head(5)

Unnamed: 0,rev_id,_worker_id,ns,sample,src,clean_diff,diff,insert_only,page_id,page_title,...,rev_timestamp,user_id,user_text,not_attack,other,quoting,recipient,third_party,attack,aggression
4900,463400836,36261795,user,blocked,annotated_onion_layer_5_rows_0_to_5000_raters_20,Go back to school. \n\nDo us all a favour and...,NEWLINENEWLINE== Go back to school. ==NEWLINEN...,0,17796754,KorinoChikara,...,2011-12-01T01:56:18Z,,86.182.92.47,0.0,0.0,0.0,1.0,0.0,1.0,0
15480,174209550,33249231,user,blocked,annotated_onion_layer_5_rows_0_to_5000_raters_20,",\nBILLCJ's Mom, she's a big fat, fucking bitc...",",NEWLINEBILLCJ's Mom, she's a big fat, fucking...",0,14456793,FBillCJ,...,2007-11-27T20:41:19Z,5901637.0,FBillCJ,0.0,0.0,0.0,1.0,0.0,1.0,0
17080,416340929,15723827,user,blocked,annotated_onion_layer_5_rows_0_to_5000_raters_20,Please refrain from unmaking unconstructive ed...,NEWLINENEWLINEPlease refrain from unmaking unc...,0,17950293,Bongwarrior,...,2011-02-28T07:25:08Z,,72.19.141.132,1.0,0.0,0.0,0.0,0.0,0.0,1
18260,375652060,15577915,user,blocked,annotated_onion_layer_5_rows_0_to_5000_raters_20,(unless they relate to my hatred of Arabs),(unless they relate to my hatred of Arabs),0,18242559,Jayjg/Archive 38,...,2010-07-27T01:17:08Z,,58.8.11.75,0.0,0.0,0.0,0.0,1.0,1.0,0
20480,553765001,36374950,user,blocked,annotated_onion_layer_5_rows_0_to_5000_raters_20,"Complaint about Biruitorul \n\nGreetings, Mat...",NEWLINENEWLINE== Complaint about Biruitorul ==...,0,9870625,Jimbo Wales,...,2013-05-06T11:40:42Z,18959131.0,Minceeed,1.0,0.0,0.0,0.0,0.0,0.0,1


In [66]:
# Inner-join on rev_id so that only annotations of revisions present in
# u_comments survive.
df = df.merge(
    u_comments[['rev_id']],
    on='rev_id',
    how='inner',
)
print('# annotations: ', len(df))

# annotations:  1346904


### Check that labels are not None

In [67]:
# dropna=False would surface missing values as a NaN row, if there were any
df['recipient'].value_counts(dropna=False)

0.0    1170677
1.0     176227
Name: recipient, dtype: int64

In [68]:
# same check for the attack column: a NaN row would reveal missing labels
df['attack'].value_counts(dropna=False)

0.0    1081700
1.0     265204
Name: attack, dtype: int64

In [69]:
# same check for the remapped aggression column
df['aggression'].value_counts(dropna=False)

1    960754
0    262716
2    123434
Name: aggression, dtype: int64

### Remove annotations from all revisions that were annotated fewer than 8 times

In [70]:
# Number of annotations per revision, as a two-column frame:
#   rev_id - the revision id
#   n      - how many annotation rows that revision has
# Built with groupby/size + reset_index so that `rev_id` exists only as a
# column. Going through value_counts() and copying the index into a column
# leaves `rev_id` as both the index name and a column label on pandas >= 2.0,
# which makes a later merge(on='rev_id') fail as ambiguous.
counts = df.groupby('rev_id').size().reset_index(name='n')

In [71]:
# one row per distinct rev_id
counts.shape

(122797, 2)

In [72]:
# how many revisions received each number of annotations
counts['n'].value_counts()

10    100865
9       6647
20      4465
30      3690
7       3034
29       904
8        856
6        475
19       416
4        224
2        219
5        206
3        199
1        184
28       149
18        52
31        46
27        42
17        20
14        13
16        12
26        12
25        11
15         9
40         8
13         7
23         7
11         5
24         5
21         4
12         4
38         3
22         2
39         1
50         1
Name: n, dtype: int64

In [73]:
# Revisions that received at least 8 annotations.
counts_enough = counts[counts['n'] >= 8]

In [74]:
# number of revisions that pass the n>=8 filter
counts_enough.shape

(118256, 2)

In [75]:
# Inner-join on rev_id so that only annotations of revisions listed in
# counts_enough remain.
df = df.merge(
    counts_enough[['rev_id']],
    on='rev_id',
    how='inner',
)
print('# annotations: ', len(df))

# annotations:  1319671


### Summary Stats

In [76]:
# number of remaining annotations for each (ns, sample) combination
df.groupby(['ns', 'sample']).size()

ns       sample 
article  blocked    316377
         random     196748
user     blocked    565576
         random     240970
dtype: int64

In [87]:
# Persist the cleaned annotations as a tab-separated file, without the row index.
df.to_csv(
    '../../data/annotations/clean/annotations.tsv',
    sep='\t',
    index=False,
)

In [88]:
# read the file back and show its shape as a check on what was written
pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t').shape

(1319671, 21)