In [3]:
import json
import pickle
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [18]:
data_dir = "./severity_dumps/"
filenames = os.listdir(data_dir)
filenames

['ta_joe_severity_r4',
 'ta_negin_severity_r5',
 'ta_sanjna_severity_r4_review',
 'ta_joe_severity_r1',
 'ta_xiaoyan_severity_r6',
 'ta_jillian_severity_r2',
 'ta_negin_severity_r4_review',
 'ta_xiaoyan_severity_r6_review',
 'ta_joe_severity_r2',
 'ta_xiaoyan_severity_r3',
 'ta_xiaoyan_severity_r5',
 'ta_sanjna_severity_r1',
 'ta_nour_severity_r1',
 'ta_joe_target_severity_r3',
 'ta_sanjna_severity_r3',
 'ta_sanjna_severity_r6_review',
 'ta_sanjna_severity_r2',
 'ta_mcgill_severity_r4_review',
 'ta_sanjna_severity_r4',
 'ta_nour_severity_r2',
 'ta_negin_severity_r1',
 'ta_sanjna_severity_r5',
 'ta_jillian_severity_r1',
 'ta_sanjna_severity_r6',
 'ta_negin_severity_r3',
 'ta_xiaoyan_severity_r1',
 'ta_joe_severity_training_r1',
 'ta_xiaoyan_severity_r4_review',
 'ta_xiaoyan_severity_r4',
 'ta_xiaoyan_severity_r2',
 'ta_joe_severity_r3',
 'ta_negin_severity_r4',
 'ta_negin_severity_r2']

#### Loading annotations and cleaning unwanted columns

In [11]:
def load_ds_by_keyword(data_dir, keyword, cols=['text', 'meta', 'label', 'accept', 'answer']):
    filenames = os.listdir(data_dir)
    filtered_fns = [name for name in filenames if keyword in name]
    datasets = {}
    for filename in filtered_fns:
        ds = pd.DataFrame(pickle.load(open(os.path.join(data_dir, filename), "rb")))
        datasets[filename] = ds
#         print(ds.columns)
#         print("{} has {} annotations".format(filename, len(datasets[filename])))
    return datasets

In [71]:
# Cleaning annotations
round_data_dict = {}
for i in tqdm(range(1, 7)):
    round_title = "r"+str(i)
    round_data_dict[round_title] = load_ds_by_keyword(data_dir, round_title)
    keys_to_pop = []
    for annotator_file in round_data_dict[round_title].keys():
#         print(annotator_file)
    
        if len(annotator_file.split("_"))>4:
            print("*"*20)
            print("skipping {}".format(annotator_file))
            print("*"*20)
            keys_to_pop.append(annotator_file)
            continue
        
        curr_df = round_data_dict[round_title][annotator_file] 
        if "meta" not in curr_df.columns:
            print("*"*20)
            print("{} has no meta but has {} records".format(annotator_file, len(curr_df)))
            print("*"*20)
            keys_to_pop.append(annotator_file)
            continue
        tweet_ids = [item["tweet_id"] for item in curr_df["meta"]]
        curr_df["tweet_id"] = tweet_ids
        annotator = annotator_file.split("_")[1]
        curr_df["annotator"] = [annotator]*len(curr_df)
        curr_df["round"] = [round_title]*len(curr_df)

        round_data_dict[round_title][annotator_file] = curr_df[["tweet_id", "annotator", "round", "text", "label", "accept"]]
    for key in keys_to_pop:
        round_data_dict[round_title].pop(key)

100%|██████████| 6/6 [00:00<00:00, 25.72it/s]

********************
skipping ta_joe_severity_training_r1
********************
********************
skipping ta_joe_target_severity_r3
********************
********************
ta_joe_severity_r3 has no meta but has 0 records
********************
********************
skipping ta_sanjna_severity_r4_review
********************
********************
skipping ta_negin_severity_r4_review
********************
********************
skipping ta_mcgill_severity_r4_review
********************
********************
skipping ta_xiaoyan_severity_r4_review
********************
********************
ta_negin_severity_r5 has no meta but has 0 records
********************
********************
ta_xiaoyan_severity_r5 has no meta but has 0 records
********************
********************
ta_sanjna_severity_r5 has no meta but has 0 records
********************
********************
skipping ta_xiaoyan_severity_r6_review
********************
********************
skipping ta_sanjna_severity_r6_review
***********




#### Merging all rounds into a single dataframe

In [105]:
round_data_dict["r1"]["ta_xiaoyan_severity_r1"]
# get all in nested lists
data_dfs = [[round_data_dict[r][f] for f in round_data_dict[r]] for r in round_data_dict]
# flatten
data_dfs = [val for sublist in data_dfs for val in sublist]
annotations_df = pd.concat(data_dfs)
annotations_df

Unnamed: 0,tweet_id,annotator,round,text,label,accept
0,925856978475171841,joe,r1,Here we go again. Yet another LIBERAL accused ...,1,[6]
1,930292240416731137,joe,r1,I remember when everyone was dickriding -JOHN ...,1,"[1, 7]"
2,918439998898294784,joe,r1,"I'm a Matt Damon fan, can't even believe he wa...",1,"[1, 2]"
3,921475052125614080,joe,r1,Darn it. I was hoping a generation of tech emp...,1,"[1, 3]"
4,929109853267746818,joe,r1,"Moore, <OTHER TARGET 2>, -JOHN DOE-, Mueller, ...",1,"[1, 4]"
...,...,...,...,...,...,...
286,928886595159232512,sanjna,r6,Not -JOHN DOE- too! I'm not having it.,1,[1]
287,928633094600568837,sanjna,r6,Feel dead guilty for naming our seagull after ...,1,[3]
288,929020088321110017,sanjna,r6,Everybody knows about -JOHN DOE- and he treats...,1,[1]
289,928886595159232512,sanjna,r6,Not -JOHN DOE- too! I'm not having it.,1,[1]


#### mean vote

In [117]:
rows = []
for idx, group in  annotations_df.groupby("tweet_id"):
    row = {}
    row["tweet_id"] = idx
    row["round"] = group["round"].unique()[0]
    row["text"] = list(group["text"])[0]
    
    print(idx)
    print(group["accept"])
    ratings = []
    for accept in group["accept"]:
        if len(accept)>1:
            if "hard_to_parse" in accept:
                accept.remove("hard_to_parse")
            if accept[0] == 1 and len(accept)>1:
                rating =  int(accept[1])
            elif len(accept) == 1:
                rating = int(accept[0])
            else:
                continue
        
        else:
            if "hard_to_parse" not in accept and len(accept)>=1:
                rating = int(accept[0])
        ratings.append(rating)
    print(ratings)
    avg_rating = round(np.average(ratings))
    row["rounded_avg_severity"] = avg_rating
    print("avg = {}".format(avg_rating))
    rows.append(row)

916015738175082496
162    [3]
167    [7]
172    [6]
Name: accept, dtype: object
[3, 7, 6]
avg = 5
916025460156518401
646    [5]
651    [5]
669    [4]
Name: accept, dtype: object
[5, 5, 4]
avg = 5
916037936516337665
144    [1]
149    [7]
154    [6]
Name: accept, dtype: object
[1, 7, 6]
avg = 5
916044166596935680
651    [7]
656    [8]
674    [7]
Name: accept, dtype: object
[7, 8, 7]
avg = 7
916055010240270337
140    [6]
145    [7]
150    [7]
Name: accept, dtype: object
[6, 7, 7]
avg = 7
916061337561849857
249    [5]
24     [5]
29     [4]
Name: accept, dtype: object
[5, 5, 4]
avg = 5
916092718979469312
649    [6]
654    [6]
672    [5]
Name: accept, dtype: object
[6, 6, 5]
avg = 6
916125672547942400
638    [4]
643    [4]
661    [5]
Name: accept, dtype: object
[4, 4, 5]
avg = 4
916320296738721792
143    [5]
148    [8]
153    [6]
Name: accept, dtype: object
[5, 8, 6]
avg = 6
916340122420903936
110    [1, 7]
100    [1, 6]
110    [1, 7]
110    [1, 7]
44     [1, 7]
110    [1, 5]
Name: accept, d

Name: accept, dtype: object
[3, 7, 2]
avg = 4
921164633406353408
514    [4]
524    [2]
544    [2]
Name: accept, dtype: object
[4, 2, 2]
avg = 3
921173867908956160
509    [1]
519    [2]
539    [3]
Name: accept, dtype: object
[1, 2, 3]
avg = 2
921181027451543557
193    [4]
223    [3]
Name: accept, dtype: object
[4, 3]
avg = 4
921183431379378176
21    [1, 7]
22    [1, 7]
11    [1, 2]
12    [1, 2]
21    [1, 7]
22    [1, 7]
21    [1, 6]
22    [1, 6]
21    [1, 5]
22    [1, 5]
7     [1, 4]
8     [1, 4]
Name: accept, dtype: object
[7, 7, 2, 2, 7, 7, 6, 6, 5, 5, 4, 4]
avg = 5
921189169786888192
515    [7]
525    [7]
545    [7]
Name: accept, dtype: object
[7, 7, 7]
avg = 7
921197902013386752
55    [1, 3]
90    [1, 6]
Name: accept, dtype: object
[3, 6]
avg = 4
921199240092835840
195    [3]
200    [4]
205    [2]
Name: accept, dtype: object
[3, 4, 2]
avg = 3
921217436757630976
72    [1, 5]
62       [2]
72    [1, 6]
72    [1, 5]
23    [1, 2]
72    [1, 7]
Name: accept, dtype: object
[5, 2, 6, 5, 2, 7

Name: accept, dtype: object
[1, 1, 1, 1, 5, 5, 4, 4, 4, 4]
avg = 3
923655018389164034
198    [1, 1]
198       [1]
198    [1, 1]
Name: accept, dtype: object
[1, 1, 1]
avg = 1
923658850909261824
54    [3]
68    [2]
Name: accept, dtype: object
[3, 2]
avg = 2
923672531822956545
16    [5]
16    [5]
16    [4]
16    [5]
Name: accept, dtype: object
[5, 5, 4, 5]
avg = 5
923673876760092672
314    [7]
334    [6]
332    [5]
Name: accept, dtype: object
[7, 6, 5]
avg = 6
923675616938835968
64    [7]
78    [7]
Name: accept, dtype: object
[7, 7]
avg = 7
923676472904179712
307    [4]
327    [4]
325    [5]
Name: accept, dtype: object
[4, 4, 5]
avg = 4
923690604361424896
310    [6]
330    [6]
328    [6]
Name: accept, dtype: object
[6, 6, 6]
avg = 6
923695318121693184
55    [4]
69    [5]
Name: accept, dtype: object
[4, 5]
avg = 4
923698551972020225
14    [1, 7]
15    [1, 7]
4     [1, 2]
5     [1, 2]
14    [1, 4]
15    [1, 4]
14    [1, 5]
15    [1, 5]
14    [1, 4]
15    [1, 4]
Name: accept, dtype: object
[

Name: accept, dtype: object
[3, 6, 4]
avg = 4
925710322987618304
704    [3]
705    [5]
727    [3]
Name: accept, dtype: object
[3, 5, 3]
avg = 4
925712028509130752
67    [1, 5]
57    [1, 4]
67    [1, 4]
67    [1, 3]
67    [1, 2]
Name: accept, dtype: object
[5, 4, 4, 3, 2]
avg = 4
925712508542947329
91    [1, 6]
81    [1, 7]
91    [1, 7]
91    [1, 6]
91    [1, 7]
63    [1, 6]
Name: accept, dtype: object
[6, 7, 7, 6, 7, 6]
avg = 6
925715902741565440
709    [4]
710    [6]
732    [5]
Name: accept, dtype: object
[4, 6, 5]
avg = 5
925717360387985408
692    [1]
693    [3]
715    [6]
Name: accept, dtype: object
[1, 3, 6]
avg = 3
925719745629863938
217    [6]
222    [7]
227    [7]
Name: accept, dtype: object
[6, 7, 7]
avg = 7
925722586993926144
205    [3]
210    [7]
215    [6]
Name: accept, dtype: object
[3, 7, 6]
avg = 5
925724603170902016
73    [1, 7]
63    [1, 6]
73    [1, 9]
73    [1, 7]
73    [1, 7]
Name: accept, dtype: object
[7, 6, 9, 7, 7]
avg = 7
925725608704868354
701    [5]
702    [5]

Name: accept, dtype: object
[3, 4, 6]
avg = 4
926421408502206464
11    [1, 4]
11    [1, 2]
11    [1, 1]
11    [1, 4]
Name: accept, dtype: object
[4, 2, 1, 4]
avg = 3
926445468154032129
61    [6]
75    [7]
Name: accept, dtype: object
[6, 7]
avg = 6
926451002923986949
71    [1, 6]
61    [1, 4]
71    [1, 7]
71    [1, 8]
71    [1, 8]
Name: accept, dtype: object
[6, 4, 7, 8, 8]
avg = 7
926454374364336129
88    [1, 0]
78    [1, 9]
88    [1, 9]
88    [1, 8]
32    [1, 9]
88    [1, 6]
Name: accept, dtype: object
[0, 9, 9, 8, 9, 6]
avg = 7
926456258206543872
713    [2]
714    [3]
736    [2]
Name: accept, dtype: object
[2, 3, 2]
avg = 2
926463892506726400
335    [7]
355    [4]
353    [4]
Name: accept, dtype: object
[7, 4, 4]
avg = 5
926464484889243649
5                   [2]
5    [1, hard_to_parse]
5                   [1]
5                   [2]
Name: accept, dtype: object
[2, 1, 1, 2]
avg = 2
926465403970351105
87    [1, 6]
Name: accept, dtype: object
[6]
avg = 6
926471382598410241
342    [6]
72

Name: accept, dtype: object
[6, 2]
avg = 4
928623936123883521
275    [7]
269    [3]
Name: accept, dtype: object
[7, 3]
avg = 5
928631530024468480
479                [3]
482                [5]
497    [hard_to_parse]
Name: accept, dtype: object
[3, 5, 5]
avg = 4
928631891493834758
274    [7]
268    [1]
Name: accept, dtype: object
[7, 1]
avg = 4
928632460350492672
277    [3]
276    [4]
Name: accept, dtype: object
[3, 4]
avg = 4
928633094600568837
248    [5]
288    [4]
287    [3]
290    [3]
Name: accept, dtype: object
[5, 4, 3, 3]
avg = 4
928633741634887685
759    [6]
518    [5]
538    [6]
Name: accept, dtype: object
[6, 5, 6]
avg = 6
928633915564314625
101    [1, 8]
91     [1, 7]
101    [1, 7]
101    [1, 7]
35     [1, 8]
101    [1, 7]
Name: accept, dtype: object
[8, 7, 7, 7, 8, 7]
avg = 7
928652951152877569
97    [1, 2]
Name: accept, dtype: object
[2]
avg = 2
928660067980857344
146    [1, 7]
136    [1, 5]
146    [1, 7]
146    [1, 7]
68     [1, 7]
146    [1, 4]
Name: accept, dtype: object


929017760205635585
37    [3]
51    [6]
Name: accept, dtype: object
[3, 6]
avg = 4
929018528933597185
140    [7]
154    [4]
Name: accept, dtype: object
[7, 4]
avg = 6
929020088321110017
280    [5]
279    [2]
288    [1]
Name: accept, dtype: object
[5, 2, 1]
avg = 3
929020750320738304
165    [1, 4]
155    [1, 6]
165    [1, 6]
165    [1, 8]
165    [1, 7]
Name: accept, dtype: object
[4, 6, 6, 8, 7]
avg = 6
929023716067020804
56    [1, 9]
46    [1, 9]
56    [1, 0]
56    [1, 9]
57    [1, 9]
37    [1, 9]
Name: accept, dtype: object
[9, 9, 0, 9, 9, 9]
avg = 8
929027260669222913
29    [1, 7]
19    [1, 4]
29    [1, 7]
29    [1, 7]
29    [1, 6]
Name: accept, dtype: object
[7, 4, 7, 7, 6]
avg = 6
929029455082938368
56                   [1, 6]
46                   [1, 7]
56    [1, 5, hard_to_parse]
56                   [1, 4]
56                   [1, 8]
Name: accept, dtype: object
[6, 7, 5, 4, 8]
avg = 6
929029797438742533
9     [8]
19    [5]
Name: accept, dtype: object
[8, 5]
avg = 6
92903455853160

929261727891406848
90     [4]
95     [5]
100    [4]
Name: accept, dtype: object
[4, 5, 4]
avg = 4
929261978417319936
84    [2]
89    [2]
94    [3]
Name: accept, dtype: object
[2, 2, 3]
avg = 2
929269864698601472
585    [3]
595    [2]
615    [1]
Name: accept, dtype: object
[3, 2, 1]
avg = 2
929270481613570048
200    [5]
230    [1]
Name: accept, dtype: object
[5, 1]
avg = 3
929273600774983680
401    [0]
412    [9]
419    [8]
Name: accept, dtype: object
[0, 9, 8]
avg = 6
929278572048478208
625    [5]
630    [3]
648    [5]
Name: accept, dtype: object
[5, 3, 5]
avg = 4
929279698491731969
94     [2]
99     [4]
104    [3]
Name: accept, dtype: object
[2, 4, 3]
avg = 3
929282634408132609
417    [9]
428    [8]
435    [8]
Name: accept, dtype: object
[9, 8, 8]
avg = 8
929289670500012032
124    [2]
138    [1]
Name: accept, dtype: object
[2, 1]
avg = 2
929291514584412160
248    [5]
23     [3]
28     [6]
Name: accept, dtype: object
[5, 3, 6]
avg = 5
929296747104522240
574    [3]
584    [4]
604    [4]

Name: accept, dtype: object
[3, 2, 5]
avg = 3
929916715496083456
40    [1, 7]
30    [1, 7]
40    [1, 9]
40    [1, 7]
40    [1, 7]
21    [1, 7]
Name: accept, dtype: object
[7, 7, 9, 7, 7, 7]
avg = 7
929925627104387072
360                   [6]
371                   [6]
378    [7, hard_to_parse]
Name: accept, dtype: object
[6, 6, 7]
avg = 6
929948193994850305
89    [1, 9]
79    [1, 8]
89    [1, 7]
89    [1, 8]
33    [1, 7]
89    [1, 5]
Name: accept, dtype: object
[9, 8, 7, 8, 7, 5]
avg = 7
929959313283670016
592    [3]
602    [3]
756    [3]
766    [3]
769    [3]
Name: accept, dtype: object
[3, 3, 3, 3, 3]
avg = 3
929964842747813889
47    [1, 4]
56    [1, 4]
56    [1, 3]
Name: accept, dtype: object
[4, 4, 3]
avg = 4
929965650260369408
94     [7]
108    [7]
Name: accept, dtype: object
[7, 7]
avg = 7
929972957627088896
102    [1, 8]
92     [1, 7]
102    [1, 8]
102    [1, 6]
36     [1, 6]
102    [1, 6]
Name: accept, dtype: object
[8, 7, 8, 6, 6, 6]
avg = 7
929979790186172416
92     [4]
106  

#### save the cleaned version

In [133]:
mean_severity_df = pd.DataFrame(rows)
mean_severity_df[mean_severity_df["round"]=="r4"]
mean_severity_df.to_csv("mean_severity.csv", index=False)

In [138]:
mean_severity_df

Unnamed: 0,tweet_id,round,text,rounded_avg_severity
0,916015738175082496,r4,The -JOHN DOE- story is an upsetting exclamati...,5
1,916025460156518401,r4,Stories about -JOHN DOE- reportedly circulated...,5
2,916037936516337665,r4,My personal hell will be watching -JOHN DOE- a...,5
3,916044166596935680,r4,Now -JOHN DOE- will be known as inmate 32666.,7
4,916055010240270337,r4,-JOHN DOE- is fucking disgusting. Oh how the m...,7
...,...,...,...,...
1288,930519051985702912,r4,Thought we all knew tht -JOHN DOE- is a narcis...,5
1289,930542203503509504,r4,Kinda shocked that I’m hearing everything I’m ...,3
1290,930561250232274944,r4,The last line of the first song off their last...,3
1291,930568409062035457,r1,Hey when I was 15 someone from a semi-relevant...,7


In [137]:
df = pd.read_csv("mean_severity.csv")
df.groupby("rounded_avg_severity").count()

Unnamed: 0_level_0,tweet_id,round,text
rounded_avg_severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,2,2
1,9,9,9
2,114,114,114
3,168,168,168
4,252,252,252
5,223,223,223
6,266,266,266
7,175,175,175
8,82,82,82
9,2,2,2


#### Creating train/test splits

In [141]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
data_dir = "severity_dataset_loading_script/"
file_name = "mean_severity.csv"
def create_train_test(data_dir, file_name):
    file_path = os.path.join(data_dir, file_name )
    ds_df = pd.read_csv(file_path)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    sss.get_n_splits(ds_df, ds_df["rounded_avg_severity"])
    print(sss)
#     print(ds_df)
    for train_index, test_index in sss.split(ds_df, ds_df["rounded_avg_severity"]):
        print(train_index)
        train_df= ds_df.iloc[train_index,:]
        test_df= ds_df.iloc[test_index,:]

#         test_df = ds_df[test_index]
    return train_df, test_df
train_df, test_df = create_train_test(data_dir, file_name)

StratifiedShuffleSplit(n_splits=1, random_state=0, test_size=0.2,
            train_size=None)
[ 522  328 1150 ...  966 1122  479]


In [144]:
sum(train_df.rounded_avg_severity == 5)/len(train_df.rounded_avg_severity)

0.172147001934236

In [146]:
sum(test_df.rounded_avg_severity == 5)/len(test_df.rounded_avg_severity)

0.17374517374517376

In [148]:
ARE_U_SURE = False
if ARE_U_SURE:
    train_df.to_csv("severity_dataset_loading_script/severity_train.csv", index=False)
    test_df.to_csv("severity_dataset_loading_script/severity_test.csv", index=False)
else:
    print("This will regenerate the test set! are you sure?")

This will regenerate the test set! are you sure?


-----------------------------------

# plan to clean
- [ ] go over rounds, on each round collect all items
- [ ] take majority vote (look into the paper and see what the did)
- [ ] based on tweet id, extract text, maj_vote, tweet_id