In [3]:
import json
import pickle
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [18]:
data_dir = "./severity_dumps/"
filenames = os.listdir(data_dir)
filenames

['ta_joe_severity_r4',
 'ta_negin_severity_r5',
 'ta_sanjna_severity_r4_review',
 'ta_joe_severity_r1',
 'ta_xiaoyan_severity_r6',
 'ta_jillian_severity_r2',
 'ta_negin_severity_r4_review',
 'ta_xiaoyan_severity_r6_review',
 'ta_joe_severity_r2',
 'ta_xiaoyan_severity_r3',
 'ta_xiaoyan_severity_r5',
 'ta_sanjna_severity_r1',
 'ta_nour_severity_r1',
 'ta_joe_target_severity_r3',
 'ta_sanjna_severity_r3',
 'ta_sanjna_severity_r6_review',
 'ta_sanjna_severity_r2',
 'ta_mcgill_severity_r4_review',
 'ta_sanjna_severity_r4',
 'ta_nour_severity_r2',
 'ta_negin_severity_r1',
 'ta_sanjna_severity_r5',
 'ta_jillian_severity_r1',
 'ta_sanjna_severity_r6',
 'ta_negin_severity_r3',
 'ta_xiaoyan_severity_r1',
 'ta_joe_severity_training_r1',
 'ta_xiaoyan_severity_r4_review',
 'ta_xiaoyan_severity_r4',
 'ta_xiaoyan_severity_r2',
 'ta_joe_severity_r3',
 'ta_negin_severity_r4',
 'ta_negin_severity_r2']

In [11]:
def load_ds_by_keyword(data_dir, keyword, cols=['text', 'meta', 'label', 'accept', 'answer']):
    filenames = os.listdir(data_dir)
    filtered_fns = [name for name in filenames if keyword in name]
    datasets = {}
    for filename in filtered_fns:
        ds = pd.DataFrame(pickle.load(open(os.path.join(data_dir, filename), "rb")))
        datasets[filename] = ds
#         print(ds.columns)
#         print("{} has {} annotations".format(filename, len(datasets[filename])))
    return datasets

In [71]:
# Cleaning annotations
round_data_dict = {}
for i in tqdm(range(1, 7)):
    round_title = "r"+str(i)
    round_data_dict[round_title] = load_ds_by_keyword(data_dir, round_title)
    keys_to_pop = []
    for annotator_file in round_data_dict[round_title].keys():
#         print(annotator_file)
    
        if len(annotator_file.split("_"))>4:
            print("*"*20)
            print("skipping {}".format(annotator_file))
            print("*"*20)
            keys_to_pop.append(annotator_file)
            continue
        
        curr_df = round_data_dict[round_title][annotator_file] 
        if "meta" not in curr_df.columns:
            print("*"*20)
            print("{} has no meta but has {} records".format(annotator_file, len(curr_df)))
            print("*"*20)
            keys_to_pop.append(annotator_file)
            continue
        tweet_ids = [item["tweet_id"] for item in curr_df["meta"]]
        curr_df["tweet_id"] = tweet_ids
        annotator = annotator_file.split("_")[1]
        curr_df["annotator"] = [annotator]*len(curr_df)
        curr_df["round"] = [round_title]*len(curr_df)

        round_data_dict[round_title][annotator_file] = curr_df[["tweet_id", "annotator", "round", "text", "label", "accept"]]
    for key in keys_to_pop:
        round_data_dict[round_title].pop(key)

100%|██████████| 6/6 [00:00<00:00, 25.72it/s]

********************
skipping ta_joe_severity_training_r1
********************
********************
skipping ta_joe_target_severity_r3
********************
********************
ta_joe_severity_r3 has no meta but has 0 records
********************
********************
skipping ta_sanjna_severity_r4_review
********************
********************
skipping ta_negin_severity_r4_review
********************
********************
skipping ta_mcgill_severity_r4_review
********************
********************
skipping ta_xiaoyan_severity_r4_review
********************
********************
ta_negin_severity_r5 has no meta but has 0 records
********************
********************
ta_xiaoyan_severity_r5 has no meta but has 0 records
********************
********************
ta_sanjna_severity_r5 has no meta but has 0 records
********************
********************
skipping ta_xiaoyan_severity_r6_review
********************
********************
skipping ta_sanjna_severity_r6_review
***********




In [74]:
round_data_dict["r1"]["ta_xiaoyan_severity_r1"]
    

Unnamed: 0,tweet_id,annotator,round,text,label,accept
0,925856978475171841,xiaoyan,r1,Here we go again. Yet another LIBERAL accused ...,1,"[1, 6]"
1,930292240416731137,xiaoyan,r1,I remember when everyone was dickriding -JOHN ...,1,"[1, 7]"
2,918439998898294784,xiaoyan,r1,"I'm a Matt Damon fan, can't even believe he wa...",1,"[1, 2]"
3,921475052125614080,xiaoyan,r1,Darn it. I was hoping a generation of tech emp...,1,"[1, 4]"
4,929109853267746818,xiaoyan,r1,"Moore, <OTHER TARGET 2>, -JOHN DOE-, Mueller, ...",1,"[1, 3]"
...,...,...,...,...,...,...
140,929015378600124418,xiaoyan,r1,"If proven true, id not only disqualify Moore, ...",1,"[1, 8]"
141,929176298630647808,xiaoyan,r1,"<OTHER TARGET 2>, Polansky, Allen, -JOHN DOE-,...",1,"[1, 3]"
142,929176298630647808,xiaoyan,r1,"-JOHN DOE-, Polansky, Allen, <OTHER TARGET 1>,...",1,"[1, 3]"
143,922904919081963520,xiaoyan,r1,"""-JOHN DOE-"" is the kind of person we don't ne...",1,"[1, 8]"


In [28]:
def get_tweet_ids(df):
    print(df.columns)
    print("total_records {}".format(len(df)))
    if "label" in df.columns:
        print("label==1 {}".format(sum(df.label)))
    else:
        print("no label col")
#         print(df.columns)
        print("answer==accept {}".format(sum(df.answer=="accept")))

    print("-"*30)
    try:
        tweet_ids = [m["tweet_id"] for m in df.meta]
        df["tweet_id"] = tweet_ids
        return tweet_ids
    except:
        return []

In [58]:

# curr_df["tweet_id"] = curr_df["meta"]["tweet_id"]
round_data_dict["r1"]["ta_joe_severity_r1"].keys()

Index(['tweet_id', 'annotator', 'text', 'label', 'accept'], dtype='object')

In [47]:
r1_datasets = {}
for filename in r1_filenames:
    
    r1_datasets[filename] = pickle.load(open(os.path.join(data_dir, filename), "rb"))
    print(filename, len(r1_datasets[filename]))

NameError: name 'r1_filenames' is not defined

In [18]:
r1_datasets.keys()

dict_keys(['ta_joe_severity_r1', 'ta_sanjna_severity_r1', 'ta_nour_severity_r1', 'ta_negin_severity_r1', 'ta_jillian_severity_r1', 'ta_xiaoyan_severity_r1', 'ta_joe_severity_training_r1'])

In [21]:
r1_datasets["ta_joe_severity_r1"][0].keys()


dict_keys(['text', 'meta', 'spans', 'label', 'accept', '_input_hash', '_task_hash', 'options', '_session_id', '_view_id', 'answer'])

In [23]:
r1_datasets["ta_joe_severity_r1"][0]["meta"]


{'tweet_id': '925856978475171841',
 'mask_map': {'<TARGET 1>': 'Michael Oreskes'},
 'pattern_match': True,
 'target_label': '<TARGET 1>',
 'target': 'Michael Oreskes'}

In [31]:
r1_datasets["ta_joe_severity_r1"][2]["accept"]


[1, '2']

# plan to clean
- [ ] go over rounds, on each round collect all items
- [ ] take majority vote (look into the paper and see what the did)
- [ ] based on tweet id, extract text, maj_vote, tweet_id