## **Assignment 2**
## CS585 - NLP
## Oleksandr Shashkov

In [31]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import cohen_kappa_score
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Part 1: Evaluate annotator agreement

In [32]:
# Folder on the mounted Drive that holds the annotation CSVs. Kept as a plain
# string because file names are appended to it with `path + file`.
path = "/content/drive/MyDrive/Education/CS585/HW2/"
files = [
         "nyt_topic_0_lockdowns.csv",
         "nyt_topic_1_lockdowns.csv",
         "nyt_topic_2_lockdowns.csv",
         "nyt_topic_0_masking_and_distancing.csv",
         "nyt_topic_1_masking_and_distancing.csv",
         "nyt_topic_2_masking_and_distancing.csv",
         "nyt_topic_0_vaccination.csv",
         "nyt_topic_1_vaccination.csv",
         "nyt_topic_2_vaccination.csv",
         "twitter_topic_0_lockdowns.csv",
         "twitter_topic_1_lockdowns.csv",
         "twitter_topic_2_lockdowns.csv",
         "twitter_topic_0_masking_and_distancing.csv",
         "twitter_topic_1_masking_and_distancing.csv",
         "twitter_topic_2_masking_and_distancing.csv",
         "twitter_topic_0_vaccination.csv",
         "twitter_topic_1_vaccination.csv",
         "twitter_topic_2_vaccination.csv",
         ]

# Nested dictionary: file name -> {annotator column name -> average kappa}
dict_kappas = {}

# Iterate through all the files and fill the nested dictionary
for file in files:
  data = pd.read_csv(path + file)

  # Every column after the first one is treated as one annotator's labels
  annotators = data.columns[1:].tolist()
  n_annotators = len(annotators)

  # Pairwise kappa matrix; the diagonal stays 0 so that it does not contribute
  # to the per-annotator sums below.
  kappas = np.zeros((n_annotators, n_annotators))
  # Cohen's kappa treats both raters alike, so each unordered pair is scored
  # once and mirrored instead of being computed twice.
  for i in range(n_annotators):
    for j in range(i + 1, n_annotators):
      score = cohen_kappa_score(data[annotators[i]], data[annotators[j]])
      kappas[i, j] = score
      kappas[j, i] = score

  # Average agreement of each annotator with all the *other* annotators
  if n_annotators > 1:
    kappas_ave = kappas.sum(axis=0) / (n_annotators - 1)
  else:
    # With fewer than two annotators there is no peer to compare against;
    # report NaN explicitly instead of dividing by zero.
    kappas_ave = np.full(n_annotators, np.nan)

  dict_kappas[file] = dict(zip(annotators, kappas_ave))

print("Average kappa scores for each annotator for every topic file:")
for file, file_kappas in dict_kappas.items():
  print(file + ":")
  for ant, kappa in file_kappas.items():
    print("  " + ant + ": " + str(kappa))

Average kappa scores for each annotator for every topic file:
nyt_topic_0_lockdowns.csv:
  annotation_41: 0.3865669044318172
  annotation_67: 0.41104152550015244
  annotation_19: 0.4459637258672724
  annotation_55: 0.5280749200206049
nyt_topic_1_lockdowns.csv:
  annotation_65: 0.38852720260388834
  annotation_66: 0.38068806455874454
  annotation_68: 0.4584365385007858
  annotation_69: 0.40820392347279305
nyt_topic_2_lockdowns.csv:
  annotation_40: 0.5395813559311737
  annotation_8: 0.2852685594864206
  annotation_42: 0.5042676574197813
  annotation_15: 0.5118571375739781
nyt_topic_0_masking_and_distancing.csv:
  annotation_41: 0.6178689693623323
  annotation_67: 0.6793745080912982
  annotation_19: 0.559986544802163
  annotation_55: 0.6905681829759921
nyt_topic_1_masking_and_distancing.csv:
  annotation_65: 0.717961781836577
  annotation_66: 0.81615480779507
  annotation_68: 0.7796374195875933
  annotation_69: 0.7608777556872485
nyt_topic_2_masking_and_distancing.csv:
  annotation_40: 0

## Part 2: Assemble datasets

In [33]:
# Annotators whose average kappa is below this value are excluded from voting.
# NOTE: the (misspelled) name is kept as-is because other cells may refer to it.
kappa_treshold = 0.2

for file, kappas in dict_kappas.items():
  print("processing file " + file + ":")
  data = pd.read_csv(path + file)
  # The annotator with the highest average kappa acts as the tie-breaker
  max_key = max(kappas, key=kappas.get)

  # Eliminate poor annotators. The tie-breaker is never dropped: its column is
  # needed below, and if it is under the threshold then so is everyone else.
  for anot, kappa in kappas.items():
    if kappa < kappa_treshold and anot != max_key:
      print(" dropping " + anot + " with kappa = " + str(kappa))
      data = data.drop(columns=[anot])

  # Voting - majority label, or the best annotator's label on a split.
  # mode(axis=1) returns one column per tied value, padded with NaN, so a
  # second column exists only when at least one row in the file has a tie.
  mode = data.iloc[:, 1:].mode(axis=1)
  if mode.shape[1] > 1:
    is_tie = mode[1].notna()
  else:
    is_tie = pd.Series(False, index=mode.index)
  data['label'] = np.where(is_tie, data[max_key], mode[0])

  # Save the text together with the chosen label
  data.loc[:, ['text', 'label']].to_csv(path + Path(file).stem + "_mod.csv", index=False)
  print("Done")

processing file nyt_topic_0_lockdowns.csv:
Done
processing file nyt_topic_1_lockdowns.csv:
Done
processing file nyt_topic_2_lockdowns.csv:
Done
processing file nyt_topic_0_masking_and_distancing.csv:
Done
processing file nyt_topic_1_masking_and_distancing.csv:
Done
processing file nyt_topic_2_masking_and_distancing.csv:
Done
processing file nyt_topic_0_vaccination.csv:
Done
processing file nyt_topic_1_vaccination.csv:
Done
processing file nyt_topic_2_vaccination.csv:
Done
processing file twitter_topic_0_lockdowns.csv:
 dropping annotation_104 with kappa = 0.15310922941528948
 dropping annotation_102 with kappa = 0.1699956795349352
Done
processing file twitter_topic_1_lockdowns.csv:
Done
processing file twitter_topic_2_lockdowns.csv:
 dropping annotation_56 with kappa = 0.1163835813527575
 dropping annotation_58 with kappa = 0.19539052471284699
Done
processing file twitter_topic_0_masking_and_distancing.csv:
Done
processing file twitter_topic_1_masking_and_distancing.csv:
Done
processin

## Combine files 

In [34]:
# these are the names of temporary files created on the previous step
mod_files = [
         "nyt_topic_0_lockdowns_mod.csv",
         "nyt_topic_1_lockdowns_mod.csv",
         "nyt_topic_2_lockdowns_mod.csv",
         "nyt_topic_0_masking_and_distancing_mod.csv",
         "nyt_topic_1_masking_and_distancing_mod.csv",
         "nyt_topic_2_masking_and_distancing_mod.csv",
         "nyt_topic_0_vaccination_mod.csv",
         "nyt_topic_1_vaccination_mod.csv",
         "nyt_topic_2_vaccination_mod.csv",
         "twitter_topic_0_lockdowns_mod.csv",
         "twitter_topic_1_lockdowns_mod.csv",
         "twitter_topic_2_lockdowns_mod.csv",
         "twitter_topic_0_masking_and_distancing_mod.csv",
         "twitter_topic_1_masking_and_distancing_mod.csv",
         "twitter_topic_2_masking_and_distancing_mod.csv",
         "twitter_topic_0_vaccination_mod.csv",
         "twitter_topic_1_vaccination_mod.csv",
         "twitter_topic_2_vaccination_mod.csv"         
         ]

#these are the names of final output files
final_files = [
         "nyt_topic_lockdowns.csv",
         "nyt_topic_masking_and_distancing.csv",
         "nyt_topic_vaccination.csv",
         "twitter_topic_lockdowns.csv",
         "twitter_topic_masking_and_distancing.csv",
         "twitter_topic_vaccination.csv"
         ]

for index, file in enumerate(final_files):
  print("combining final file " + file + " from:")
  #get new and empty dataframe for each final file firts
  data = pd.DataFrame()
  for i in range(3):
    fl_indx = index*3+i
    print("  " + mod_files[fl_indx])
    #get the data from the file loaded
    df = pd.read_csv(path + mod_files[fl_indx])
    #add it to the dataframe
    data = pd.concat([data, df], ignore_index=True)
  #save the final version of file
  data.to_csv(path + file, index=False)
  print("Done!")


combining final file nyt_topic_lockdowns.csv from:
  nyt_topic_0_lockdowns_mod.csv
  nyt_topic_1_lockdowns_mod.csv
  nyt_topic_2_lockdowns_mod.csv
Done!
combining final file nyt_topic_masking_and_distancing.csv from:
  nyt_topic_0_masking_and_distancing_mod.csv
  nyt_topic_1_masking_and_distancing_mod.csv
  nyt_topic_2_masking_and_distancing_mod.csv
Done!
combining final file nyt_topic_vaccination.csv from:
  nyt_topic_0_vaccination_mod.csv
  nyt_topic_1_vaccination_mod.csv
  nyt_topic_2_vaccination_mod.csv
Done!
combining final file twitter_topic_lockdowns.csv from:
  twitter_topic_0_lockdowns_mod.csv
  twitter_topic_1_lockdowns_mod.csv
  twitter_topic_2_lockdowns_mod.csv
Done!
combining final file twitter_topic_masking_and_distancing.csv from:
  twitter_topic_0_masking_and_distancing_mod.csv
  twitter_topic_1_masking_and_distancing_mod.csv
  twitter_topic_2_masking_and_distancing_mod.csv
Done!
combining final file twitter_topic_vaccination.csv from:
  twitter_topic_0_vaccination_mod.