In [1]:
import pandas as pd 
import sys
sys.path.append('../../src/data_preprocessing/')

from process_doccano_output import main

class Args:
    """Plain attribute container handed to ``main`` in place of parsed CLI arguments.

    Each constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.

    Parameters
    ----------
    output_folder_path
        Stored as ``self.output_folder_path``.
    output_df_path
        Stored as ``self.output_df_path``; defaults to ``None``.
    drop_bottom_level
        Stored as ``self.drop_bottom_level``; defaults to ``None``.
    """

    def __init__(self, output_folder_path, output_df_path=None,
                 drop_bottom_level=None):
        # Optional settings first, required path last; the assignments are independent.
        self.drop_bottom_level = drop_bottom_level
        self.output_df_path = output_df_path
        self.output_folder_path = output_folder_path
        
# Folder holding the Doccano export for the partially completed phase-2 round.
# Written relative to the notebook's working directory, matching the other
# '../../' paths in this notebook, so the cell is not tied to one user's
# home directory.
args = Args(
    output_folder_path='../../data/phase_2_data/output_from_doccano/output_phase_2_partial_complete',
    # NOTE(review): presumably None means main() does not write the frame to disk —
    # confirm in process_doccano_output.
    output_df_path=None,
    # NOTE(review): exact effect of drop_bottom_level is defined in
    # process_doccano_output.main — confirm there.
    drop_bottom_level=True,
)

# Later cells read df['text'] and treat every column after the first as one annotator.
df = main(args)


In [2]:
# Annotation classes; a label's position in this list is its integer index.
labels = [
    '1-RAPPORT', '2-NEGOTIATE', '3-EMOTION', '4-LOGIC',
    '5-AUTHORITY', '6-SOCIAL', '7-PRESSURE', '8-NO-PERSUASION',
]

# Label name -> 0-based position in ``labels``.
label2idx = dict(zip(labels, range(len(labels))))


In [3]:
# Display the label -> index mapping as a sanity check.
label2idx


{'1-RAPPORT': 0,
 '2-NEGOTIATE': 1,
 '3-EMOTION': 2,
 '4-LOGIC': 3,
 '5-AUTHORITY': 4,
 '6-SOCIAL': 5,
 '7-PRESSURE': 6,
 '8-NO-PERSUASION': 7}

In [8]:
import numpy as np
def one_hot_vector(annotation, label2idx):
    """Encode a '/'-separated annotation string as a multi-hot vector.

    Parameters
    ----------
    annotation : str
        One or more label names joined by '/'. The token '8-NO' is treated
        as an alias for '8-NO-PERSUASION'.
    label2idx : dict
        Maps each label name to its position in the output vector.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(label2idx)`` with 1 at the position of
        every label present in ``annotation`` and 0 elsewhere.

    Raises
    ------
    KeyError
        If a token (after alias resolution) is not a key of ``label2idx``.
    """
    aliases = {'8-NO': '8-NO-PERSUASION'}
    multi_hot = np.zeros(len(label2idx))
    # Resolve every token to its index first, then set all positions in one go.
    positions = [label2idx[aliases.get(token, token)] for token in annotation.split('/')]
    multi_hot[positions] = 1
    return multi_hot


In [9]:
# Column name -> list of multi-hot vectors, one vector per row of ``df``.
# Every column of ``df`` after the first is encoded (each is treated as one
# annotator's labels).
ag = {
    user: [one_hot_vector(label, label2idx) for label in df[user]]
    for user in df.columns[1:]
}


In [10]:
import re 

def clean_text(text):
    """Remove every 'Victim: ' and 'Persuader: ' speaker tag from ``text``.

    The tags are removed wherever they occur, not only at the start of the
    string. 'Victim: ' is removed first, then 'Persuader: '.

    Parameters
    ----------
    text : str
        Utterance text, possibly containing speaker tags.

    Returns
    -------
    str
        ``text`` with all occurrences of both tags deleted.
    """
    for speaker_tag in ('Victim: ', 'Persuader: '):
        text = re.sub(speaker_tag, '', text)
    return text


In [11]:
from torch.nn import Softmax
import torch 

softmax = Softmax(dim=0)
idx2label = {idx : label for label, idx in label2idx.items()}
vectorised = pd.DataFrame(ag)

# Gold label per item: add up the annotators' multi-hot vectors to get per-class
# vote counts, then take the class with the highest (softmaxed) count.
# The whole column is computed before it is attached, so the frame is never
# modified while it is being iterated and every ``row`` holds annotator vectors only.
# NOTE(review): ties are resolved by argmax (the torch docs say the first maximal
# index wins), i.e. towards the lower-numbered label — confirm this is intended.
gold_labels = [
    idx2label[softmax(torch.Tensor(sum(row.values))).argmax().item()]
    for _, row in vectorised.iterrows()
]
vectorised['gold'] = gold_labels

# Item text with the speaker tags stripped.
vectorised['text'] = df['text'].apply(clean_text)

# Same data with 'text' and 'gold' first; later cells rely on annotator columns
# starting at position 2.
vectorised_w_users = vectorised[['text', 'gold'] + [col for col in vectorised.columns if col not in ('gold', 'text')]]



In [12]:
# vectorised.to_csv('../../data/phase_2_data/processed/phase_2_partial_complete.csv', index=False)


In [18]:
def cosine_similarity(vector_a, vector_b):
    """Calculate the cosine similarity between two vectors.

    Parameters
    ----------
    vector_a, vector_b : array-like
        1-D numeric vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned in that case instead of
        ``nan``, so that averages over many pairs stay finite.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if norm_product == 0:
        # Avoid 0/0 (nan plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product


In [40]:
def measure_agreement(df, user_1, user_2):
    """Collect two annotators' label vectors and their mean cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``user_1`` and ``user_2`` columns each hold one numeric
        vector per row.
    user_1, user_2 : str
        Names of the two columns to compare.

    Returns
    -------
    user_a_vectors : list
        All of ``user_1``'s vectors concatenated row by row into one flat list.
    user_b_vectors : list
        The same for ``user_2``.
    avg_cosine_similarity : float
        Mean of the per-row cosine similarities, or ``nan`` when ``df`` has
        no rows.
    """
    total_cosine_similarity = 0
    n_items = 0
    user_a_vectors = []
    user_b_vectors = []

    for vec_1, vec_2 in zip(df[user_1].values, df[user_2].values):
        total_cosine_similarity += cosine_similarity(vec_1, vec_2)
        n_items += 1
        user_a_vectors.extend(vec_1)
        user_b_vectors.extend(vec_2)

    # Average over the rows of the frame that was passed in, not over a
    # notebook-level global, so the function is correct for any ``df``.
    if n_items:
        avg_cosine_similarity = total_cosine_similarity / n_items
    else:
        avg_cosine_similarity = float('nan')

    return user_a_vectors, user_b_vectors, avg_cosine_similarity


In [73]:
import krippendorff
from sklearn.metrics import cohen_kappa_score

def calc_observed_agreement(user_1_vec, user_2_vec):
    """Return the fraction of positions at which the two sequences are equal.

    The sequences are compared element by element; if their lengths differ,
    only the overlapping prefix is considered.

    Parameters
    ----------
    user_1_vec, user_2_vec : iterable
        Flat sequences of comparable values.

    Returns
    -------
    float
        Number of equal pairs divided by the number of pairs compared.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs to compare.
    """
    paired = list(zip(user_1_vec, user_2_vec))
    matches = sum(1 for left, right in paired if left == right)
    return matches / len(paired)


In [85]:
# Pairwise agreement table: a 'metrics' column naming the rows, plus one column
# per unordered annotator pair, keyed '<user_1>-><user_2>'.
user_map = {'metrics': ['kripp_alpha', 'cohen_kappa', 'observed_agreement']}

# Columns from position 2 onwards are compared pairwise.
annotator_columns = vectorised_w_users.columns[2:]

for user_1 in annotator_columns:
    for user_2 in annotator_columns:

        # Skip self-comparisons and pairs already scored in the opposite order.
        if user_2 == user_1 or (user_2 + '->' + user_1) in user_map:
            continue

        pair = user_1 + '->' + user_2

        # Flattened label vectors of both annotators; the mean cosine similarity
        # is returned as well but is not stored in the table.
        user_a_vectors, user_b_vectors, avg_cosine_similarity = measure_agreement(
            vectorised_w_users, user_1, user_2
        )

        # All three metrics are rounded to 4 decimal places.
        kripp_a = round(krippendorff.alpha([user_a_vectors, user_b_vectors]), 4)
        kappa = round(cohen_kappa_score(user_a_vectors, user_b_vectors), 4)
        observed_agreement = round(
            calc_observed_agreement(list(user_a_vectors), list(user_b_vectors)), 4
        )

        user_map[pair] = (kripp_a, kappa, observed_agreement)
                
                


In [86]:
# Write the agreement table (one row per metric, one column per annotator pair)
# to CSV without the row index.
agreement_table = pd.DataFrame(user_map)
agreement_table.to_csv(
    '../../data/phase_2_data/processed/agreement_metrics_phase_2_partial_complete.csv',
    index=False,
)
