In [2]:
# Imports the function for calculating Cohen's kappa score 
from sklearn.metrics import cohen_kappa_score

In [3]:
import pandas as pd
import numpy as np

import re
import csv

Function for processing the file created by prompting Llama3 through the console:

In [4]:
def text_processing(filename, n_categories=11, encoding='utf-16le'):
    ''' 
    Opens a file created by prompting Llama3 through Powershell, finds every
    bracketed span `[...]` that sits on a single line, and parses its
    comma-separated content into one row of numbers.

    `filename`: a textfile with output from prompting Llama3
    `n_categories`: number of columns per row (defaults to 11). Brackets holding
        fewer values are padded with zeros on the right.
    `encoding`: text encoding of the file (defaults to 'utf-16le')

    Returns a float array of shape (number of brackets found, n_categories).

    Raises ValueError if a bracket holds more than `n_categories` values or
    holds something that cannot be read as a number.
    '''
    with open(filename, 'r', encoding=encoding) as infile:
        text = infile.read()

    # Non-greedy match: the shortest text between '[' and the next ']'.
    # '.' does not match newlines, so a bracket spanning two lines is not found.
    bracket_list = re.findall(r'\[(.*?)\]', text)

    # Rows start out as zeros so that short brackets end up zero-padded.
    split_num_values = np.zeros((len(bracket_list), n_categories))
    for i, bracket in enumerate(bracket_list):
        tokens = bracket.split(',')       # splits by comma
        if len(tokens) > n_categories:
            raise ValueError(
                f'Bracket number {i} holds {len(tokens)} values, '
                f'expected at most {n_categories}: [{bracket}]'
            )
        for j, token in enumerate(tokens):
            try:
                split_num_values[i, j] = float(token.strip())
            except ValueError:
                # Re-raise with the offending bracket so the bad line can be located in the file
                raise ValueError(
                    f'Bracket number {i} contains a non-numeric value {token!r}: [{bracket}]'
                ) from None

    return split_num_values

### Creating instances for each file

Reading the previously created pickle file in as a dataframe

In [5]:
# Loads the dataframe stored in 'df.pkl' (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load a 'df.pkl' from a trusted source.
df = pd.read_pickle('df.pkl')

Converting this dataframe to an array to compare with the other values

In [6]:
# Keeps every row, drops the first two columns and the last column, and converts the rest to an array
ground_truth = df.iloc[:, 2:-1].to_numpy()

In [7]:
# The three Llama3 transcripts live in the same folder and differ only by run number,
# so the path is written once as a template; change it here to point at another location.
llama_output_template = r'C:\Users\maria\OneDrive - Universitetet i Oslo\VÅR2024\Textembedding\Drexl\llama_output\output_drexl_{}.txt'
run_1, run_2, run_3 = (text_processing(llama_output_template.format(run_no)) for run_no in (1, 2, 3))

### Cohen's Kappa score for Llama3

Since each response in the Drexl set can belong to more than one category, the labels form a nested (2-D) array. The function `cohen_kappa_score` cannot compute the score of nested arrays directly. The function below therefore computes the score in two different ways: first by line, comparing each response with the corresponding response in the other array (e.g. the ground truth), and then by category.

In [8]:
def nested_cohen(y1, y2):
    ''' 
    Computes Cohen's Kappa Score for a nested array by row and by column(category).
    `y1`: nested array (2-D array or nested list), one row per response
    `y2`: nested array with the same shape as `y1`

    Returns a tuple `(cohen_by_row, cohen_by_column)`: one score per row and
    one score per column.

    Raises ValueError if the inputs are not 2-D or do not have the same shape.
    '''
    # Nested lists have no `[:, i]` indexing, so both inputs are turned into arrays first
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if y1.ndim != 2 or y1.shape != y2.shape:
        raise ValueError(
            f'Expected two 2-D arrays of equal shape, got shapes {y1.shape} and {y2.shape}'
        )

    # by row
    # NOTE(review): a row where both inputs use one and the same label throughout
    # may give an undefined (NaN) score - confirm against the scikit-learn documentation.
    cohen_by_row = np.zeros(y1.shape[0])
    for i in range(y1.shape[0]):
        cohen_by_row[i] = cohen_kappa_score(y1[i], y2[i])

    # by column/category
    cohen_by_column = np.zeros(y1.shape[1])
    for i in range(y1.shape[1]):
        cohen_by_column[i] = cohen_kappa_score(y1[:, i], y2[:, i])

    return cohen_by_row, cohen_by_column

We create a function to turn the output from the previous function into a dictionary containing values such as the mean, max and min

In [9]:
def data_to_dict(y1, y2):
    '''
    Summarises the Cohen's Kappa scores between two nested arrays in a dictionary.
    The by-line scores are reduced to mean, max, min, standard deviation and
    quartiles; the by-category scores are kept as an array together with their mean.
    `y1`: nested array
    `y2`: nested array
    '''
    kappa_by_line, kappa_by_category = nested_cohen(y1, y2)

    summary = {}
    summary['Mean by line'] = np.mean(kappa_by_line)
    summary['Max'] = np.max(kappa_by_line)
    summary['Min'] = np.min(kappa_by_line)
    summary['Standard deviation'] = np.std(kappa_by_line)

    # quartiles of the by-line scores
    summary['First quantile'] = np.percentile(kappa_by_line, 25)
    summary['Median'] = np.percentile(kappa_by_line, 50)
    summary['Third quantile'] = np.percentile(kappa_by_line, 75)

    summary['Cohen\'s Kappa by category'] = kappa_by_category
    summary['Mean by category'] = np.mean(kappa_by_category)

    return summary

We run this function for each comparison we wish to make:

In [10]:
run_list = [ground_truth, run_1, run_2, run_3]
run_name = ['Ground truth', 'Run 1', 'Run 2', 'Run 3']

# One dictionary per unordered pair of runs. The description of the compared
# runs comes first, followed by the summary values from data_to_dict.
dict_list = [
    {
        'Compared runs': f'{run_name[first]} and {run_name[second]}',
        **data_to_dict(run_list[first], run_list[second]),
    }
    for first in range(len(run_list))
    for second in range(first + 1, len(run_list))
]

We then want to write this list of dictionaries to a csv-file:

In [11]:
# field names
fields = ['Compared runs', 'Mean by line', 'Max', 'Min', 'Standard deviation', 'First quantile',
          'Median', 'Third quantile', 'Cohen\'s Kappa by category', 'Mean by category']

filename = "IRR_results.csv"

with open(filename, 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)         # creating a csv dict writer object
    
    writer.writeheader()                                        # writing headers (field names)
    writer.writerows(dict_list)                                 # writing data rows

### Cohen's Kappa Score for Centroid Method

In [12]:
# Loads the array stored in 'scores.npy' (path is relative to the working directory).
# presumably the centroid-method scores, with the same shape as ground_truth - TODO confirm
embed_scores = np.load('scores.npy')

In [13]:
# Kappa summary of the loaded scores against the ground truth (displayed as the cell output).
# NOTE(review): the recorded output shows a negative kappa for every category and a negative
# mean by line, i.e. agreement below chance - confirm that 'scores.npy' uses the same label
# encoding and column order as ground_truth.
data_to_dict(ground_truth, embed_scores)

{'Mean by line': -0.26860878797408866,
 'Max': 0.6451612903225807,
 'Min': -0.9836065573770489,
 'Standard deviation': 0.20448113464279336,
 'First quantile': -0.3943661971830985,
 'Median': -0.22222222222222232,
 'Third quantile': -0.17857142857142838,
 "Cohen's Kappa by category": array([-0.00994427, -0.15003322, -0.09740081, -0.19130821, -0.30168801,
        -0.12512907, -0.19146644, -0.09520236, -0.27550462, -0.09514134,
        -0.27660371]),
 'Mean by category': -0.1644929128806291}