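This notebook combines multiple runs of the same prompting method. Save each run as a CSV file named `run<N>.csv` (`run1.csv`, `run2.csv`, …) in the folder of its method, for example `runs/zeroShotRuns/`, then run the code.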

In [None]:
# Combine the runs of one prompting method (the method is selected with `current` below)
import csv
import sys

import numpy
import ollama
import pandas as pd

# Exclusive upper bound of the run numbers: range(1, methodRuns) visits runs 1..10
methodRuns = 11
# Path prefix per prompting method; run n of a method is read from <prefix><n>.csv
zeroshot = 'runs/zeroShotRuns/run'
oneshot = 'runs/oneShotRuns/run'
fewshot = 'runs/fewShotRuns/run'
zeroShotCoT = 'runs/zeroShotCoTRuns/run'
oneShotCoT = 'runs/oneShotCoTRuns/run'
fewShotCoT = 'runs/fewShotCoTRuns/run'

# Prefix of the method whose runs are combined; assign another prefix to switch method
current = zeroshot

# this part reads each csv to a dataframe and ensures only the labels are copied
# runs maps "response<n>" to a dataframe with the single column "labels"
runs = {}
for n in range(1, methodRuns):
    csvPath = current + str(n) + '.csv'
    # '{' is used as separator, so each line is cut into the part before and after the '{'
    res = pd.read_csv(csvPath, sep='{', header=None)
    res.columns = ["rows", "text"]
    # Keep the text in front of the first '}'. partition() always returns three columns,
    # so this also works when a response holds no '}' or more than one
    runs["response" + str(n)] = res["text"].str.partition("}")[0].to_frame("labels")

display(runs["response2"])

In [None]:
# From the response we create a dataframe in the same format as our annotated labels
# First we need a list of labels
# NOTE(review): "approval" carries a leading double quote. This appears deliberate: labels are
# matched as substrings below and a bare "approval" would also match inside "disapproval".
# It relies on the LLM quoting its labels and names the column '"approval' - confirm.
# NOTE(review): "suprise" is spelled this way; a response spelled "surprise" does not contain
# it and would never match - confirm the prompt uses the same spelling.
labelsToAnnotate: str = "admiration;amusement;anger;annoyance;\"approval;caring;confusion;curiosity;desire;disappointment;disapproval;disgust;embarrassment;excitement;fear;gratitude;grief;joy;love;nervousness;optimism;pride;realization;relief;remorse;sadness;suprise;neutral"
labelsList = labelsToAnnotate.split(";")

# Text that, directly after a label, means the LLM listed the label but marked it as absent
absentSuffixes = ("\": 0", "\":0", "\": false")


def labelIsPresent(text, label):
    """Return 1 if ``label`` counts as present in the lowercased response ``text``, else 0.

    A label followed by one of ``absentSuffixes`` is treated as absent. The suffix has to
    be followed by a comma or be the end of the response (trailing whitespace ignored);
    the end-of-response case covers the last entry, which has no trailing comma.
    Any other occurrence of ``label`` as a substring counts as present.
    """
    stripped = text.rstrip()
    for suffix in absentSuffixes:
        negated = label + suffix
        if negated + "," in text or stripped.endswith(negated):
            return 0
    return 1 if label in text else 0


# Then for each method run
for n in range(1, methodRuns):
    response = runs["response" + str(n)]
    # Sometimes the LLM returns the labels with uppercases, these should be changed to lowercase
    response["labels"] = response["labels"].str.lower()
    # We go over each of these labels and add a 0/1 column named after the label
    for label in labelsList:
        response[label] = response["labels"].apply(labelIsPresent, args=(label,))

# Turn the encoded dataframe of every run into a numpy array for the evaluation
frameResults = {}
numResults = {}

# For each method run
for n in range(1, methodRuns):
    runKey = "response" + str(n)
    # The raw LLM answer ("labels") is not needed any more, only the per-label columns stay
    frameResults[runKey] = runs[runKey].drop(columns=['labels'])
    # The array of run n is stored under "results<n>"
    numResults["results" + str(n)] = frameResults[runKey].to_numpy()

# Element-wise sum of the arrays of all runs
sumResults = numResults["results1"]
for n in range(2, methodRuns):
    sumResults = numpy.add(sumResults, numResults["results" + str(n)])

# this printed array can be copied over to the resultEvaluation notebook
# By default numpy abbreviates arrays of more than 1000 elements with "...", which would
# make the output useless for copying, so summarisation is switched off for this print
with numpy.printoptions(threshold=sys.maxsize):
    print(repr(sumResults))

To calculate the mean and standard deviation across the runs (for the precision and for the number of total, correct and incorrect labels), use the following code.

In [None]:
import numpy as np
import pandas as pd
import csv
from statsmodels.stats import inter_rater as irr
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# The predictions are evaluated against the annotated data, so that data is loaded first.
# Files 1-3 are read with ';' as separator, files 4-5 with ','.
annotated1 = pd.read_csv('annotatedData/Emotion-1.csv', header=0, sep=';')
annotated2 = pd.read_csv('annotatedData/Emotion-2.csv', header=0, sep=';')
annotated3 = pd.read_csv('annotatedData/Emotion-3.csv', header=0, sep=';')
annotated4 = pd.read_csv('annotatedData/Emotion-4.csv', header=0, sep=',')
annotated5 = pd.read_csv('annotatedData/Emotion-5.csv', header=0, sep=',')
# Only the label columns are of interest; identifier and text columns are removed
dropList = ["id", "question_id", "participant_id", "frisian", "dutch", "english"]
annotated1Clean = annotated1.drop(columns=dropList)
annotated2Clean = annotated2.drop(columns=dropList)
annotated3Clean = annotated3.drop(columns=dropList)
annotated4Clean = annotated4.drop(columns=dropList)
annotated5Clean = annotated5.drop(columns=dropList)
# display(annotated1Clean)
# Add the five frames together; a value that is missing on one side counts as 0
annotatedDataFrame12 = annotated1Clean.add(annotated2Clean, fill_value=0)
annotatedDataFrame34 = annotated3Clean.add(annotated4Clean, fill_value=0)
annotatedDataFrame345 = annotatedDataFrame34.add(annotated5Clean, fill_value=0)
annotatedDataFrame = annotatedDataFrame345.add(annotatedDataFrame12, fill_value=0)
# Values that are still NaN after summing become 0
annotatedDataFrameNoNaN = annotatedDataFrame.fillna(0)
# The first 50 rows are converted to a numpy array for the evaluation
numAnnotatedSummed = annotatedDataFrameNoNaN.iloc[0:50].to_numpy()
# Threshold: a summed value of at least 1 makes the label "true" (1), anything lower 0
numAnnotated = np.where(numAnnotatedSummed < 1, 0, 1)
# print(numAnnotated)
labelsToAnnotate: str = "admiration;amusement;anger;annoyance;approval;caring;confusion;curiosity;desire;disappointment;disapproval;disgust;embarrassment;excitement;fear;gratitude;grief;joy;love;nervousness;optimism;pride;realization;relief;remorse;sadness;suprise;neutral"
labelsList = labelsToAnnotate.split(";")
# First the methods needed for the evaluation
# This method can be applied to a dataframe to get the actual labels instead of just numbers
def getLabels(data):
    """Return the names of all labels whose value in ``data`` is at least 1.

    ``data`` is looked up by label name (``data[label]``). The names are concatenated
    in the fixed label order and every name is preceded by ", ", so a non-empty result
    starts with ", ". An empty string is returned when no label qualifies.
    """
    labelNames = "admiration;amusement;anger;annoyance;approval;caring;confusion;curiosity;desire;disappointment;disapproval;disgust;embarrassment;excitement;fear;gratitude;grief;joy;love;nervousness;optimism;pride;realization;relief;remorse;sadness;suprise;neutral".split(";")
    return "".join(", " + name for name in labelNames if data[name] >= 1)
def getNumLabels(resultArray, n):
    """Count the entries of ``resultArray`` that are greater than or equal to ``n``."""
    reachesThreshold = resultArray >= n
    return np.count_nonzero(reachesThreshold)
def calculateAgreement(resultArray):
    """Return Fleiss' kappa for ``resultArray`` as computed by statsmodels.

    The array is first converted with ``irr.aggregate_raters``; only the first element
    of the returned (data, categories) tuple is passed on to ``irr.fleiss_kappa``.
    """
    # NOTE(review): presumably rows are subjects and columns are raters - confirm with callers
    countTable, _categories = irr.aggregate_raters(resultArray)
    return irr.fleiss_kappa(countTable, method='fleiss')
def calcF1RecallPrecision(annoArray, predictArray):
    """Return the micro-averaged scores of ``predictArray`` against ``annoArray``.

    The result is a one-dimensional float array of length 3 holding, in this order,
    the F1 score, the recall and the precision.
    """
    scorers = (f1_score, recall_score, precision_score)
    scores = [scorer(annoArray, predictArray, average="micro") for scorer in scorers]
    return np.array(scores, dtype=float)
def calcPrecision(annoArray, predictArray):
    """Return the micro-averaged precision of ``predictArray`` against ``annoArray``."""
    microPrecision = precision_score(annoArray, predictArray, average="micro")
    return microPrecision

In [None]:
import statistics
# for zeroshot, use this
numAnno = numAnnotated
# for oneshot you should use this, as row 4 is used as an example
# numAnno = np.delete(numAnnotated, 4, axis=0)
# for fewshot you should use this, as row 4,6,21 are used as an example
# numAnno = np.delete(numAnnotated, [4, 6, 21], axis=0)
# numResults holds one array per run ("results1", "results2", ...); the number of runs
# is taken from it instead of being fixed at 10
runCount = len(numResults)
# One row per run with the columns F1, recall, precision
evalTotal = np.empty(shape=(runCount, 3))
precLabels = []
numLabels = []
for n in range(1, runCount + 1):
    runResult = numResults["results" + str(n)]
    # for each run calculate the f1, recall and precision
    evalTotal[n-1] = calcF1RecallPrecision(numAnno, runResult)
    # also calculate the number of labels
    numLabels.append(getNumLabels(runResult, 1))
    # also the precision
    precLabels.append(calcPrecision(numAnno, runResult))
# With the precision and total labels the correct and incorrect labels can be calculated
correctLabels = np.multiply(precLabels, numLabels)
incorrectLabels = np.subtract(numLabels, correctLabels)
# np.std is called with its defaults, i.e. it returns the population standard deviation (ddof=0)
precMean = statistics.mean(precLabels)
precDev = np.std(precLabels)
# this prints the precision per run, followed by its mean and standard deviation
print(precLabels)
print(precMean)
print(precDev)
# Next is calculating the mean and the standard deviation
meanLabels = statistics.mean(numLabels)
stDev = np.std(numLabels)
meanLabelsCor = statistics.mean(correctLabels)
stDevCor = np.std(correctLabels)
meanLabelsIncor = statistics.mean(incorrectLabels)
stDevinCor = np.std(incorrectLabels)
# Now print the total amount of labels
print(numLabels)
print(meanLabels)
print(stDev)
# And the correct labels
print(correctLabels)
print(meanLabelsCor)
print(stDevCor)
# and the incorrect labels
print(incorrectLabels)
print(meanLabelsIncor)
print(stDevinCor)