In this notebook an LLM is used to detect emotions in text. Several parameters can be changed between runs; they are set in the cell below.

In [None]:
# Name of the text file the results are written to. The file is opened in append mode,
# so rename it before running the entire notebook again to keep the runs separate.
testFile = "oneShotTestAggregateLargeB.txt"
# Controls how often the method is run. As LLMs give different results each time, the
# method is run multiple times to get more robust results.
# The run loops use range(1, methodRuns), so setting it to n means n-1 runs (11 -> 10 runs).
methodRuns = 11
# Threshold for aggregating the labels of the annotators.
# A threshold of n means at least n different annotators chose this label for this
# example of text.
annotatorThreshold = 2

Before the LLM is called, some data has to be prepared.
First we read the CSV file with the unannotated data into a dataframe.
Next we create a string with the appropriate labels.
Lastly we create a dataframe from the annotated examples to test the accuracy with. This dataframe needs to become a NumPy array. The annotated data is aggregated by counting, per label, how many annotators chose it; a label counts as true when that count reaches `annotatorThreshold`.

In [None]:
import ollama
import csv
import pandas as pd
import numpy

# Read the unannotated data file into a dataframe.
unannotatedDataFrame = pd.read_csv('notAnnotatedData.csv', sep=';', header=0)

# Semicolon-separated string with all the labels the LLM may choose from.
# "approval" deliberately carries a leading double quote: predictions are later matched
# by substring search, and a bare "approval" would also be counted for "disapproval".
# Labels must be spelled the way the model is expected to return them ("surprise"),
# otherwise the substring search never finds them.
labelsToAnnotate: str = "admiration;amusement;anger;annoyance;\"approval;caring;confusion;curiosity;desire;disappointment;disapproval;disgust;embarrassment;excitement;fear;gratitude;grief;joy;love;nervousness;optimism;pride;realization;relief;remorse;sadness;surprise;neutral"

# Five different annotators annotated the data; their files have to be aggregated.
# Files 1-3 are read with ';' as separator, files 4 and 5 with ','.
annotatorFiles = [
    ('annotatedData/Emotion-1.csv', ';'),
    ('annotatedData/Emotion-2.csv', ';'),
    ('annotatedData/Emotion-3.csv', ';'),
    ('annotatedData/Emotion-4.csv', ','),
    ('annotatedData/Emotion-5.csv', ','),
]
# Columns that are not label columns. They are dropped because adding the
# dataframes together does not work properly otherwise.
dropList = ["id", "question_id", "participant_id", "frisian", "dutch", "english"]

# Sum the label columns over all annotators. fill_value=0 treats a value that is
# missing for one annotator as "label not chosen" instead of propagating NaN.
annotatedDataFrame = None
for annotatorPath, annotatorSep in annotatorFiles:
    annotatorLabels = pd.read_csv(annotatorPath, sep=annotatorSep, header=0).drop(dropList, axis=1)
    if annotatedDataFrame is None:
        annotatedDataFrame = annotatorLabels
    else:
        annotatedDataFrame = annotatedDataFrame.add(annotatorLabels, fill_value=0)

# Replace the remaining NaN with 0; this is needed in order to calculate the
# F1 score, precision and recall later.
annotatedDataFrameNoNaN = annotatedDataFrame.fillna(0)

# Transform the annotated data to a numpy array to do the evaluations with.
numAnnotatedSummed = annotatedDataFrameNoNaN.to_numpy()
# A label is considered "true" when at least annotatorThreshold annotators chose it.
numAnnotated = numpy.where(numAnnotatedSummed < annotatorThreshold, 0, 1)

# Rows 0 till 49 are annotated. The print threshold is raised so the array is
# printed in full instead of being summarised.
numpy.set_printoptions(threshold=30000)
print(numAnnotated)

Next we run the one-shot method on the dataset.

In [None]:
from tqdm import tqdm
# Register tqdm with pandas; the run loop in this cell calls progress_apply
# to show the progress of the LLM calls per row.
tqdm.pandas()

def oneShot(data) -> pd.Series:
    """One-shot method for detecting emotions: a single example is given to the LLM.

    The conversation sent to the model consists of a system instruction, one
    worked example (a user text plus the assistant's answer) and finally the
    text that has to be annotated.

    Args:
        data: Row of the dataframe being annotated; only ``data.english`` is read.

    Returns:
        A one-element ``pd.Series`` holding the raw message content returned by
        the model. The calling cell names the resulting single column "labels".
    """
    systemMessage = {
        'role': 'system',
        'content': """  Your task is to identify one to three key emotions in the text using the given labels. 
                        If no emotion is detected, use the label neutral.
                        Answer with the identified emotions in JSON format, without explanation.
                   """
    }
    # The worked example: a text together with the labels the model should answer with.
    exampleQuestion = {
        'role': 'user',
        'content': """  Text: I would like to see a large field of solar panels, worn by the inhabitants themselves, paid for by the inhabitants themselves, the moment a resident leaves for the area he should be able to sell his part back, new residents should also be able to buy a part. In this way, you as a municipality become self-sufficient while it is affordable for the vast majority of residents. Residents who would not be able to participate financially, must be able to use the solar energy, as they currently purchase electricity from their supplier.
                        \n Labels: """ + labelsToAnnotate
    }
    # NOTE(review): this example answer is plain text although the system message
    # asks for JSON -- confirm this is intended.
    exampleAnswer = {
        'role': 'assistant',
        'content': "Emotions: optimism, desire, caring"
    }
    # The actual text to annotate, in the same layout as the example question.
    question = {
        'role': 'user',
        'content': "Text: " + data.english + "\n Labels: " + labelsToAnnotate
    }

    response = ollama.chat(
        model='llama3',
        format='json',
        messages=[systemMessage, exampleQuestion, exampleAnswer, question],
    )
    # The response is a mapping of: model, created at, message, done, total duration,
    # load duration, prompt eval duration, eval count and eval duration.
    # Only the content of the message is needed.
    return pd.Series(response['message']['content'])

# Row 4 is left out: its text is the worked example inside the prompt.
# That example got clear labels from the annotators and gives a good overall
# understanding of the different emotions.
test = unannotatedDataFrame.iloc[:50]
smallTest = test.drop(index=[4])

# progress_apply calls oneShot once per row (axis=1) and shows a progress bar,
# which is useful because every LLM call takes some time. oneShot itself reads
# only the english text of the row.

# For more robust results the method is run several times and the resulting
# labels are aggregated afterwards. The number of runs is controlled by methodRuns.
runs = {}
for runNumber in range(1, methodRuns):
    runFrame = smallTest.progress_apply(oneShot, axis=1)
    # the single result column holds the raw answer of the LLM
    runFrame.columns = ["labels"]
    runs[f"response{runNumber}"] = runFrame
print(runs["response1"])

As we use apply, we get the response in a DataFrame format. We want to turn this into the same format as our annotated dataframe.

In [None]:
# Build, for every run, a dataframe in the same format as the annotated labels:
# one 0/1 column per label.
labelsList = labelsToAnnotate.split(";")


def _labelFlag(answer, label):
    """Return 1 when label occurs in the LLM answer, 0 otherwise.

    The LLM sometimes lists an emotion followed by :null; such a label is
    filtered out and counts as 0. Other wrong formats can be filtered here too.
    """
    if label + "\":null" in answer:
        return 0
    if label in answer:
        return 1
    return 0


for runNumber in range(1, methodRuns):
    runFrame = runs[f"response{runNumber}"]
    # The LLM sometimes returns labels with uppercase letters; normalise to lowercase.
    runFrame["labels"] = runFrame["labels"].map(str.lower)
    # Add one column per label, named after the label, holding the 0/1 flag per row.
    for label in labelsList:
        runFrame[label] = runFrame["labels"].map(
            lambda answer, label=label: _labelFlag(answer, label)
        )

# This prints the results of the first run
print(runs["response1"])

The labels column has to be dropped from the created dataframe, which then has the same format as our annotated example.
After the column is dropped we transform the dataframe into a NumPy array.

In [None]:
# Transform the result data to numpy arrays to do the evaluation on.
frameResults = {}
numResults = {}

# For each method run
for n in range(1, methodRuns):
    runKey = "response" + str(n)
    # Drop the columns that are not 0/1 label columns: the raw LLM answer ("labels")
    # and the "" column, which only exists when the label string contains an empty
    # entry (e.g. a trailing ";"). errors="ignore" keeps the drop from raising
    # KeyError when that "" column is absent.
    frameResults[runKey] = runs[runKey].drop(columns=["labels", ""], errors="ignore")
    # then we turn the dataframe into a numpy array
    numResults["results" + str(n)] = frameResults[runKey].to_numpy()
# This prints the cleaned up numpy array results of the first method run
print(numResults["results1"])

# Now we create the results file and write the per-run results to it.
# !! rename the file when new results are created !!
# The file is opened in append mode and is intentionally left open here: the
# evaluation cell writes the aggregated scores to it and closes it.
# The encoding is set explicitly because the LLM answers may contain non-ASCII text.
f = open(testFile, "a", encoding="utf-8")
for n in range(1, methodRuns):
    f.write("Results " + str(n) + "\n")
    f.write(runs["response" + str(n)].to_string() + "\n")
    f.write(numpy.array_str(numResults["results" + str(n)]) + "\n")
    

Finally, we want to evaluate the results.

This is done by calculating the F1 score, recall and precision. 
As we do not have a fully annotated data set, the score will be calculated not over the entire dataset, but only the annotated part.
Also, as the LLM was run multiple times, those results must be aggregated.

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# The try/finally makes sure the results file opened in the previous cell is
# closed even when one of the steps below raises.
try:
    # The evaluation metrics are calculated over the annotated rows only.
    # The LLM is run on rows 0 till 49, excluding row 4.
    # As row 4 is used to give the example, it is removed from the gold data as well.
    numOnlyAnno = numpy.delete(numAnnotated[0:50], 4, axis=0)

    # Aggregate the results of the different method runs by adding all the numpy
    # arrays together: each cell then holds the number of runs that gave that label.
    sumResults = numResults["results1"]
    for n in range(2, methodRuns):
        sumResults = numpy.add(sumResults, numResults["results" + str(n)])
    print(sumResults)
    # Every header entry is written on its own line.
    f.write("Aggregated results" + "\n" + numpy.array_str(sumResults))
    f.write("\nAnnotator threshold: " + str(annotatorThreshold))
    # The run loops use range(1, methodRuns), so methodRuns - 1 runs were performed.
    f.write("\nNumber of runs: " + str(methodRuns - 1))

    # Once the runs are summed, a threshold is applied. The threshold is the number of
    # runs that need to have given a label for a piece of text for it to be considered.
    # Every possible threshold (1 up to the number of runs) is evaluated.
    for n in range(1, methodRuns):
        sumThreshResult = numpy.where(sumResults < n, 0, 1)
        print("Threshold: " + str(n))
        f.write("\nThreshold: " + str(n))
        # Now that all of the data is properly prepared, the micro F1 score,
        # recall and precision are calculated.
        f1Score = f1_score(numOnlyAnno, sumThreshResult, average="micro")
        recall = recall_score(numOnlyAnno, sumThreshResult, average="micro")
        precision = precision_score(numOnlyAnno, sumThreshResult, average="micro")
        print("micro f1 score: " + str(f1Score))
        print("recall: " + str(recall))
        print("precision: " + str(precision))
        f.write("\nMicro F1 score: " + str(f1Score))
        f.write("\nMicro recall: " + str(recall))
        f.write("\nMicro precision: " + str(precision))
    # The file is opened in append mode; end with a newline so a later run
    # starts on a fresh line.
    f.write("\n")
finally:
    f.close()
