# Config

In [13]:
# Input CSV loaded by the main loop (pd.read_csv) and output CSV written by the final export cell.
import_location = 'datasets/goemotions/ekman_6_single_label/poc_subset/goemotions_6_single_poc.csv'
export_location = 'datasets/goemotions/ekman_6_single_label/poc_subset/ai_labeled/goemotions_6_single_poc_ai_labeled.csv'
# When False, the main loop skips any model whose label column is already present in the loaded CSV.
overwrite_previous_labels = True

# Keys the main loop iterates over: `datasets` indexes dataset_config, `ai_models` indexes gen_ai_config.model.
datasets = ['GoEmotions POC']
ai_models = ['Llama3.1 8B instruct-q8', 'DeepSeek-R1 14B'] # 'Llama3.1 8B instruct-q8', 'DeepSeek-R1 14B', 'GPT 4o-mini', 'o3-mini'

# NOTE(review): remove_others, sample_size and random_seed are not referenced by any other
# cell visible in this notebook -- confirm whether they are still needed.
remove_others = True

sample_size = 50
random_seed = 418   #   I'm a teacup

from config_files.dataset_config import dataset as dataset_config
from config_files import gen_ai_config

In [14]:
import os
import re
import time

import ollama
import pandas as pd
from ollama import ResponseError
from openai import OpenAI

# Label Generation Functions

### Build Label Prompt

In [15]:
def build_label_prompt(dataset_id, text, event = None):
    """Build the labelling prompt for a single record.

    The templates are read from ``config_files.prompts.prompt[dataset_id]``.
    Without an event the ``'labels'`` template is used; with a truthy event the
    ``'labels w/ context'`` template is used and its ``<context>`` placeholder is
    filled with ``prompt[dataset_id]['context'][event]``. In both cases the
    ``<text>`` placeholder is replaced by ``text``.

    Args:
        dataset_id: Key of the dataset in ``prompts.prompt``.
        text: Record text substituted for ``<text>``.
        event: Optional key into the dataset's ``'context'`` mapping.

    Returns:
        The fully substituted prompt string.

    Raises:
        KeyError: If the dataset, a template, or the event is not configured.
    """
    from config_files import prompts

    templates = prompts.prompt[dataset_id]

    if event:
        template = templates['labels w/ context'].replace('<context>', templates['context'][event])
    else:
        template = templates['labels']

    # The record text is substituted last so that a literal "<context>" inside
    # the (untrusted) text is never expanded. Previously the context branch
    # never substituted the text at all.
    return template.replace('<text>', text)

### Generate AI Label

In [16]:
def generate_synthetic_label(dataset_details, genAI_details, label_prompt):
    """Send ``label_prompt`` to the configured model and return its raw reply.

    Args:
        dataset_details: Dataset configuration; ``"num_labelers"`` is used as the
            number of completions requested from OpenAI.
        genAI_details: Model configuration with ``"platform"`` ("Ollama" or
            "OpenAI") and ``"id"`` (model identifier passed to the client).
        label_prompt: Prompt text sent as a single user message.

    Returns:
        For "Ollama": the reply text (``str``) of one chat call.
        For "OpenAI": a ``list`` with the text of every returned choice.

    Raises:
        ValueError: If the platform is neither "Ollama" nor "OpenAI"
            (previously this case silently returned ``None``).
    """
    platform = genAI_details["platform"]
    messages = [{"role": "user", "content": label_prompt}]

    if platform == "Ollama":
        response = ollama.chat(model=genAI_details["id"], messages=messages)
        return response["message"]["content"]

    if platform == "OpenAI":
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
        response = client.chat.completions.create(
            model=genAI_details["id"],
            messages=messages,
            n=dataset_details["num_labelers"],
        )
        return [choice.message.content for choice in response.choices]

    raise ValueError(f"Unsupported generative AI platform: {platform!r}")

### Parse Label Response

In [17]:
def parse_label_response(response, dataset_details, cot_terminator = None):
    """Extract a label from a model's free-text reply.

    Args:
        response: Raw reply text from the model.
        dataset_details: Dataset configuration; ``"label_list"`` holds the
            candidate labels in order.
        cot_terminator: Optional marker that ends the reasoning section of a
            Chain-of-Thought model. When given and present in ``response``, only
            the text after its first occurrence is searched.

    Returns:
        The first label of ``label_list`` whose name occurs (case-insensitively)
        in the reply; otherwise the label whose 1-based position appears as a
        standalone number in the reply; otherwise ``None`` (after printing the
        unparsed reply).
    """
    if cot_terminator:      #   Remove the reasoning layer text for Chain-of-Thought models
        terminator_pos = response.find(cot_terminator)
        # find() returns -1 when the marker is missing; slicing from -1 would
        # discard everything except the last character, so only cut when found.
        if terminator_pos != -1:
            response = response[terminator_pos + len(cot_terminator):]

    lowered_response = response.lower()
    for label in dataset_details["label_list"]:
        if label.lower() in lowered_response:
            return label

    # Label name not found, look for ID number. The number must not be part of
    # a longer number, so that e.g. "10" is not read as label 1.
    for position, label in enumerate(dataset_details["label_list"], start=1):
        if re.search(rf"(?<!\d){position}(?!\d)", response):
            return label

    # Label not found
    CRED = '\33[91m'
    CEND = '\33[0m'
    print(f"{CRED}NO LABEL FOUND:{CEND} {response}")
    return None

### Label Record

In [18]:
def get_label(dataset_details, gen_ai_details, text, event=None):
    """Label one record with a model and derive a consensus label.

    Builds the prompt, collects ``num_labelers`` replies (one call per labeler
    for Ollama, a single multi-choice call for OpenAI), parses each reply into a
    label and prints the labels as they arrive.

    Args:
        dataset_details: Dataset configuration (``'id'``, ``'num_labelers'``,
            ``'label_list'``, ``'label_format'``, ``'num_consensus'``).
        gen_ai_details: Model configuration (``'platform'``, ``'id'`` and, for
            Ollama models, ``'terminator'``).
        text: Record text to label.
        event: Optional event key forwarded to the prompt builder.

    Returns:
        ``(consensus_label, labels, responses)``. ``consensus_label`` is a label
        that at least ``num_consensus`` labelers agreed on, or ``None``;
        ``labels`` are the parsed labels (``None`` where parsing failed);
        ``responses`` are the raw replies.
    """
    label_prompt = build_label_prompt(dataset_details['id'], text, event)

    labels = []
    responses = []
    platform = gen_ai_details['platform']

    def report_latest_label():
        # Print the most recent label, comma-separated from the previous ones.
        separator = ", " if len(labels) > 1 else ""
        print(f"{separator}{labels[-1]}", end="")

    print("Labels: ", end="")
    if platform == "Ollama":
        for _ in range(dataset_details['num_labelers']):
            label_response = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
            responses.append(label_response)
            labels.append(parse_label_response(label_response, dataset_details, gen_ai_details['terminator']))
            report_latest_label()

    elif platform == "OpenAI":
        responses = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
        for response in responses:
            labels.append(parse_label_response(response, dataset_details))
            report_latest_label()

    consensus_label = None

    if dataset_details["label_format"] == "single":
        # Single label dataset. If several labels reach the threshold, the one
        # latest in label_list wins (unchanged from the original behaviour).
        for potential_label in dataset_details["label_list"]:
            if labels.count(potential_label) >= dataset_details["num_consensus"]:
                consensus_label = potential_label
                print(f"\n\tConsensus: {consensus_label}")

    elif dataset_details["label_format"] == "multi":
        # To be implemented if used with any multilabel datasets
        pass

    if consensus_label is None:
        # Terminate the "Labels: ..." line; otherwise the next record's output
        # is glued onto this one.
        print()

    return consensus_label, labels, responses

# Main Loop

In [19]:
# Label every record of every configured dataset with every configured model.
# Labels go into one column per model (named after the model id); the raw model
# replies for each record are written to a text file under
# <export directory>/ai_responses/<model name>/.

# Consecutive ResponseError retries per record before the error is re-raised.
MAX_LABEL_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 5

for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_dataset = pd.read_csv(import_location)               #   Load dataset

    # The event is only forwarded when the dataset provides it (the original
    # tested for a 'context' column but then read row.event).
    has_event = 'event' in df_dataset.columns
    responses_root = os.path.join(os.path.dirname(export_location), 'ai_responses')

    for ai_model in ai_models:
        ai_details = gen_ai_config.model[ai_model]          #   Get model info
        label_column = ai_details['id']

        if not overwrite_previous_labels and label_column in df_dataset.columns:
            continue

        # Plain assignment creates the column or clears an existing one;
        # DataFrame.insert raises ValueError when the column already exists.
        df_dataset[label_column] = ''

        response_dir = os.path.join(responses_root, ai_model)
        os.makedirs(response_dir, exist_ok=True)

        for index, row in df_dataset.iterrows():
            event = row['event'] if has_event else None
            response_path = os.path.join(response_dir, f"{dataset_details['id']}_{index}.txt")

            for attempt in range(1, MAX_LABEL_ATTEMPTS + 1):
                try:
                    label, labels, responses = get_label(dataset_details, ai_details, row['text'], event)
                except ResponseError as error:  #   Recover from a CUDA illegal memory access error
                    if attempt == MAX_LABEL_ATTEMPTS:
                        raise   # persistent failure: surface it instead of looping forever
                    print(f"\nResponseError on record {index} (attempt {attempt}/{MAX_LABEL_ATTEMPTS}): {error}")
                    time.sleep(RETRY_DELAY_SECONDS)
                else:
                    break

            df_dataset.at[index, label_column] = label
            with open(response_path, 'w', encoding="utf-8") as file:
                for line in responses:
                    file.write(f"{line}\n")

        display(df_dataset)


Labels: joy, joy, joy
	Consensus: joy
Labels: [91mNO LABEL FOUND:[0m Based on the provided text and emotion definitions, I would identify the emotion expressed by the writer as:

**Gratitude**

The writer expresses appreciation and thankfulness towards the person who provided the information (i.e., "Thanks for that"). This indicates a positive emotional tone, which aligns with the definition of gratitude.
None, joy[91mNO LABEL FOUND:[0m The emotion most expressed by the writer of this text is gratitude.

Specifically, the writer is expressing approval and appreciation (gratitude) towards the original poster's response, using phrases such as "I'm gonna save your answer" and "Thanks for that". This suggests a positive emotional tone, indicating that the writer values the information provided and is thankful for it.
, NoneLabels: anger, anger, anger
	Consensus: anger
Labels: joy, joy, anger
	Consensus: joy
Labels: [91mNO LABEL FOUND:[0m Based on the provided definition and examples 

KeyboardInterrupt: 

# Testing

# Smoke test: print the parsed labels DeepSeek-R1 produces for every record of
# each dataset's own CSV (dataset_details['relpath']).
# Uses its own names (df_test, test_*) so that running this cell does not
# replace the df_dataset produced by the main loop, which the export cell writes.
for dataset in datasets:
    test_details = dataset_config[dataset]                  #   Get dataset info
    df_test = pd.read_csv(test_details['relpath'])          #   Load dataset
    test_has_event = 'event' in df_test.columns

    for test_model in ['DeepSeek-R1 14B']:
        test_ai_details = gen_ai_config.model[test_model]   #   Get model info
        for row in df_test.itertuples():
            test_event = row.event if test_has_event else None
            print(get_label(test_details, test_ai_details, row.text, test_event)[1])



In [None]:
# Write df_dataset (whichever cell assigned it last) to export_location as CSV.
# NOTE(review): per the pandas docs, index_label=False only omits the header field for the
# index column; the index values are still written -- confirm index=False was not intended.
df_dataset.to_csv(export_location, index_label=False)