# Config

In [1]:
# CSV that the main loop reads and that the export cell writes.
# Both locations name the same file, so an export overwrites the input in place.
import_location = (
    'datasets/goemotions/ekman_6_single_label/poc_subset/ai_labeled/'
    'goemotions_6_single_poc_ai_labeled.csv'
)
export_location = (
    'datasets/goemotions/ekman_6_single_label/poc_subset/ai_labeled/'
    'goemotions_6_single_poc_ai_labeled.csv'
)

# Keys looked up in dataset_config.
datasets = ['GoEmotions POC']

# Keys looked up in gen_ai_config.model.
# Other keys used so far: 'Llama3.1 8B instruct-q8', 'DeepSeek-R1 14B', 'GPT 4o-mini', 'o3-mini'
ai_models = [
    'GPT 4o-mini',
    'o1-mini',
]

# When False, a model whose label column already exists in the CSV is skipped.
overwrite_previous_labels = False

# NOTE(review): sample_size and random_seed are not referenced in the cells visible here.
sample_size = 50
random_seed = 418   #   I'm a teacup

from config_files.dataset_config import dataset as dataset_config
from config_files import gen_ai_config

In [2]:
import pandas as pd
import os
import ollama
from ollama import ResponseError
from openai import OpenAI

# Label Generation Functions

### Build Label Prompt

In [3]:
def build_label_prompt(dataset_id, text, ai_model, event = None):
    """Fill in the labeling prompt template for one record.

    The template is ``prompts.prompt[dataset_id]['labels']``; its ``<text>``,
    ``<response instructions>`` and ``<context>`` placeholders are substituted
    in that order.

    Args:
        dataset_id: Key into ``prompts.prompt`` selecting the dataset's templates.
        text: The record text substituted for ``<text>``.
        ai_model: Model id; ids starting with ``'deepseek'`` get the
            ``'Deepseek'`` response instructions, every other id gets ``'GPT'``.
        event: Optional key into the dataset's ``'context'`` templates. When
            falsy, ``<context>`` is replaced with an empty string.

    Returns:
        The prompt string with all three placeholders substituted.
    """
    from config_files import prompts

    templates = prompts.prompt[dataset_id]
    filled = templates['labels'].replace('<text>', text)

    # DeepSeek ids get their own response instructions; everything else uses the GPT ones.
    instructions_key = 'Deepseek' if ai_model.startswith('deepseek') else 'GPT'
    filled = filled.replace('<response instructions>', templates[instructions_key])

    # Context is only looked up when an event was supplied (e.g. EmoEvent).
    context_text = templates['context'][event] if event else ""
    return filled.replace('<context>', context_text)

### Generate AI Label

In [4]:
def generate_synthetic_label(dataset_details, genAI_details, label_prompt):
    """Send the labeling prompt to the configured platform and return the raw reply.

    Args:
        dataset_details: Dataset config; ``"num_labelers"`` is read for OpenAI only.
        genAI_details: Model config providing ``"platform"`` and ``"id"``.
        label_prompt: The full prompt sent as a single user message.

    Returns:
        * ``"Ollama"``: one string, the content of a single chat reply.
        * ``"OpenAI"``: a list of strings, one per returned choice
          (``n=num_labelers`` choices are requested in one call).
        * any other platform: ``None``.
    """
    platform = genAI_details["platform"]
    chat_messages = [{"role": "user", "content": label_prompt}]

    if platform == "Ollama":
        reply = ollama.chat(model=genAI_details["id"], messages=chat_messages)
        return reply["message"]["content"]

    if platform == "OpenAI":
        # The API key is read from the OPENAI_API_KEY environment variable.
        openai_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
        completion = openai_client.chat.completions.create(
            model=genAI_details["id"],
            messages=chat_messages,
            n=dataset_details["num_labelers"],
        )
        return [choice.message.content for choice in completion.choices]

    # Unknown platform: nothing is requested.
    return None

### Parse Label Response

In [5]:
def parse_label_response(response, dataset_details, cot_terminator = None):
    """Extract a label name from a model's free-text response.

    Matching is case-insensitive. A label wrapped in markdown bold
    (``**label**``) takes priority over a label that merely appears somewhere
    in the text, so a bolded final answer wins over labels mentioned in
    passing. Within each pass, labels are tried in ``label_list`` order.

    Args:
        response: Raw response text. ``None`` is treated as an empty response.
        dataset_details: Dataset config; only ``"label_list"`` is read.
        cot_terminator: Optional marker that ends a Chain-of-Thought section.
            When present in ``response``, only the text after its first
            occurrence is searched. When absent from ``response``, the whole
            response is searched.

    Returns:
        The matching entry of ``label_list``, or ``None`` when no label is
        found (a red "NO LABEL FOUND" line is printed in that case).
    """
    if response is None:
        response = ""

    if cot_terminator:      #   Remove the reasoning layer text for Chain-of-Thought models
        # partition() reports whether the terminator was found; slicing on a
        # failed find() (-1) would keep only the last character of the response.
        _, terminator_found, answer_text = response.partition(cot_terminator)
        if terminator_found:
            response = answer_text

    searchable = response.lower()
    candidates = dataset_details["label_list"]

    # Pass 1: an explicitly bolded label is the strongest signal.
    for label in candidates:
        if f'**{label.lower()}**' in searchable:
            return label

    # Pass 2: fall back to any mention of a label name.
    for label in candidates:
        if label.lower() in searchable:
            return label

    # # Label name not found, look for ID number
    # for i in range(1, len(dataset_details["label_list"]) + 1):
    #     if str(i) in response:
    #         return dataset_details["label_list"][i-1]

    # Label not found
    CRED = '\33[91m'
    CEND = '\33[0m'
    print(f"{CRED}NO LABEL FOUND:{CEND} {response}")
    return None

### Label Record

In [6]:
def get_label(dataset_details, gen_ai_details, text, event=None):
    """Label one record with several synthetic labelers and derive a consensus.

    Args:
        dataset_details: Dataset config; reads ``'id'``, ``'num_labelers'``,
            ``'label_format'``, ``'label_list'`` and ``'num_consensus'``.
        gen_ai_details: Model config; reads ``'id'`` and ``'platform'``, plus
            ``'terminator'`` for the Ollama platform.
        text: Record text to label.
        event: Optional context key forwarded to ``build_label_prompt``.

    Returns:
        ``(consensus_label, labels, responses)``: the consensus label or
        ``None``, the parsed label per labeler, and the raw response per
        labeler. For a platform other than Ollama/OpenAI both lists are empty.
    """
    label_prompt = build_label_prompt(dataset_details['id'], text, gen_ai_details['id'], event)

    labels, responses = [], []
    print("Labels: ", end="")

    platform = gen_ai_details['platform']
    if platform == "Ollama":
        # Ollama is queried once per labeler.
        for attempt in range(dataset_details['num_labelers']):
            raw_reply = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
            responses.append(raw_reply)
            labels.append(parse_label_response(raw_reply, dataset_details, gen_ai_details['terminator']))

            separator = ", " if attempt > 0 else ""
            print(f"{separator}{labels[attempt]}", end="")
        print()

    elif platform == "OpenAI":
        # A single OpenAI call returns every labeler's reply at once.
        responses = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
        for position, raw_reply in enumerate(responses):
            labels.append(parse_label_response(raw_reply, dataset_details))

            separator = ", " if position > 0 else ""
            print(f"{separator}{labels[position]}", end="")

    consensus_label = None
    label_format = dataset_details["label_format"]

    if label_format == "single":
        # Every label reaching the threshold qualifies; the one latest in
        # label_list order is kept.
        qualifying = [
            candidate
            for candidate in dataset_details["label_list"]
            if labels.count(candidate) >= dataset_details["num_consensus"]
        ]
        if qualifying:
            consensus_label = qualifying[-1]
        print(f"\n\tConsensus: {consensus_label}\n")

    # label_format "multi" is not implemented: consensus stays None.

    return consensus_label, labels, responses

# Main Loop

In [7]:
# Label every record of each dataset with each configured model, storing the
# consensus label in a column named after the model id and writing the raw
# responses for each record to a text file.
from time import sleep

# Raw model responses are stored under <responses_root>/<ai_model>/.
responses_root = 'datasets/goemotions/ekman_6_single_label/poc_subset/ai_labeled/ai_responses'
separator = '-' * 120

for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_dataset = pd.read_csv(import_location)               #   Load dataset

    # Context is only passed to get_label when the dataset has an 'event' column.
    has_event = 'event' in df_dataset.columns

    for ai_model in ai_models:
        ai_details = gen_ai_config.model[ai_model]          #   Get model info
        label_column = ai_details['id']

        if label_column in df_dataset.columns:
            if not overwrite_previous_labels:
                continue                                    #   Keep the labels from an earlier run
            df_dataset[label_column] = ''                   #   Clear old data
        else:
            df_dataset[label_column] = ''                   #   Create a new column for the AI generated labels

        model_responses_dir = f'{responses_root}/{ai_model}'
        os.makedirs(model_responses_dir, exist_ok=True)

        for df_index, row in df_dataset.iterrows():
            print(df_index)
            event = row.event if has_event else None
            # Built outside the f-string quoting used by open() so the cell
            # also parses on Python versions before 3.12.
            response_path = f"{model_responses_dir}/{dataset_details['id']}_{df_index}.txt"

            with open(response_path, 'w', encoding="utf-8") as file:
                # Retry until the record is labeled.
                # NOTE(review): the retry is unbounded, so a permanent
                # ResponseError would loop forever.
                while True:
                    try:
                        label, labels, responses = get_label(dataset_details, ai_details, row.text, event)
                    except ResponseError as error:  #   Recover from a CUDA illegal memory access error
                        print(f"ResponseError, retrying in 5 s: {error}")
                        sleep(5)
                    else:
                        break

                df_dataset.at[df_index, label_column] = label
                # row.text is the record being labeled; df_index is an index
                # label and must not be used as a position with .iloc.
                file.write(f"Record: \"{row.text}\"\n{ai_model}\n{separator}\n\n")
                for response_index, line in enumerate(responses):
                    file.write(f"Response {response_index+1}:\n{line}\n{separator}\n\n")

        display(df_dataset)


Labels: joy, joy, anger
	Consensus: joy

Labels: joy, joy, joy
	Consensus: joy

Labels: fear, neutral, neutral
	Consensus: neutral

Labels: fear, joy, fear
	Consensus: fear

Labels: joy, joy, joy
	Consensus: joy

Labels: anger, sadness, anger
	Consensus: anger

Labels: joy, joy, joy
	Consensus: joy

Labels: disgust, disgust, disgust
	Consensus: disgust

Labels: neutral, surprise, neutral
	Consensus: neutral

Labels: anger, sadness, sadness
	Consensus: sadness

Labels: anger, disgust, disgust
	Consensus: disgust

Labels: neutral, anger, anger
	Consensus: anger

Labels: joy, anger, joy
	Consensus: joy

Labels: fear, surprise, anger
	Consensus: None

Labels: anger, anger, anger
	Consensus: anger

Labels: disgust, disgust, anger
	Consensus: disgust

Labels: anger, disgust, anger
	Consensus: anger

Labels: anger, anger, anger
	Consensus: anger

Labels: joy, joy, joy
	Consensus: joy

Labels: joy, joy, joy
	Consensus: joy

Labels: disgust, anger, anger
	Consensus: anger

Labels: joy, joy, joy

Unnamed: 0.1,Unnamed: 0,labels,text,llama3.1:8b-instruct-q8_0,deepseek-r1:14b,gpt-4o-mini,o3-mini,o1-mini
0,6232,love,I love it. Probably my favorite game mode righ...,joy,joy,joy,,joy
1,30473,joy,Those are some interesting stats. I’m gonna sa...,neutral,joy,anger,,joy
2,43446,neutral,Justified or not he'll probably go before [NAME],,neutral,anger,,neutral
3,23954,neutral,If I'm no good I can still give the 1st round ...,,neutral,anger,,fear
4,24657,joy,At least it made for hilarious responses by pe...,joy,joy,joy,,joy
5,25589,neutral,"""qt 3.14"" made me retreat into my own face, so...",disgust,disgust,disgust,,anger
6,27710,joy,This one is by [NAME]! Your suggestion was fun...,joy,joy,joy,,joy
7,46841,neutral,"Assuming you're a dude, yes, just let him watc...",neutral,neutral,anger,,disgust
8,11235,neutral,Grew up in an early 90's house with door handl...,neutral,neutral,anger,,neutral
9,1634,neutral,"I see your ""making myself undateable"" and rais...",joy,sadness,sadness,,sadness


# Testing

# Scratch check: print the per-labeler labels DeepSeek produces for every record
# of each dataset. Uses its own frame (df_test) so that the df_dataset built by
# the main loop, which the export cell writes to disk, is not replaced.
for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_test = pd.read_csv(dataset_details['relpath'])       #   Load dataset

    # Only datasets with an 'event' column have context to pass along.
    has_event = 'event' in df_test.columns

    for ai_model in ['DeepSeek-R1 14B']:
        ai_details = gen_ai_config.model[ai_model]          #   Get model info
        for row in df_test.itertuples():
            event = row.event if has_event else None
            print(get_label(dataset_details, ai_details, row.text, event)[1])
        #if 'event' in df_dataset.columns:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row.text, row.event)[1], axis=1)
        #else:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row['text'])[1])
#



In [10]:
# Write the labeled frame to export_location. In the config cell this is the
# same path as import_location, so the input CSV is overwritten in place.
# NOTE(review): index_label=False still writes the index values but, per the
# pandas docs, leaves their header field out — confirm this round-trips as
# intended with the plain pd.read_csv(import_location) used on load.
df_dataset.to_csv(export_location, index_label=False)

# Last expression of the cell: display the exported frame.
df_dataset

Unnamed: 0.1,Unnamed: 0,labels,text,llama3.1:8b-instruct-q8_0,deepseek-r1:14b,gpt-4o-mini,o3-mini,o1-mini
0,6232,love,I love it. Probably my favorite game mode righ...,joy,joy,joy,,joy
1,30473,joy,Those are some interesting stats. I’m gonna sa...,neutral,joy,anger,,joy
2,43446,neutral,Justified or not he'll probably go before [NAME],,neutral,anger,,neutral
3,23954,neutral,If I'm no good I can still give the 1st round ...,,neutral,anger,,fear
4,24657,joy,At least it made for hilarious responses by pe...,joy,joy,joy,,joy
5,25589,neutral,"""qt 3.14"" made me retreat into my own face, so...",disgust,disgust,disgust,,anger
6,27710,joy,This one is by [NAME]! Your suggestion was fun...,joy,joy,joy,,joy
7,46841,neutral,"Assuming you're a dude, yes, just let him watc...",neutral,neutral,anger,,disgust
8,11235,neutral,Grew up in an early 90's house with door handl...,neutral,neutral,anger,,neutral
9,1634,neutral,"I see your ""making myself undateable"" and rais...",joy,sadness,sadness,,sadness
