# Config

In [8]:
# --- Input / output locations ------------------------------------------------
# CSV loaded by the main loop; if it does not exist an empty frame is used.
import_location = "datasets/emoevent/raw/full_subset/ai_labeled/emoevent_ai_labeled_deterministic.csv"
# Directory receiving the labeled CSV and the per-record 'ai_responses' files.
export_location = "datasets/emoevent/raw/full_subset/ai_labeled/"
# File name of the labeled CSV inside export_location.
export_name = "emoevent_ai_labeled_deterministic.csv"

# --- What to label -----------------------------------------------------------
# Keys into dataset_config.
datasets = ["EmoEvent"]
# Keys into gen_ai_config.model.
# Known options: 'Llama3.1 8B instruct-q8', 'DeepSeek-R1 14B', 'GPT 4o-mini', 'o3-mini'
ai_models = [
    "Llama3.1 8B instruct-q8",
    "DeepSeek-R1 14B",
    "GPT 4o-mini",
]

# --- Behaviour switches ------------------------------------------------------
# When True, an existing label column for a model is blanked before labeling.
overwrite_previous_labels = False
# When True, models are queried with greedy decoding and asked for the label only.
deterministic = True

# NOTE(review): not referenced by any code visible in this notebook.
random_seed = 418   #   I'm a teacup

from config_files.dataset_config import dataset as dataset_config
from config_files import gen_ai_config

In [9]:
import pandas as pd
import os
import ollama
from ollama import ResponseError
from openai import OpenAI

# Label Generation Functions

### Build Label Prompt

In [10]:
def build_label_prompt(dataset_id, text, ai_model, event = None):
    """Fill the dataset's labeling prompt template for a single record.

    Args:
        dataset_id: Key into ``prompts.prompt`` selecting the template set.
        text: Record text substituted for the ``<text>`` placeholder.
        ai_model: Model identifier; identifiers starting with ``'deepseek'``
            (case-sensitive) always get the label-only instructions.
        event: Optional key into the template set's ``'context'`` mapping.
            When falsy, the ``<context>`` placeholder is replaced by an
            empty string.

    Returns:
        The prompt string with ``<text>``, ``<response instructions>`` and
        ``<context>`` substituted, in that order.
    """
    from config_files import prompts

    templates = prompts.prompt[dataset_id]
    filled = templates['labels'].replace('<text>', text)

    # Only non-deepseek models in non-deterministic mode are asked to write
    # out their considerations before giving the label.
    label_only = ai_model.startswith('deepseek') or deterministic
    instructions_key = 'only label' if label_only else 'with considerations'
    filled = filled.replace('<response instructions>', templates[instructions_key])

    # Event-specific context (used by EmoEvent); blank when no event is given.
    context_text = templates['context'][event] if event else ""
    return filled.replace('<context>', context_text)

### Generate AI Label

In [11]:
def generate_synthetic_label(dataset_details, genAI_details, label_prompt, num_labelers = 1):
    """Send the labeling prompt to the configured platform and return its reply.

    Args:
        dataset_details: Dataset configuration (not used by this function).
        genAI_details: Model configuration; ``"platform"`` selects the backend
            (``"Ollama"`` or ``"OpenAI"``) and ``"id"`` names the model.
        label_prompt: Full prompt text, sent as a single user message.
        num_labelers: Number of completions requested from OpenAI when the
            global ``deterministic`` flag is off. Ignored for Ollama.

    Returns:
        Ollama: the reply text as a single string.
        OpenAI: a list with one reply string per returned choice.
        Any other platform: ``None``.
    """
    platform = genAI_details["platform"]

    if platform == "Ollama":
        chat_args = {
            "model": genAI_details["id"],
            "messages": [{
                "role": "user",
                "content": label_prompt
            }],
        }
        if deterministic:
            chat_args["options"] = {"top_k":1, "temperature":0.0}   # Greedy
        reply = ollama.chat(**chat_args)
        return reply["message"]["content"]

    if platform == "OpenAI":
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

        # Greedy mode asks for exactly one completion at temperature 0;
        # otherwise request one completion per labeler with default sampling.
        if deterministic:
            sampling = {"n": 1, "temperature": 0.0}
        else:
            sampling = {"n": num_labelers}

        completion = client.chat.completions.create(
            model=genAI_details["id"],
            messages=[{
                "role": "user",
                "content": label_prompt,
            }],
            **sampling
        )
        return [choice.message.content for choice in completion.choices]

    # Unknown platform: nothing to query.
    return None

### Parse Label Response

In [12]:
def parse_label_response(response, dataset_details, cot_terminator = None):
    """Extract the label a model wrote in **double asterisks**.

    Args:
        response: Model reply as a string, or a list of replies (only the
            first element is inspected).
        dataset_details: Dataset configuration; ``"label_list"`` holds the
            valid labels.
        cot_terminator: Optional marker that ends a Chain-of-Thought model's
            reasoning text. When it occurs in the reply, only the text after
            its first occurrence is searched, so labels mentioned while
            reasoning are ignored. When it does not occur, the whole reply
            is searched.

    Returns:
        The first entry of ``label_list`` (in list order) that appears in the
        reply as ``**label**`` (case-insensitive), or ``None`` if none does.
    """
    if isinstance(response, list):
        response = response[0]

    if cot_terminator:      #   Remove the reasoning layer text for Chain-of-Thought models
        _reasoning, found, answer = response.partition(cot_terminator)
        # Only strip when the terminator is really there; a missing terminator
        # must not discard the reply (str.find() would return -1 and slicing
        # with it keeps just the last character).
        if found:
            response = answer

    lowered = response.lower()
    for label in dataset_details["label_list"]:
        if f'**{label.lower()}**' in lowered:
            return label

    # Label not found: report the searched text in red and let the caller decide
    CRED = '\33[91m'
    CEND = '\33[0m'
    print(f"{CRED}NO LABEL FOUND:{CEND} {response}")

    return None

### Label Record

In [13]:
def get_label(dataset_details, gen_ai_details, text, event=None):
    """Label one record with a generative model and derive a consensus label.

    Args:
        dataset_details: Dataset configuration. Reads ``'id'``,
            ``'label_list'``, ``'min_labelers'``, ``'max_labelers'`` and
            ``'num_consensus'``.
        gen_ai_details: Model configuration. Reads ``'id'``, ``'platform'``
            and ``'terminator'``.
        text: Text of the record to label.
        event: Optional context key forwarded to ``build_label_prompt``.

    Returns:
        A tuple ``(consensus_label, labels, responses)``.
        In deterministic mode a single label is produced and returned as
        ``(label, [label], responses)``.
        Otherwise ``labels`` holds every parsed label, ``responses`` the raw
        model replies, and ``consensus_label`` is a label that occurs at
        least ``num_consensus`` times, or ``None`` if there is none.
    """
    label_prompt = build_label_prompt(dataset_details['id'], text, gen_ai_details['id'], event)

    labels = []
    responses = []
    print("Labels: ", end="")

    if gen_ai_details["platform"] == "Ollama":
        if deterministic:
            # NOTE(review): with greedy decoding an unparseable reply will
            # probably be reproduced on every retry, so this loop may not
            # terminate - confirm whether a retry cap is wanted.
            label = None
            while label is None:
                response = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
                label = parse_label_response(response, dataset_details, gen_ai_details['terminator'])

                print(label)
            return label, [label], [response]

        for _ in range(dataset_details['min_labelers']):
            label = None
            while label is None:   # Re-prompt if a label is not found (label probably wasn't generated in **double asterisks**)
                response = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
                label = parse_label_response(response, dataset_details, gen_ai_details['terminator'])

            print(f", {label}" if labels else f"{label}", end="")
            responses.append(response)
            labels.append(label)

    elif gen_ai_details["platform"] == "OpenAI":
        if deterministic:
            label = None
            while label is None:
                responses = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt, dataset_details['min_labelers'])
                label = parse_label_response(responses[0], dataset_details)

                print(label)
            return label, [label], responses

        # One request returns all initial completions at once.
        responses = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt, dataset_details['min_labelers'])
        for response in responses:
            label = parse_label_response(response, dataset_details)
            if label is None:
                # Unparseable completion: keep the raw response but record no
                # label. Nothing is printed, because labels and responses no
                # longer line up by position from here on.
                continue

            print(f", {label}" if labels else f"{label}", end="")
            labels.append(label)

    # Repeat if no consensus found: keep asking for one more label while all
    # labels so far are distinct and the labeler budget is not exhausted.
    while len(labels) == len(set(labels)) and len(labels) < dataset_details['max_labelers']:
        label = None

        while label is None:   # Re-prompt if a label is not found (label probably wasn't generated in **double asterisks**)
            response = generate_synthetic_label(dataset_details, gen_ai_details, label_prompt)
            if isinstance(response, list):
                response = response[0]
            label = parse_label_response(response, dataset_details, gen_ai_details['terminator'])

        print(f", {label}" if labels else f"{label}", end="")
        responses.append(response)
        labels.append(label)


    # If several labels reach the threshold, the one latest in label_list wins.
    consensus_label = None
    for potential_label in dataset_details["label_list"]:
        if labels.count(potential_label) >= dataset_details["num_consensus"]:
            consensus_label = potential_label
    print(f"\n\tConsensus: {consensus_label}\n")


    return consensus_label, labels, responses

# Main Loop

In [None]:
# Label every record of every dataset with every configured model, store the
# labels in one column per model and archive the raw model responses.
from time import sleep

for dataset in datasets:
    dataset_details = dataset_config[dataset]        #   Get dataset info

    try:
        df_dataset = pd.read_csv(import_location)    #  Load dataset
    except FileNotFoundError:
        df_dataset = pd.DataFrame()

    for ai_model in ai_models:
        ai_details = gen_ai_config.model[ai_model]   #   Get model info
        label_column = ai_details['id']

        # Create the model's label column, or blank an existing one when
        # previous labels are to be overwritten.
        if label_column not in df_dataset.columns:
            df_dataset.insert(len(df_dataset.columns), label_column, '')
        elif overwrite_previous_labels is True:
            df_dataset[label_column] = ''

        response_dir = os.path.join(export_location, 'ai_responses', ai_model)
        os.makedirs(response_dir, exist_ok=True)

        for df_index, row in df_dataset.iterrows():
            # Honour overwrite_previous_labels: a record that already carries
            # a label for this model is left untouched (label and transcript).
            if not overwrite_previous_labels:
                previous_label = df_dataset.at[df_index, label_column]
                if pd.notna(previous_label) and str(previous_label).strip() != '':
                    print(f'{ai_model} -> {df_index} (already labeled, skipped)')
                    continue

            print(f'{ai_model} -> {df_index}')
            event = row.context if 'context' in df_dataset.columns else None

            while True:
                try:
                    label, labels, responses = get_label(dataset_details, ai_details, row.text, event)
                except ResponseError:  #   Recover from a CUDA illegal memory access error
                    sleep(5)
                else:
                    break

            df_dataset.at[df_index, label_column] = label

            # Write the transcript only after labeling succeeded, so an
            # interrupted run never leaves an empty or truncated file behind.
            response_path = os.path.join(response_dir, f"{dataset_details['id']}_{df_index}.txt")
            with open(response_path, 'w', encoding="utf-8") as file:
                file.write(f"Record: \"{row.text}\"\n{ai_model}\n{'-'*120}\n\n")
                for response_index, line in enumerate(responses):
                    file.write(f"Response {response_index+1}:\n{line}\n{'-'*120}\n\n")

        # Show and save the progress after each model has finished.
        display(df_dataset)
        df_dataset.to_csv(os.path.join(export_location, export_name), index_label=False)


Llama3.1 8B instruct-q8 -> 0
Labels: joy
Llama3.1 8B instruct-q8 -> 1
Labels: joy
Llama3.1 8B instruct-q8 -> 2
Labels: sadness
Llama3.1 8B instruct-q8 -> 3
Labels: anger
Llama3.1 8B instruct-q8 -> 4
Labels: joy
Llama3.1 8B instruct-q8 -> 5
Labels: sadness
Llama3.1 8B instruct-q8 -> 6
Labels: joy
Llama3.1 8B instruct-q8 -> 7
Labels: joy
Llama3.1 8B instruct-q8 -> 8
Labels: sadness
Llama3.1 8B instruct-q8 -> 9
Labels: joy
Llama3.1 8B instruct-q8 -> 10
Labels: anger
Llama3.1 8B instruct-q8 -> 11
Labels: joy
Llama3.1 8B instruct-q8 -> 12
Labels: disgust
Llama3.1 8B instruct-q8 -> 13
Labels: sadness
Llama3.1 8B instruct-q8 -> 14
Labels: anger
Llama3.1 8B instruct-q8 -> 15
Labels: sadness
Llama3.1 8B instruct-q8 -> 16
Labels: joy
Llama3.1 8B instruct-q8 -> 17
Labels: surprise
Llama3.1 8B instruct-q8 -> 18
Labels: joy
Llama3.1 8B instruct-q8 -> 19
Labels: joy
Llama3.1 8B instruct-q8 -> 20
Labels: joy
Llama3.1 8B instruct-q8 -> 21
Labels: joy
Llama3.1 8B instruct-q8 -> 22
Labels: fear
Llama3.1

# Testing

# Scratch test: run get_label over every record of each dataset with
# DeepSeek-R1 only and print the list of individual labels per record.
# Nothing is written back into the new column here.
for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_dataset = pd.read_csv(dataset_details['relpath'])    #  Load the dataset from its configured path

    for ai_model in ['DeepSeek-R1 14B']:
        ai_details = gen_ai_config.model[ai_model]                          #   Get model info
        df_dataset.insert(len(df_dataset.columns), ai_details['id'], '')    #   Append an empty column for the AI generated labels (insert raises ValueError if it already exists)
        for row in df_dataset.itertuples():
            # Element [1] of get_label's result is the list of individual labels.
            print(get_label(dataset_details, ai_details, row.text, row.event)[1])
        # Disabled alternative that would store the labels in the new column:
        #if 'event' in df_dataset.columns:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row.text, row.event)[1], axis=1)
        #else:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row['text'])[1])
#



In [None]:
# Save the current frame to the configured export file.
# NOTE(review): index_label=False is expected to omit the index column's header field - confirm against the pandas docs.
df_dataset.to_csv(os.path.join(export_location, export_name), index_label=False)

# Bare expression so the notebook renders the frame as the cell output.
df_dataset