# Config

In [8]:
# CSV read by the main loop and CSV written by the export cell.
# NOTE(review): both point at the same file, so exporting overwrites the input in place.
import_location = 'ai_labeled/EmoEvents_raw_poc.csv'
export_location = 'ai_labeled/EmoEvents_raw_poc.csv'
# When False, the main loop skips any model whose label column is already present in the CSV.
overwrite_previous_labels = False

# Keys looked up in dataset_config and gen_ai_config.model respectively by the main loop.
datasets = ['Proof of Concept Subset']
ai_models = ['Llama3.1 8B instruct-q8', 'DeepSeek-R1 14B', 'GPT 4o-Mini', 'o1-Mini']

# NOTE(review): remove_others, sample_size and random_seed are not referenced anywhere in the
# visible part of this notebook - presumably left over from the subset-sampling step; confirm.
remove_others = True

sample_size = 50
random_seed = 418   #   I'm a teacup

from config_files.dataset_config import dataset as dataset_config
from config_files import gen_ai_config

In [9]:
import os
import re
import time

import ollama
import pandas as pd
from ollama import ResponseError
from openai import OpenAI

# Label Generation Functions

### Build Label Prompt

In [10]:
def build_label_prompt(dataset_id, text, event = None):
    """Build the labelling prompt for a single record.

    Args:
        dataset_id: Key of the dataset in ``prompts.prompt``.
        text: Record text substituted for the ``<text>`` placeholder.
        event: Optional event key. When truthy, the 'labels w/ context'
            template is used and ``<context>`` is replaced with the matching
            entry of the dataset's 'context' mapping.

    Returns:
        The prompt string with its placeholders filled in.

    Raises:
        KeyError: If ``dataset_id`` or ``event`` is missing from the prompt
            configuration.
    """
    from config_files import prompts

    templates = prompts.prompt[dataset_id]

    if event:
        intermediate_prompt = templates['labels w/ context'].replace('<text>', text)
        event_context = templates['context'][event]
        return intermediate_prompt.replace('<context>', event_context)

    # The context-free template must receive the record text as well; otherwise
    # the model is asked for a label without ever seeing the text.
    # Assumes the 'labels' template uses the same '<text>' placeholder as the
    # context variant - if it has none, replace() leaves it unchanged.
    return templates['labels'].replace('<text>', text)

### Generate AI Label

In [11]:
def generate_synthetic_label(dataset_details, genAI_details, label_prompt):
    """Send the label prompt to the configured generative-AI platform.

    Args:
        dataset_details: Dataset configuration; ``"num_labelers"`` is read for
            the OpenAI platform to request that many completions in one call.
        genAI_details: Model configuration with ``"platform"`` ("Ollama" or
            "OpenAI") and ``"id"`` (the model identifier passed to the API).
        label_prompt: Prompt text sent as a single user message.

    Returns:
        For "Ollama": the response text of one chat call (a single string).
        For "OpenAI": a list with the text of every returned choice.

    Raises:
        ValueError: If the platform is neither "Ollama" nor "OpenAI".
    """
    platform = genAI_details["platform"]
    messages = [{
        "role": "user",
        "content": label_prompt,
    }]

    if platform == "Ollama":
        response = ollama.chat(model=genAI_details["id"], messages=messages)
        return response["message"]["content"]

    if platform == "OpenAI":
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

        response = client.chat.completions.create(
            model=genAI_details["id"],
            messages=messages,
            n=dataset_details["num_labelers"],
        )
        return [choice.message.content for choice in response.choices]

    # Fail loudly instead of implicitly returning None for a misconfigured model.
    raise ValueError(f"Unsupported generative AI platform: {platform!r}")

### Parse Label Response

In [12]:
def parse_label_response(response, dataset_details, cot_terminator = None):
    """Extract a dataset label from a model response.

    Args:
        response: Raw response text from the model (``None`` is treated as an
            empty response).
        dataset_details: Dataset configuration; ``"label_list"`` holds the
            valid label names in ID order (ID 1 is the first entry).
        cot_terminator: Optional marker that ends the reasoning section of a
            Chain-of-Thought model. When given and present, only the text after
            its first occurrence is searched.

    Returns:
        The first label of ``label_list`` whose name occurs in the response
        (case-insensitive substring match); failing that, the label whose
        1-based ID appears as a whole number in the response; otherwise
        ``None`` after printing the unparsed response.
    """
    if response is None:
        response = ""

    if cot_terminator:      #   Remove the reasoning layer text for Chain-of-Thought models
        _reasoning, marker, answer = response.partition(cot_terminator)
        # Only cut when the marker is really there: slicing from find()'s -1
        # would reduce a response without the marker to its last character.
        if marker:
            response = answer

    label_list = dataset_details["label_list"]

    lowered_response = response.lower()
    for label in label_list:
        if label.lower() in lowered_response:
            return label

    # Label name not found, look for ID number.
    # Compare whole numbers so that ID 1 is not "found" inside 10 or 2019.
    numbers_in_response = {int(token) for token in re.findall(r"\d+", response)}
    for label_id, label in enumerate(label_list, start=1):
        if label_id in numbers_in_response:
            return label

    # Label not found
    CRED = '\33[91m'
    CEND = '\33[0m'
    print(f"{CRED}NO LABEL FOUND:{CEND} {response}")
    return None

### Label Record

In [13]:
def get_label(dataset_details, gen_ai_details, text, event=None):
    """Label one record with several synthetic labellers and derive a consensus.

    Args:
        dataset_details: Dataset configuration; reads 'id', 'num_labelers',
            'label_format', 'label_list' and 'num_consensus'.
        gen_ai_details: Model configuration; reads 'platform' and, for Ollama
            models, 'terminator'.
        text: Text of the record to label.
        event: Optional event key forwarded to the prompt builder.

    Returns:
        A tuple ``(consensus_label, labels, responses)``: the agreed label or
        ``None``, the parsed label of every labeller, and the raw responses.
    """
    prompt = build_label_prompt(dataset_details['id'], text, event)

    labels, responses = [], []
    print("Labels: ", end="")
    platform = gen_ai_details['platform']

    if platform == "Ollama":
        # One chat call per labeller; each result is echoed as soon as it is parsed.
        for labeler_no in range(dataset_details['num_labelers']):
            raw_response = generate_synthetic_label(dataset_details, gen_ai_details, prompt)
            responses.append(raw_response)
            parsed = parse_label_response(raw_response, dataset_details, gen_ai_details['terminator'])
            labels.append(parsed)

            separator = ", " if labeler_no > 0 else ""
            print(f"{separator}{parsed}", end="")

    elif platform == "OpenAI":
        # A single request returns one choice per labeller.
        responses = generate_synthetic_label(dataset_details, gen_ai_details, prompt)
        for choice_no, raw_response in enumerate(responses):
            parsed = parse_label_response(raw_response, dataset_details)
            labels.append(parsed)

            separator = ", " if choice_no > 0 else ""
            print(f"{separator}{parsed}", end="")

    consensus_label = None
    label_format = dataset_details["label_format"]

    if label_format == "single":
        # Single label dataset: a label wins once enough labellers agree on it.
        for candidate in dataset_details["label_list"]:
            if labels.count(candidate) < dataset_details["num_consensus"]:
                continue
            consensus_label = candidate
            print(f"\n\tConsensus: {consensus_label}")

    # label_format == "multi": to be implemented if used with any multilabel datasets.

    return consensus_label, labels, responses

# Main Loop

In [14]:
# Label every record of each configured dataset with each configured model.
# The consensus label goes into a column named after the model id; the raw
# responses of every record are written to ./datasets/subset/<model>/<dataset id>_<row index>.txt
for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_dataset = pd.read_csv(import_location)    #  Load dataset
    has_event_column = 'event' in df_dataset.columns

    for ai_model in ai_models:
        ai_details = gen_ai_config.model[ai_model]          #   Get model info
        label_column = ai_details['id']

        # Keep labels from an earlier run unless the config asks to regenerate them.
        if not overwrite_previous_labels and label_column in df_dataset.columns:
            continue

        # Plain assignment appends a new column or blanks an existing one,
        # unlike DataFrame.insert, which raises ValueError when the column already exists.
        df_dataset[label_column] = ''

        response_dir = f'./datasets/subset/{ai_model}'
        os.makedirs(response_dir, exist_ok=True)

        for index, row in df_dataset.iterrows():
            event = row['event'] if has_event_column else None
            # Double-quoted f-string so the nested ['id'] lookup also parses before Python 3.12.
            response_path = f"{response_dir}/{dataset_details['id']}_{index}.txt"

            with open(response_path, 'w', encoding="utf-8") as file:
                while True:
                    try:
                        label, labels, responses = get_label(dataset_details, ai_details, row['text'], event)
                    except ResponseError as error:  #   Recover from a CUDA illegal memory access error
                        # NOTE(review): retries are unbounded, so a permanent error keeps
                        # looping; the message makes that visible instead of hanging silently.
                        print(f"\nResponseError: {error} - retrying in 5 seconds")
                        time.sleep(5)
                        continue

                    df_dataset.at[index, label_column] = label
                    for line in responses:
                        file.write(f"{line}\n")
                    break

        display(df_dataset)


False
llama3.1:8b-instruct-q8_0
Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'event', 'text', 'labels',
       'deepseek-r1:14b', 'llama3.1:8b-instruct-q8_0', 'gpt-4o-mini'],
      dtype='object')
False
deepseek-r1:14b
Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'event', 'text', 'labels',
       'deepseek-r1:14b', 'llama3.1:8b-instruct-q8_0', 'gpt-4o-mini'],
      dtype='object')
False
gpt-4o-mini
Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'event', 'text', 'labels',
       'deepseek-r1:14b', 'llama3.1:8b-instruct-q8_0', 'gpt-4o-mini'],
      dtype='object')
False
o1-mini
Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'event', 'text', 'labels',
       'deepseek-r1:14b', 'llama3.1:8b-instruct-q8_0', 'gpt-4o-mini'],
      dtype='object')
Labels: joy, joy, joy
	Consensus: joy
Labels: joy, joy, anger
	Consensus: joy
Labels: joy, joy, joy
	Consensus: joy
Labels: joy, joy, joy
	Consensus: joy
Labels: joy, joy, joy
	Consensus: joy
Labels: joy, joy, joy
	Consens

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,event,text,labels,deepseek-r1:14b,llama3.1:8b-instruct-q8_0,gpt-4o-mini,o1-mini
0,0,0,3948,ChampionsLeague,Sometimes you just have to admire the genius o...,joy,joy,joy,joy,joy
1,1,1,5636,WorldBookDay,"If anyone wants to give me a book as a gift, I...",joy,joy,joy,joy,joy
2,2,2,507,WorldBookDay,On #WorldBookDay I am curled up with a book my...,joy,joy,joy,joy,joy
3,3,3,5454,GretaThunberg,Very wise words USER_arrowsmith ...we should a...,joy,joy,joy,joy,joy
4,4,4,2819,ChampionsLeague,FH underway... Hope's to USER #Liverpool to wi...,joy,joy,joy,joy,joy
5,5,5,5252,WorldBookDay,Happy world book day from Globeprep! What are ...,joy,joy,joy,joy,joy
6,6,6,444,GameOfThrones,Damn....Game of Thrones sounds crazy! Can’t fo...,disgust,surprise,joy,surprise,joy
7,7,7,3329,WorldBookDay,"As we celebrate world book day, make sure to i...",joy,joy,joy,joy,joy
8,8,8,4879,ChampionsLeague,More #ChampionsLeague action kicking off at 8p...,joy,joy,joy,joy,joy
9,9,9,4625,GameOfThrones,That was the best bit of tv I have ever watche...,joy,joy,joy,joy,joy


# Testing

# Scratch cell: prints the per-labeller labels (element [1] of get_label's result)
# for every row, using only the DeepSeek-R1 model. Nothing is stored in the new column.
# NOTE(review): this rebinds the notebook-level df_dataset, so running it before the
# export cell would export this frame instead of the main-loop result.
for dataset in datasets:
    dataset_details = dataset_config[dataset]               #   Get dataset info
    df_dataset = pd.read_csv(dataset_details['relpath'])    #  Load dataset

    for ai_model in ['DeepSeek-R1 14B']:
        ai_details = gen_ai_config.model[ai_model]                          #   Get model info
        df_dataset.insert(len(df_dataset.columns), ai_details['id'], '')    #   Append an empty column for the AI generated labels (insert raises ValueError if the column already exists)
        for row in df_dataset.itertuples():
            print(get_label(dataset_details, ai_details, row.text, row.event)[1])
        # Disabled alternative that stores the labels in the column via DataFrame.apply:
        #if 'event' in df_dataset.columns:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row.text, row.event)[1], axis=1)
        #else:
        #    df_dataset[ai_details['id']] = df_dataset.apply(lambda row: get_label(dataset_details, ai_details, row['text'])[1])
#



In [15]:
# Write the labelled frame to export_location. The row index is still written;
# per the pandas docs, index_label=False only omits the header field for it.
# NOTE(review): export_location equals import_location in the config cell, so this overwrites the input file.
df_dataset.to_csv(export_location, index_label=False)