In [47]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import login
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline







In [48]:
# Load the tab-separated enISEAR corpus and collect the distinct values of its
# 'Prior_Emotion' column, in order of first appearance.
enisear = pd.read_csv('../../enISEAR.tsv', sep='\t')
emotions = list(pd.unique(enisear['Prior_Emotion']))


In [49]:
from huggingface_hub import login

# Take the Hugging Face token from the HF_TOKEN environment variable so the
# secret is never stored in the notebook. An unset or empty variable becomes
# None, which makes `login` ask for the token interactively instead of trying
# to authenticate with an empty string.
API_TOKEN = os.environ.get("HF_TOKEN") or None

login(token=API_TOKEN)

In [50]:
def create_prompt(text, examples=None, random_state=None):

    """Create the 1-shot labelling prompt used for all LLMs.

    Args:
        text: text to label.
        examples (DataFrame, optional): frame the single in-context example is
            drawn from. It is read through its 'text' and 'sentiment' columns.
            Defaults to the notebook-level ``enisear`` frame.
        random_state (optional): forwarded to ``DataFrame.sample`` so the
            example choice can be made reproducible. The default ``None``
            keeps the draw unseeded.

    Returns:
        list of three chat messages (dicts with "role" and "content"): the
        system instruction, one randomly selected labelled example, and the
        labelling request for ``text``.
    """

    if examples is None:
        # Resolved at call time; relies on the cell that loads `enisear`.
        examples = enisear

    label_prompt = f""" Given a piece of text, you have to label to which of the following emotions
    it corresponds. The options are: Anger, Fear, Guilt, Shame, Joy, Sadness, Disgust. Do not choose any other emotion. 
    Please return only one of the previous options as a single word. Do not provide an explanation. """

    task = f"""What is the label of this text: {text} """

    # NOTE(review): elsewhere in this notebook `enisear` is only read through
    # its 'Prior_Emotion' column - confirm the frame really has 'text' and
    # 'sentiment' columns, otherwise the lookups below raise KeyError.
    example_row = examples.sample(1, random_state=random_state).iloc[0]
    example = f"text: {example_row['text']}, emotion: {example_row['sentiment']}"

    messages = [
        {"role": "system", "content": label_prompt},
        {"role": "user", "content": example},
        {"role": "user", "content": task}
    ]

    return messages

In [51]:
# OpenAI key consumed by run_gpt. Read from the OPENAI_API_KEY environment
# variable so the secret is never stored in the notebook; falls back to the
# empty string when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

In [54]:


# Files to label in this run. Each entry names the CSV that is read
# ("data_to_label") and the CSV the labelled frame is written to
# ("saving_file_path"). Add further entries to label more files in one run.
#
# A longer list used to be assigned to `configs` immediately before this one
# and was overwritten without ever being read; only the effective value is kept.
configs = [
    {
        "data_to_label": "../full_emotions/server_data/qwen3_32B_full_emotions.csv",
        "saving_file_path": "../full_emotions/qwen3_32B_full_emotions.csv",
    }
]







In [52]:

def run_gpt(df, model, n_runs=3):

    """Run the GPT API on every row of a dataframe.

    Each row's 'Sentence' value is turned into a prompt with ``create_prompt``
    and sent to the model ``n_runs`` times. A fresh prompt is built for every
    request, so each run may carry a different in-context example.

    Args:
        df (dataframe): rows to label; must contain a 'Sentence' column.
        model (str): model name passed to the OpenAI Responses API.
        n_runs (int, optional): number of independent predictions per row.
            Defaults to 3.

    Returns:
        dataframe: the original columns plus 'pred_1' ... 'pred_<n_runs>'
        holding the raw text answer of every run.
    """
    # Imported here because the notebook's import cell never brings `OpenAI`
    # into scope, so on a fresh kernel the name would otherwise be undefined.
    from openai import OpenAI

    # `api_key` is the notebook-level variable set in its own cell.
    client = OpenAI(api_key=api_key)

    output_list = []

    for _, row in tqdm(df.iterrows(), desc="Processing labeling", total=df.shape[0]):
        preds = [
            client.responses.create(
                model=model,
                input=create_prompt(row['Sentence'])
            ).output_text
            for _ in range(n_runs)
        ]

        row_dict = row.to_dict()
        row_dict.update(
            {f'pred_{run}': pred for run, pred in enumerate(preds, start=1)}
        )
        output_list.append(row_dict)

    # Build the frame once from the collected records.
    return pd.DataFrame(output_list)
   



In [53]:

def majority_calculation(df, model, label_cols=None):
    """From all runs, get the majority answer; choose one at random if tied.

    The result is written to a new column named '<model>_label'. ``df`` is
    modified in place and also returned.

    Args:
        df (dataframe): dataframe containing one column per prediction run.
        model (str): name of the model for which we get the majority vote;
            used as the prefix of the new column.
        label_cols (list, optional): columns holding the votes to count.
            Defaults to the notebook-level ``emotions`` list, looked up when
            the function is called.

    Returns:
        dataframe: ``df`` with the '<model>_label' column added. A row whose
        vote columns are all missing gets ``None``.
    """
    if label_cols is None:
        # Resolved at call time so that defining this function does not
        # require the `emotions` global to exist yet.
        label_cols = emotions

    def find_max(row):
        # value_counts() ignores missing values; count once and reuse.
        counts = row[label_cols].value_counts()
        if counts.empty:
            return None  # no usable vote in this row
        max_labels = counts[counts == counts.max()].index.tolist()
        return random.choice(max_labels) if len(max_labels) > 1 else max_labels[0]  # Choose randomly if tie

    df[f'{model}_label'] = df.apply(find_max, axis=1)  # Apply to each row

    return df



In [56]:

# Label every configured file with gpt-4o-mini and write the result to disk.
for config in configs:
    data_to_label, saving_file_path = config["data_to_label"], config["saving_file_path"]

    # The first CSV column is used as the index; any column whose name starts
    # with 'Unnamed' is dropped before labelling.
    synth = pd.read_csv(data_to_label, index_col=0).loc[
        :, lambda frame: ~frame.columns.str.startswith('Unnamed')
    ]

    # Kept as two assignments so the raw API answers stay bound to `gpt`
    # even if the majority vote fails afterwards.
    gpt = run_gpt(synth, 'gpt-4o-mini')
    gpt = majority_calculation(
        gpt,
        'gpt_4o_mini',
        ['pred_1', 'pred_2', 'pred_3'],
    )

    gpt.to_csv(saving_file_path)


Processing labeling: 100%|██████████| 1001/1001 [33:44<00:00,  2.02s/it]


In [42]:


# Per-dataset file locations: "data_to_label" is the input CSV and
# "saving_file_path" is the labelled CSV.
# NOTE(review): the key 'gpt40_mini' is spelled with a digit zero while its
# file names use 'gpt4o_mini'. The key is reused as the key of `datasets`,
# so confirm nothing depends on it before renaming.
data_paths = {
    'gpt4o': {
        "data_to_label": "../synth_data_openai/gpt4o_full_emotions.csv",
        "saving_file_path": "../full_emotions/gpt4o_full_emotions.csv",
    },
    'gpt40_mini': {
        "data_to_label": "../synth_data_openai/gpt4o_mini_full_emotions.csv",
        "saving_file_path": "../full_emotions/gpt4o_mini_full_emotions.csv",
    },
    'llama31': {
        "data_to_label": "llama31_full_emotions.csv",
        "saving_file_path": "../full_emotions/llama31_full_emotions.csv",
    },
    'llama33': {
        "data_to_label": "llama33_full_emotions.csv",
        "saving_file_path": "../full_emotions/llama33_full_emotions.csv",
    },
    'qwen30B_A3B': {
        "data_to_label": "qwen3_30BA3B_full_emotions.csv",
        "saving_file_path": "../full_emotions/qwen3_30BA3B_full_emotions.csv",
    },
    'qwen32B': {
        "data_to_label": "synth_data_qwen/qwen3_32B_all_emotions.csv",
        "saving_file_path": "../full_emotions/qwen3_32B_full_emotions.csv",
    },
    'enisear': {
        "data_to_label": "enisear_preprocessed.csv",
        "saving_file_path": "../full_emotions/enisear_preprocessed.csv",
    },
}


In [43]:
# Load every labelled CSV into memory, keyed by dataset name.
datasets = {}

# The loop variable is called `paths` rather than `dict`: a module-level name
# `dict` would shadow the built-in for the rest of the kernel session.
for data, paths in data_paths.items():

    # The first CSV column is used as the index.
    datasets[data] = pd.read_csv(paths['saving_file_path'], index_col=0)

    

In [None]:
# Print the distribution of the stored majority label for each loaded dataset.
# NOTE(review): this reads 'gpt_4o_label', while the labelling loop in this
# notebook writes 'gpt_4o_mini_label' - confirm the column exists in every file.
for data in datasets:
    df = datasets[data]
    value_cts = df['gpt_4o_label'].value_counts()
    print(f'dataset: {data}, \n value_cts {value_cts}')