In [1]:
# Dataset to label. See dataset_config.py for dataset options.
dataset_name = "EmoEvent (Raw)"

# NOTE(review): `subset` is not referenced by any cell visible in this notebook.
subset = None

# Set to True to clear prior labels (both 'labels' and 'all labels' are wiped
# before the labeling loop runs).
clear_labels = True

# Model that produces the labels. See LLM_config.py for LLM options.
llm = "ChatGPT 4o-Mini EmoEvent (Tokenized) Fine-Tuned"


In [2]:
from config_files import dataset_config
from config_files import LLM_config

# Resolve the names chosen in the configuration cell to their config entries.
# Later cells read 'id' from both entries, 'platform' from the LLM entry, and
# 'label_list' / 'num_labelers' / 'num_consensus' / 'label_format' from the
# dataset entry.
# NOTE(review): presumably both containers are dicts, so a misspelled name
# fails here with a KeyError -- confirm against the config modules.
dataset_metadata = dataset_config.dataset[dataset_name]
llm_metadata = LLM_config.model[llm]

In [3]:
import pandas as pd

# The file lives in a folder named after the dataset id and is named after the
# LLM id, with every ':' replaced by '_'.
directory = f"./{dataset_metadata['id']}/"
filename = f"{llm_metadata['id'].replace(':', '_')}.parquet"

dataset = pd.read_parquet(directory + filename)

# Bare expression so the notebook renders the loaded frame.
dataset

Unnamed: 0,text,labels,all labels,source index,source label,intended label
0,Disturbing reports from #Venezuela - when desp...,fear,"[fear, fear, fear]",4706,sadness,fear
1,Standing in prayer with our brothers & sisters...,fear,"[fear, fear, fear]",3614,sadness,fear
2,Shaking with terror ! Our God is wrathful & ri...,fear,"[fear, fear, fear]",1145,anger,fear
3,"Wow, that was way too close! If it wasn't for ...",fear,"[fear, fear, fear]",6815,disgust,fear
4,😱📚 Literary Tropes That Haunt Our Digital Drea...,fear,"[fear, fear, fear]",4929,disgust,fear
...,...,...,...,...,...,...
25029,Absolutely disgusted by USER @EEUU &amp; @Vene...,disgust,"[disgust, disgust, disgust]",1597,joy,disgust
25030,Shocking footage of Notre Dame's devastating f...,fear,"[fear, fear, fear]",6784,sadness,fear
25031,Khaleesi slaying the game with her unapologeti...,joy,"[joy, joy, joy]",4345,anger,joy
25032,Broken hearts & shattered lives in #Venezuela ...,sadness,"[sadness, sadness, sadness]",2284,anger,sadness


In [4]:
if clear_labels:
    # Wipe the consensus label and the per-labeler votes so that the labeling
    # loop treats every record as not yet attempted.
    dataset['labels'] = dataset['all labels'] = None

    display(dataset)

Unnamed: 0,text,labels,all labels,source index,source label,intended label
0,Disturbing reports from #Venezuela - when desp...,,,4706,sadness,fear
1,Standing in prayer with our brothers & sisters...,,,3614,sadness,fear
2,Shaking with terror ! Our God is wrathful & ri...,,,1145,anger,fear
3,"Wow, that was way too close! If it wasn't for ...",,,6815,disgust,fear
4,😱📚 Literary Tropes That Haunt Our Digital Drea...,,,4929,disgust,fear
...,...,...,...,...,...,...
25029,Absolutely disgusted by USER @EEUU &amp; @Vene...,,,1597,joy,disgust
25030,Shocking footage of Notre Dame's devastating f...,,,6784,sadness,fear
25031,Khaleesi slaying the game with her unapologeti...,,,4345,anger,joy
25032,Broken hearts & shattered lives in #Venezuela ...,,,2284,anger,sadness


## Generate Label

In [5]:
import re
def build_label_prompt(synthetic_text, prompt_template=None):
    """Build the labeling prompt for one piece of text.

    The text is substituted for the ``<synthetic_text>`` placeholder in the
    prompt template, then every run of consecutive spaces in the resulting
    prompt is collapsed to a single space.

    Args:
        synthetic_text: The text that should be labeled.
        prompt_template: Optional template containing the ``<synthetic_text>``
            placeholder. When omitted, the template is read from
            ``prompt_config.prompt[dataset_name]['labels']``, where
            ``dataset_name`` is the notebook-level setting.

    Returns:
        The prompt string to send to the LLM.
    """
    if prompt_template is None:
        # Imported at call time, as the notebook originally did.
        from config_files import prompt_config

        prompt_template = prompt_config.prompt[dataset_name]['labels']

    labels_prompt = prompt_template.replace("<synthetic_text>", synthetic_text)

    # re.sub returns a new string; the previous code discarded that result, so
    # the whitespace was never actually collapsed.
    return re.sub(' +', ' ', labels_prompt)

### Prompt LLM for Label

In [6]:
import os
import ollama
from openai import OpenAI

def generate_synthetic_label(dataset_details, llm_details, label_prompt):
    """Send the labeling prompt to the configured LLM platform.

    Args:
        dataset_details: Dataset configuration. ``"num_labelers"`` is read on
            the OpenAI path to request that many completions in one call.
        llm_details: LLM configuration; ``"platform"`` selects the backend
            (``"Ollama"`` or ``"OpenAI"``) and ``"id"`` is the model name.
        label_prompt: The full prompt text, sent as a single user message.

    Returns:
        For Ollama, the content of the single response message. For OpenAI,
        a list with one response string per returned choice.

    Raises:
        ValueError: If ``llm_details["platform"]`` is not a supported platform.
    """
    platform = llm_details["platform"]

    if platform == "Ollama":
        response = ollama.chat(
            model=llm_details["id"],
            messages=[{"role": "user", "content": label_prompt}]
        )

        return response["message"]["content"]

    if platform == "OpenAI":
        # The API key is taken from the environment, never from the notebook.
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

        response = client.chat.completions.create(
            model=llm_details["id"],
            messages=[{"role": "user", "content": label_prompt}],
            n=dataset_details["num_labelers"],
            max_tokens=30
        )

        # A choice's content can be None; normalise it to "" so that callers
        # can always treat every response as a string.
        return [choice.message.content or "" for choice in response.choices]

    # Previously an unknown platform fell through and returned None silently.
    raise ValueError(f"Unsupported LLM platform: {platform!r}")

### Parse Label Response

In [7]:
def parse_label_response(response, dataset_details):
    """Extract a label from one raw LLM response.

    The response is echoed in blue, then matched in two passes:

    1. The first entry of ``dataset_details["label_list"]`` (in list order)
       whose name occurs in the response, ignoring case, wins.
    2. Otherwise the response is searched for a label ID, where ID ``i``
       (1-based) refers to ``label_list[i-1]``. Only whole numbers count, so
       "10" is not mistaken for ID 1.

    Args:
        response: Raw response text, or None if the model returned nothing.
        dataset_details: Dataset configuration providing ``"label_list"``.

    Returns:
        The matching label from ``label_list``, or None when nothing matched.
    """
    import re

    CRED = '\33[91m'
    CEND = '\33[0m'
    CBLU = '\33[34m'

    if response is None:
        # Nothing to parse; report it the same way as an unmatched response
        # instead of failing on the string concatenation below.
        print(f"{CRED}NO LABEL FOUND:{CEND} {response}")
        return None

    print(CBLU + response + CEND)

    label_list = dataset_details["label_list"]

    lowered_response = response.lower()
    for label in label_list:
        if label.lower() in lowered_response:
            return label

    # Label name not found, look for ID number.
    # Compare against complete digit runs (leading zeros ignored) rather than
    # substrings, so that "10" or "21" no longer counts as ID 1.
    numbers_found = {token.lstrip('0') for token in re.findall(r'\d+', response)}
    for i in range(1, len(label_list) + 1):
        if str(i) in numbers_found:
            return label_list[i-1]

    # Label not found
    print(f"{CRED}NO LABEL FOUND:{CEND} {response}")
    return None

### Label Record

In [8]:
def get_labels(dataset_details, llm_details, text):
    """Collect one label per labeler for ``text`` and derive a consensus.

    On Ollama the model is queried once per labeler
    (``dataset_details["num_labelers"]`` calls); on OpenAI a single call
    returns all responses. Any other platform produces no labels. Progress is
    printed as the labels come in.

    Args:
        dataset_details: Dataset configuration (``"label_list"``,
            ``"label_format"``, ``"num_consensus"``, ``"num_labelers"``).
        llm_details: LLM configuration (``"platform"`` and the keys needed by
            ``generate_synthetic_label``).
        text: The record text to label.

    Returns:
        A tuple ``(labels, consensus_label)``: the parsed label of every
        response (None where parsing failed) and the consensus label, or None
        if no label reached ``num_consensus`` votes. Consensus is computed
        only for the ``"single"`` label format; ``"multi"`` is not implemented.
    """
    label_prompt = build_label_prompt(text)

    # Printed before any request is made, so the header precedes all output.
    print("Labels: ", end="")

    platform = llm_details["platform"]
    if platform == "Ollama":
        # Lazy generator: each request is issued right before its response is
        # parsed and printed, one request per labeler.
        responses = (
            generate_synthetic_label(dataset_details, llm_details, label_prompt)
            for _ in range(dataset_details["num_labelers"])
        )
    elif platform == "OpenAI":
        responses = generate_synthetic_label(dataset_details, llm_details, label_prompt)
    else:
        responses = ()

    labels = []
    for position, raw_response in enumerate(responses):
        parsed = parse_label_response(raw_response, dataset_details)
        labels.append(parsed)

        separator = ", " if position > 0 else ""
        print(f"{separator}{parsed}", end="")

    consensus_label = None
    label_format = dataset_details["label_format"]

    if label_format == "single":
        # Single label dataset: every label with enough votes is reported, and
        # the last such label in label_list order is the one returned.
        for candidate in dataset_details["label_list"]:
            if labels.count(candidate) < dataset_details["num_consensus"]:
                continue
            consensus_label = candidate
            print(f"\n\tConsensus: {consensus_label}")
    # "multi": to be implemented if used with any multilabel datasets.

    return labels, consensus_label

### Saving the Dataset

In [9]:
def save_dataset(dataset_details, llm_details, working_data):
    """Write the working dataset to its parquet file.

    The file is stored at ``./<dataset id>/<llm id>.parquet``, with every
    ``:`` in the LLM id replaced by ``_``. The directory is created if it
    does not exist yet.

    Args:
        dataset_details: Dataset configuration; ``"id"`` names the directory.
        llm_details: LLM configuration; ``"id"`` names the file.
        working_data: Object exposing ``to_parquet(path=...)`` (the notebook
            passes the pandas DataFrame being labeled).

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    directory = f"./{dataset_details['id']}/"
    filename = llm_details['id'].replace(":", "_") + ".parquet"

    # Create the directory up front. The earlier "retry after any OSError"
    # approach called os.makedirs even when the directory already existed, so
    # an unrelated write failure surfaced as a misleading FileExistsError.
    os.makedirs(directory, exist_ok=True)
    working_data.to_parquet(path=directory+filename)

    print("+ Synthetic dataset saved!")

# Workflow

In [None]:
unsaved_count = 0

# Start with records that haven't been attempted
unattempted = dataset['all labels'].isnull()

if not unattempted.any():
    # idxmax() on an all-False mask returns the first index, which would
    # silently re-label (and re-bill) the whole dataset from record 0.
    print("All records have already been attempted; nothing to label.")
else:
    no_label_start = unattempted.idxmax()
    print(f"Starting at #{no_label_start}\n")

    # NOTE(review): mixing the index label from idxmax() with range()/len()
    # assumes the default 0..n-1 index -- confirm if the frame is re-indexed.
    for i in range(no_label_start, len(dataset.index)):
        print(i)
        record_text = dataset.at[i, 'text']
        print(f"Text: {record_text}")

        labels, consensus_labels = get_labels(dataset_metadata, llm_metadata, record_text)
        dataset.at[i, 'labels'] = consensus_labels
        dataset.at[i, 'all labels'] = labels

        print("--------------------------------------------------------------------------\n")

        # Saving in batches
        unsaved_count += 1
        if unsaved_count >= 10:
            save_dataset(dataset_metadata, llm_metadata, dataset)
            unsaved_count = 0

    # Flush the final partial batch so the last few records are not left
    # unsaved when the record count is not a multiple of the batch size.
    if unsaved_count > 0:
        save_dataset(dataset_metadata, llm_metadata, dataset)
        unsaved_count = 0

Starting at #0

0
Text: Disturbing reports from #Venezuela - when desperation reigns, even darkness lurks in every shadow. Stay vigilant https://t.co/8y6Ljw4eEJ #StaySafe
Labels: [34mFear[0m
fear[34mFear[0m
, fear[34mFear[0m
, fear
	Consensus: fear
--------------------------------------------------------------------------

1
Text: Standing in prayer with our brothers & sisters in Christ living in Venezuela, pleading for God's protection from harm. Lord, have mercy on them as they face uncertainty & danger. #venezuela #prayforpeace #bodyofChrist #brothersNsisters #fearnot #Yeshua
Labels: [34mSadness[0m
sadness[34mFear[0m
, fear[34mFear[0m
, fear
	Consensus: fear
--------------------------------------------------------------------------

2
Text: Shaking with terror ! Our God is wrathful & righteous ! What's coming for you wretched oppressors of Venezuela ! Stealing food from children, crushing dissent with violence !! I tremble at the thought of His judgement!!! #Venezuela #D

In [11]:
# Write the current state of `dataset` to disk, including any records labeled
# since the last batch save in the labeling loop.
save_dataset(dataset_metadata, llm_metadata, dataset)

+ Synthetic dataset saved!


In [31]:
# Number of records per consensus label (records without a consensus are not counted).
dataset['labels'].value_counts()

labels
joy         5017
disgust     4189
fear        2157
sadness     2126
surprise    2014
anger        577
Name: count, dtype: int64

In [16]:
print('Records that were not attempted.')

# A record with no entry in 'all labels' was never sent to the labelers.
dataset.loc[dataset['all labels'].isna()]

Records that were not attempted.


Unnamed: 0,text,labels,all labels,source index,source label,intended label


In [15]:
print('Records that failed labeling')

# Votes were collected ('all labels' is set) but no consensus label was stored.
dataset.loc[dataset['all labels'].notna() & dataset['labels'].isna()]

Records that failed labeling


Unnamed: 0,text,labels,all labels,source index,source label,intended label
5,HASHTAG thinks he's above the law with his rec...,,"[disgust, anger, fear]",1883,disgust,fear
339,USER Just learned that the Russian government ...,,"[disgust, surprise, sadness]",4854,disgust,surprise
378,Seeing Arya's scenes in the ep had me on edge ...,,"[fear, joy, surprise]",1834,joy,fear
704,Whoa just read about HASHTAG's latest actions!...,,"[surprise, None, disgust]",1210,joy,surprise
1156,Unbelievable! BOLTON ADMITS IT WAS A COUP AND ...,,"[disgust, surprise, anger]",2688,surprise,anger
1495,OHNO steez. Chacos (without socks) and CHAMPIO...,,"[joy, disgust, surprise]",438,joy,fear
1597,Daenerys kept saying she had dragons but they ...,,"[disgust, anger, sadness]",3191,anger,sadness
2205,Disappointed in the latest episode WEBSITE it ...,,"[disgust, None, None]",2419,joy,sadness
2288,Are you kidding me?! People are OUTRAGEOUSLY u...,,"[anger, disgust, surprise]",4038,joy,disgust
2393,Breaking news: our HASHTAG PM just met with HA...,,"[anger, disgust, surprise]",6505,disgust,surprise


In [29]:
import numpy as np

def _two_labelers_agree(votes):
    """Return True when some vote (None included) occurs exactly twice.

    Accepts a list as written by the labeling loop, an array as read back
    from parquet, or a missing value (treated as "no agreement").
    """
    try:
        votes = list(votes)
    except TypeError:
        # Missing 'all labels' entry: the record was never attempted.
        return False
    return any(votes.count(vote) == 2 for vote in votes)


print("Records where 2 of 3 labelers agree")
# The earlier `x == x[0]` comparison is only element-wise for arrays; for the
# plain lists stored during labeling it compared the whole list to one label
# and matched nothing, and it raised on records without votes.
dataset[dataset['all labels'].apply(_two_labelers_agree)]

Records where 2 of 3 labelers agree


Unnamed: 0,text,labels,all labels,source index,source label,intended label
4,"Meanwhile, the HASHTAG has me shuddering at th...",disgust,"[disgust, disgust, fear]",3546,joy,fear
11,"OMG for the 1st time in my life, I’m terrified...",fear,"[fear, surprise, fear]",5499,surprise,fear
16,Reports of a massive fire engulfing the histor...,fear,"[fear, fear, sadness]",6214,sadness,fear
17,Sending so much worry and darkness to HASHTAG ...,fear,"[fear, sadness, fear]",84,joy,fear
18,WHAT IF the next fire isn't just another build...,anger,"[anger, disgust, anger]",1220,disgust,fear
...,...,...,...,...,...,...
11683,Macron's Notre Dame Plan Revealed: A Stunning ...,joy,"[joy, joy, surprise]",1027,disgust,surprise
11696,HASHTAG $600 MILLION RAISED IN A DAY TO SAVE A...,surprise,"[surprise, surprise, disgust]",6976,anger,joy
11715,"Ugh, I just tried to make April 28th Arya Star...",disgust,"[disgust, sadness, disgust]",4738,joy,disgust
11727,JUST FOUND OUT there's a huge cliffhanger at t...,joy,"[joy, surprise, joy]",3875,joy,fear


In [30]:
print("Records where the intended label is not the given label.")

# Records without a consensus label also count as mismatches here.
dataset.loc[dataset['labels'].ne(dataset['intended label'])]

Records where the intended label is not the given label.


Unnamed: 0,text,labels,all labels,source index,source label,intended label
4,"Meanwhile, the HASHTAG has me shuddering at th...",disgust,"[disgust, disgust, fear]",3546,joy,fear
5,HASHTAG thinks he's above the law with his rec...,,"[disgust, anger, fear]",1883,disgust,fear
9,OH MY GOODNESS... That free kick... IT'S COMIN...,surprise,"[surprise, surprise, surprise]",6253,surprise,fear
18,WHAT IF the next fire isn't just another build...,anger,"[anger, disgust, anger]",1220,disgust,fear
21,OMG YOU GUYS I JUST HEARD THE NEWS!!! I'M SHAK...,joy,"[joy, joy, joy]",456,sadness,fear
...,...,...,...,...,...,...
15957,By tomorrow a billion dollars will be raised f...,disgust,"[disgust, disgust, disgust]",3501,anger,joy
15990,US just dropped BILLIONS on Notre Dame RESTORE...,disgust,"[disgust, disgust, disgust]",5864,disgust,joy
16027,Wow. That's just PERFECT isn't it.,disgust,"[disgust, disgust, disgust]",515,disgust,joy
16031,USER USER Maduro Military are using vehicles t...,surprise,"[surprise, surprise, surprise]",3996,anger,joy


In [32]:
# All records whose intended label is 'anger'.
dataset.loc[dataset['intended label'].eq('anger')]

Unnamed: 0,text,labels,all labels,source index,source label,intended label
510,Outrageous response from last night's question...,disgust,"[disgust, disgust, disgust]",121,joy,anger
515,Are you kidding me with THAT HASHTAG episode?!...,anger,"[anger, anger, anger]",7242,surprise,anger
517,"RANT! Some BOOKS are utter garbage, polluting ...",disgust,"[disgust, disgust, disgust]",378,joy,anger
520,USER Unbelievable!!! Sarah.... What's going on...,anger,"[anger, anger, disgust]",4474,joy,anger
523,Are you kidding me with the OUTRAGEOUS donatio...,disgust,"[disgust, disgust, disgust]",6731,disgust,anger
...,...,...,...,...,...,...
13512,AREYOUKIDDINGMEWITHTONEPISODEOFGoTEVER HASHTAG...,surprise,"[surprise, surprise, surprise]",1666,joy,anger
13518,"Are you kidding me?! ""The struggle to read is ...",disgust,"[disgust, disgust, disgust]",1299,joy,anger
13524,"OUTRAGE: Yet again, Twitter-FAIL has censored ...",anger,"[anger, anger, anger]",3749,joy,anger
13530,YOUR IGNORANCE IS A CHAIN THAT Binds YOU TIGHT...,disgust,"[disgust, disgust, disgust]",6099,joy,anger
