In [1]:
from datasets import load_dataset

import numpy as np
import pandas as pd
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from huggingface_hub import login
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import random

from sklearn.metrics.pairwise import cosine_similarity

import json
import ast
import emoji
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
import nltk

# For BERT
from transformers import DistilBertTokenizerFast, DistilBertModel



# ISEAR preprocessing + Example generation

In [2]:
enisear = pd.read_csv('../enISEAR.tsv', delimiter= '\t')
emotions = enisear['Prior_Emotion'].unique()
enisear['Prior_Emotion'].nunique()
#emotions
len(enisear)

1001

In [None]:


def get_equal_sample(df, endsize: int, rand_state: int):

    """function to extract a sample of equal size of text per emotion

    Returns:
        dictionary: containing equal size of samples per emotion
    """

    sample_size_per_class = endsize // df['Prior_Emotion'].nunique()

    # Stratified sampling
    equal_sample = (
        df.groupby('Prior_Emotion', group_keys=False)
        .apply(lambda x: x.sample(sample_size_per_class, random_state=rand_state))
        .reset_index(drop=True)
    )

    return equal_sample[['Sentence', 'Prior_Emotion', 'Gender']]

enisear_examples = get_equal_sample(enisear, 7, 321)
enisear_examples
examples = []

for idx, row in enisear_examples.iterrows():
    text_for_example = f'''
"Prior_Emotion": "{row['Prior_Emotion']}",
"Sentence": "{row['Sentence']}",
'''
    examples.append({'emotion':row['Prior_Emotion'], 'Sentence':text_for_example})


# Persona Setup

In [5]:
from huggingface_hub import login

API_TOKEN = ''

login(token = API_TOKEN)

In [6]:
ds_persona = load_dataset("proj-persona/PersonaHub", "persona")

In [7]:
ds_persona

DatasetDict({
    train: Dataset({
        features: ['persona'],
        num_rows: 200000
    })
})

# Pipelines per Model

In [8]:

model_id_33="meta-llama/Llama-3.3-70B-Instruct"
model_id_31 ="meta-llama/Llama-3.1-70B-Instruct"
model_id_qwen3_32B = "Qwen/Qwen3-32B"

def create_pipeline(model_id):
    

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,             
        bnb_4bit_quant_type="nf4",     
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    
    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=0,            
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16     
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
 
    pipeline_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
        )

    return pipeline_llm, model, tokenizer

torch.cuda.empty_cache()

pipeline_llama_31, _, _ = create_pipeline(model_id_31)

#torch.cuda.empty_cache()

#pipeline_llama_33, _, _  = create_pipeline(model_id_33)

#torch.cuda.empty_cache()

#_, pipeline_qwen3_32B_model, pipeline_qwen3_32B_tokenizer = create_pipeline(model_id_qwen3_32B)




Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0


In [9]:
emotions = list(enisear['Prior_Emotion'].unique())
emotions

['Fear', 'Shame', 'Guilt', 'Disgust', 'Sadness', 'Anger', 'Joy']

In [11]:

def create_prompt(tries):

    """Create prompt used for all LLMs

    Args: 
        tries: number of data elements wanted per emotion
    

    Retruns:
        complete prompt per text for the LLM with randomly selected example - 1-shot
    
    """
    persona_prompts = []

    for _ in range(tries):
        #random_persona = random.randint(0, 200000)
        #persona = ds_persona['train'][random_persona]['persona']

        for emotion in emotions:  # Loop over emotions

            persona_prompt = {}

            #  Set up the rules and persona
            synth_isear_prompt = f"""
            You are currently participating in a psychological research study on emotions.  You must complete the following sentence: 
            'I felt {emotion} when ...', describing an experience when you felt  {emotion}.
            Do not change the pattern of the sentence. Do not change the provided emotion.
            Your response must be elaborate and at least 30 words long. Do not reuse the example. Do not change the dictionary key format. 
            """
            
            example = next((e['Sentence'] for e in examples if e['emotion'] == emotion), None)

            messages = [
            {"role": "system", "content": synth_isear_prompt},
        
            {"role": "user", "content": f"Example for emotion: {emotion}"},
            {"role": "assistant", "content": example},
        
            {"role": "user", "content": 'Now, generate one response for this emotion. Follow the format strictly: {"Prior_Emotion": "", "Sentence": ""}. Strictly follow all provided instructions!'}
            ]

            #persona_prompt['persona'] = persona
            persona_prompt['emotion'] = emotion  
            persona_prompt['messages'] = messages


            persona_prompts.append(persona_prompt)

    return persona_prompts


In [12]:
messages = create_prompt(143)



In [15]:
def run_llama(messages, pipeline):
    """Run the llama models on the dataframe
    
    Args:
        df (dataframe): containing text the model should label
        pipeline : previously defined pipeline of the model

    Returns:
        dataframe containing predictions
    """
    output_list = []
    for i in tqdm(messages, desc="Processing messages"):    
        outputs = pipeline(
            i['messages'],
            max_new_tokens=1000,
            temperature = 1,
            pad_token_id=pipeline.tokenizer.eos_token_id 
        )
        
        answer = outputs[0]['generated_text'][-1]['content']
    
        output_list.append(answer)
    return output_list
output_llama31 = run_llama(messages, pipeline_llama_31)
#output_llama33 = run_llama(messages, pipeline_llama_33)


Processing messages:   1%|          | 10/1001 [00:37<1:04:00,  3.88s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing messages: 100%|██████████| 1001/1001 [1:11:33<00:00,  4.29s/it]


In [None]:
def run_qwen(messages, tokenizer, model):

    """Run qwen models on the dataframe

    
    Args:
        df (dataframe): containing text the model should label
        model and tokenizer : previously defined for the model

    Returns:
        dataframe containing predictions
    """    
    output_list = []
    
    for i in tqdm(messages, desc="Processing messages"):    

        text = tokenizer.apply_chat_template(
            i['messages'],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        
        # conduct text completion
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=32768
        )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
        
        # parsing thinking content
        try:
            # rindex finding 151668 (</think>)
            index = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            index = 0
        
        content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

        output_list.append(content)
    return output_list


#output_qwen3_32B = run_qwen(messages, pipeline_qwen3_32B_tokenizer, pipeline_qwen3_32B_model)



In [16]:

def clean_entry(entry):
    """Normalizing all results to extract json format from string

    Args:
        entry (string): text from answer list of llm output

    Returns:
        json: entry in json format for easier handling
    """

    entry = entry.replace('\\n', ' ').replace("\\", '')  # Remove escape characters


    entry = re.sub(r'\(Note[^\)]+\)', '', entry)  # Remove unnecessary notes
    if not entry.endswith('}'):
        entry += '}'  # Fix missing closing bracket if needed

    if not entry.startswith('{'):
        entry = '{' + entry
    return entry

def fix_json(entry):
    try:
        return json.loads(entry) 
    except json.JSONDecodeError:
        try:
            fixed_entry = ast.literal_eval(entry)  
            return json.loads(json.dumps(fixed_entry)) 
        except (SyntaxError, ValueError):
            return None  
def fix_all(output_list):
    

    output_list_cleaned = [clean_entry(i) for i in output_list]

    output_list_fixed = [fix_json(entry) for entry in output_list_cleaned if fix_json(entry) is not None]

    return output_list_fixed, output_list_cleaned




#output_qwen3_32B_fixed, output_qwen3_32B_cleaned = fix_all(output_qwen3_32B)
output_llama31_fixed, output_llama31_cleaned = fix_all(output_llama31)
#output_llama33_fixed, output_llama33_cleaned = fix_all(output_llama33)




In [17]:

def print_fail(model, cleaned):

    """Print entries that failed, to manually correct these final entries
    """

    failed_entries = [entry for entry in cleaned if fix_json(entry) is None]

    print(f"\n\n Unmatched entries {model}: ({len(failed_entries)}):")
    for entry in failed_entries:
        print(entry)

#print_fail('Qwen3-32B', output_qwen3_32B_cleaned)
print_fail('LlaMa 3.1', output_llama31_cleaned)
#print_fail('LlaMa 3.3', output_llama33_cleaned)





 Unmatched entries LlaMa 3.1: (0):


In [47]:
# manually clean failed entries
# manually_fixed = [{"Prior_Emotion": "Fear", "Sentence": "I felt Fear when I was walking home alone through the empty streets after a late shift at the library, and I heard a sudden loud crash behind me, making me freeze in place, unsure if it was a threat or just the wind."}]



In [18]:
#qwen3_32B_results = pd.DataFrame(output_qwen3_32B_fixed)
llama31_results = pd.DataFrame(output_llama31_fixed)
#llama33_results = pd.DataFrame(output_llama33_fixed)


llama31_results.to_csv('llama31_no_persona.csv')
#llama33_results.to_csv('llama33_no_persona.csv')

In [28]:
for idx, row in llama33_results.iterrows():
    if 35 <= idx <= 41:
        print(row['Sentence'])


I felt Fear when I was on a solo hike in the mountains and a sudden dense fog rolled in, reducing visibility to almost zero, making it impossible for me to see the trail or find my way back.
I felt Shame when I accidentally posted a personal story on my professional social media account, which was seen by my colleagues and clients, causing me to feel deeply embarrassed and unprofessional.
I felt Guilt when I convinced my younger sister to skip school and go to the mall with me, and afterwards she got in trouble with our parents for missing a crucial exam, which made me realize the gravity of my reckless suggestion.
I felt Disgust when I stumbled upon a blog with outdated and poorly written content that was still trying to garner attention in the age of Instagram and TikTok, where short-form, visually-appealing posts have become the norm.
I felt Sadness when I saw a photo of my grandfather, who had passed away a few years ago, and it brought back a flood of memories of the times we spen

In [None]:
def words_stats(results):

    """Check if model correctly produced samples of ~ 30 words.
    
    """
    results['Word_Count'] = results['Sentence'].apply(lambda x: len(x.split()))

    mean_length = results['Word_Count'].mean() 
    std_dev = results['Word_Count'].std()  
    quartiles = np.percentile(results['Word_Count'], [25, 50, 75])
    
    print(f"Mean Text Length: {mean_length:.2f} words")
    print(f"Standard Deviation: {std_dev:.2f} words")
    print(f"Quartiles (25th, 50th, 75th percentiles): {quartiles}")
    print(f"Max length of words: {results['Word_Count'].max()}")
    print(f"Min length of words: {results['Word_Count'].min()}")

#words_stats(qwen3_30BA3B_results)
#words_stats(llama31_results)
words_stats(llama33_results)

In [19]:

def remove_emotions_and_i_felt(text):

    
    text = text.lower()

    text = re.sub(r"\b(shame|disgust|guilt|joy|sadness|anger|fear|ashamed|disgusted|guilty|joyful|angry|sad|scared)\b", "", text)

    text = re.sub(r"\b(i felt|i feel|felt|feel)\b\s*", "", text, flags=re.IGNORECASE)
    return text.encode().decode('unicode_escape')



#qwen3_32B_results['Sentence'] = qwen3_32B_results['Sentence'].apply(remove_emotions_and_i_felt)

llama31_results['Sentence'] = llama31_results['Sentence'].apply(remove_emotions_and_i_felt)

#llama33_results['Sentence'] = llama33_results['Sentence'].apply(remove_emotions_and_i_felt)


#qwen3_30BA3B_results['Sentence'] = qwen3_30BA3B_results['Sentence'].apply(remove_emotions_and_i_felt)


In [20]:
llama31_results.to_csv('llama31_no_persona.csv')

In [None]:
def dupli_vaL_cts(results):

    """Function to check for duplicate values in LLM results
    """

    duplicates = results[results['Sentence'].duplicated(keep=False)]
    
    print("\n Duplicate Rows:")
    print(duplicates)
    
    print("\nFull Text of Duplicate Sentences:")
    for idx, row in duplicates.iterrows():
        print(f"Index {idx}: {row['Sentence']}")
    
    print("\n Value Counts for 'Prior_Emotion':")
    print(results['Prior_Emotion'].value_counts())
dupli_vaL_cts(llama33_results)


In [22]:

#qwen3_32B_results.to_csv('qwen3_32B_full_emotions.csv')
#llama31_results.to_csv('llama31_full_emotions.csv')
llama33_results.to_csv('llama33_full_emotions.csv')