# Emotion Intensity Regression using EmoLLM

In [1]:
%%HTML
<style>
    /* Cosmetic only: sets the --vscode-font-family CSS variable on <body>
       so the notebook renders with the "ComicShannsMono Nerd Font" face. */
    body{
 --vscode-font-family: "ComicShannsMono Nerd Font";
    }
</style>

## Import Libraries

In [2]:
import ast
import os
import re
import warnings

import pandas as pd
import torch
from tqdm import tqdm
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

warnings.filterwarnings("ignore")

In [3]:
# Restrict CUDA to device index 1 for this kernel, then echo the value that
# was actually stored in the environment.
gpu_index = '1'
os.environ.update({'CUDA_VISIBLE_DEVICES': gpu_index})
print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

CUDA_VISIBLE_DEVICES: 1


## Load the Model

In [4]:
# Checkpoint id and local download cache, shared by the tokenizer and the model.
MODEL_NAME = 'lzw1008/Emollama-chat-7b'
CACHE_DIR = "cache"
shared_load_kwargs = dict(device_map='auto', cache_dir=CACHE_DIR)

# Optional 8-bit loading, currently disabled. To re-enable, uncomment this
# config and the `quantization_config=` argument in the model call below.
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     llm_int8_threshold=6.0,
#     llm_int8_has_fp16_weight=True,
# )

# Slow (non-"fast") tokenizer implementation is requested explicitly.
# NOTE(review): `device_map` is a model-placement option; it presumably has no
# effect on a tokenizer -- confirm before removing it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
    **shared_load_kwargs
)

# Pin the special-token ids and pad on the left so that, in a padded batch,
# every prompt ends at the last position, right where generation continues.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2

# Weights are loaded with the library's default dtype; `device_map='auto'`
# lets the loader decide where to place them.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # quantization_config=quantization_config,
    # torch_type=torch.float16,  # NOTE(review): keyword is presumably `torch_dtype` -- confirm
    **shared_load_kwargs
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Load the Data

In [5]:
# Locations of the two sentence-tokenized story tables.
human_story_path = "../data/tokenized_human_stories.csv"
model_story_path = "../data/tokenized_model_stories.csv"

# Read both CSVs in one pass; the order matches the tuple of paths.
human_story_df, model_story_df = (
    pd.read_csv(csv_path) for csv_path in (human_story_path, model_story_path)
)

# Preview the first rows of the human-story table.
human_story_df.head()

Unnamed: 0.1,Unnamed: 0,Prompt,Story,Model,Length,Sentences,Sentences Length
0,0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...",Human,1076,"['3,000 years have I been fighting.', 'Every m...",21
1,1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...",Human,1315,"[""“Dad, you 're on TV again !” I heard Eric 's...",17
2,2,A scientific study proves that all humans have...,"When Tyler entered the ward, his daughter Vale...",Human,4420,"['When Tyler entered the ward, his daughter Va...",44
3,3,Write a story about an elderly wizard and his ...,His body was failing. He had taken care of it ...,Human,4575,"['His body was failing.', 'He had taken care o...",58
4,4,"You have become death, destroyer of worlds.","I saw the button. It was simple, red, no words...",Human,842,"['I saw the button.', 'It was simple, red, no ...",11


In [6]:
def _parse_sentences(raw):
    """Turn a stringified sentence list from the CSV back into a Python list.

    The CSV stores each story's sentences as the ``repr`` of a list of strings.
    ``ast.literal_eval`` parses that literal without executing code, unlike
    the built-in ``eval`` which would run anything found in the file.

    Values that are already lists are returned untouched, so this cell can be
    re-run without failing on the already-parsed column.
    """
    if isinstance(raw, list):
        return raw
    return ast.literal_eval(raw)


human_story_df['Sentences'] = human_story_df['Sentences'].apply(_parse_sentences).tolist()
model_story_df['Sentences'] = model_story_df['Sentences'].apply(_parse_sentences).tolist()

## Inference

In [7]:
# Generation below samples (do_sample=True), so seed torch's RNG here to make
# a Restart & Run All produce the same scores every time.
SEED = 42
torch.manual_seed(SEED)

# Upper bound on the number of tokens generated per prompt.
max_new_tokens = 256

# Keyword arguments forwarded unchanged to every model.generate() call.
generation_config = dict(
    temperature=0.9,
    top_k=30,
    top_p=0.6,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=max_new_tokens,
    # Reuse the tokenizer's pad id so padded batches are handled consistently.
    pad_token_id=tokenizer.pad_token_id,
)


In [8]:
def extract_intensity_score(text) -> float:
    """Extract the number that follows an "Intensity Score:" label in ``text``.

    Only a well-formed decimal is captured (``1``, ``0.75`` or ``.5``), so
    trailing punctuation such as the final period in ``"Intensity Score: 0.5."``
    is ignored instead of being fed to ``float()`` and raising ``ValueError``.
    A label followed by something that is not a number is skipped and the
    search continues with later labels in the text.

    Args:
        text: Decoded model output to scan.

    Returns:
        The first score found, as a float, or ``-1.0`` when the text has no
        "Intensity Score:" label followed by a number.
    """
    pattern = r'Intensity\s+Score:\s*(\d+(?:\.\d+)?|\.\d+)'
    match = re.search(pattern, text)
    if match is None:
        # Sentinel for "no score found"; a float to honour the annotation.
        return -1.0
    return float(match.group(1))

def get_prompt(sentence_list: list, emotion: str) -> list:
    """Build one intensity-scoring prompt per sentence for a given emotion.

    The emotion name is interpolated into the template once; each sentence is
    then substituted for the ``<STORY>`` placeholder. Every prompt ends with
    "Intensity Score:" followed by four spaces, because the template's closing
    quotes sit on an indented line.

    Args:
        sentence_list: Sentences (strings) to score.
        emotion: Name of the emotion whose intensity is requested.

    Returns:
        A list of prompt strings, in the same order as ``sentence_list``.
    """
    prompt_template=f"""\
Task: Assign a numerical value between 0 (least {emotion}) and 1 (most {emotion}) to represent the intensity of emotion {emotion} expressed in the part of story.\n\
Text: <STORY>\n\
Emotion: {emotion}\n\
Intensity Score:\
    """
    return [
        prompt_template.replace("<STORY>", sentence)
        for sentence in sentence_list
    ]


# Put the model in evaluation mode before any generation is run.
model.eval()
def inference(
        sentence_list: list,
        emotion_list: list,
        batch_size: int = 1,
) -> dict:
    """Score every sentence for every emotion with the loaded model.

    For each emotion, prompts are built with ``get_prompt``, tokenized and
    generated in batches of ``batch_size``, decoded, and parsed with
    ``extract_intensity_score``.

    Args:
        sentence_list: Sentences of a single story.
        emotion_list: Emotion names to score each sentence against.
        batch_size: Number of prompts sent to ``model.generate`` at once.

    Returns:
        A dict mapping each emotion to a list of scores, one per sentence and
        in sentence order. A score is whatever ``extract_intensity_score``
        returns for the decoded output, including its "not found" value.
    """
    scores_by_emotion = {}
    for emotion in emotion_list:
        prompts = get_prompt(sentence_list, emotion)
        emotion_scores = scores_by_emotion[emotion] = []
        for start in range(0, len(prompts), batch_size):
            # Slicing clamps at the end of the list, so the final batch may
            # simply be shorter than batch_size.
            prompt_batch = prompts[start:start + batch_size]
            encoded = tokenizer(prompt_batch, return_tensors='pt', padding=True)
            # Move the inputs onto the model's device and generate.
            generated = model.generate(
                encoded.input_ids.to(model.device),
                attention_mask=encoded.attention_mask.to(model.device),
                **generation_config
            )
            decoded = tokenizer.batch_decode(
                generated,
                skip_special_tokens=True,
                space_between_special_tokens=False
            )
            # One parsed score per decoded sequence, in batch order.
            emotion_scores.extend(
                extract_intensity_score(reply) for reply in decoded
            )

    return scores_by_emotion


In [9]:
# The eight emotions each sentence is scored against.
PLUTCHIK_EMOTION_LIST = [
    "anger",
    "anticipation",
    "joy",
    "trust",
    "fear",
    "surprise",
    "sadness",
    "disgust",
]

# Number of prompts sent to the model per generate() call.
BATCH_SIZE = 12


def _score_stories(story_sentences, desc: str) -> list:
    """Run ``inference`` on each story and collect the per-story score dicts.

    Args:
        story_sentences: Iterable whose items are one story's sentence list.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list with one ``inference`` result per story, in input order.
    """
    return [
        inference(
            sentences,
            emotion_list=PLUTCHIK_EMOTION_LIST,
            batch_size=BATCH_SIZE,
        )
        for sentences in tqdm(story_sentences, desc=desc)
    ]


# Same scoring pipeline for both corpora; only the input and label differ.
human_emotion_score_list = _score_stories(human_story_df['Sentences'], desc="Human Stories")
model_emotion_score_list = _score_stories(model_story_df['Sentences'], desc="Model Stories")

Human Stories: 100%|██████████| 96/96 [1:05:05<00:00, 40.68s/it]
Model Stories: 100%|██████████| 576/576 [5:18:26<00:00, 33.17s/it]  


In [10]:
# One row per story and one column per emotion; each cell holds that story's
# list of per-sentence scores.
human_emotion_df = pd.DataFrame(human_emotion_score_list)
model_emotion_df = pd.DataFrame(model_emotion_score_list)

# Append the emotion columns to copies of the story tables (originals untouched).
scored_human_story_df = human_story_df.assign(**human_emotion_df)
scored_model_story_df = model_story_df.assign(**model_emotion_df)

## Save the Output

In [11]:
# Destination files for the scored tables.
human_story_emotion_scored_output = "../data/human_story_emotion_scored.csv"
model_story_emotion_scored_output = "../data/model_story_emotion_scored.csv"

# Write each scored table to its CSV without the DataFrame index column.
for scored_df, output_path in (
    (scored_human_story_df, human_story_emotion_scored_output),
    (scored_model_story_df, model_story_emotion_scored_output),
):
    scored_df.to_csv(output_path, index=False)