In [None]:
# import os
import pandas as pd
from tqdm import tqdm


# add secrets

In [2]:
# CSV file for each data split, keyed by split name.
meld_files = dict(
    train="data/MELD.Raw/train_sent_emo.csv",
    dev="data/MELD.Raw/dev_sent_emo.csv",
    test="data/MELD.Raw/test_sent_emo.csv",
)

# Read each split into its own DataFrame and report how many rows it has.
datasets = {}
for split in meld_files:
    file = meld_files[split]
    df = datasets[split] = pd.read_csv(file)
    print(f"Loaded {file} with {len(df)} samples")

# Stack all splits into one frame; ignore_index=True renumbers the rows.
df_all = pd.concat(list(datasets.values()), ignore_index=True)

Loaded data/MELD.Raw/train_sent_emo.csv with 9989 samples
Loaded data/MELD.Raw/dev_sent_emo.csv with 1109 samples
Loaded data/MELD.Raw/test_sent_emo.csv with 2610 samples


In [9]:
# Preview the first rows. A bare last expression uses the notebook's rich
# table rendering instead of the line-wrapped plain text that print() emits.
df_all.head()

   Sr No.                                          Utterance          Speaker  \
0       1  also I was the point person on my companys tr...         Chandler   
1       2                   You mustve had your hands full.  The Interviewer   
2       3                            That I did. That I did.         Chandler   
3       4      So lets talk a little bit about your duties.  The Interviewer   
4       5                             My duties?  All right.         Chandler   

    Emotion Sentiment  Dialogue_ID  Utterance_ID  Season  Episode  \
0   neutral   neutral            0             0       8       21   
1   neutral   neutral            0             1       8       21   
2   neutral   neutral            0             2       8       21   
3   neutral   neutral            0             3       8       21   
4  surprise  positive            0             4       8       21   

      StartTime       EndTime  
0  00:16:16,059  00:16:21,731  
1  00:16:21,940  00:16:23,442  
2 

In [4]:
%%capture
!pip install -U transformers datasets bitsandbytes accelerate

In [None]:
from transformers import pipeline
from transformers import BitsAndBytesConfig
from transformers import set_seed
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Seed the random number generators before anything stochastic runs.
set_seed(42)

# Model identifiers that are later handed to `from_pretrained`.
instruction_tuned_model_name = "google/gemma-2-2b-it"
pretrained_model_name = "google/gemma-2-2b"

# bitsandbytes settings: request 4-bit weight loading.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)


In [6]:
def load_model_and_tokenizer(
               model_name,
               model_kwargs=None
               ):
  """Load a causal language model together with its tokenizer.

  Args:
    model_name: Identifier or path passed to ``from_pretrained`` for both the
      tokenizer and the model.
    model_kwargs: Keyword arguments forwarded to
      ``AutoModelForCausalLM.from_pretrained``. When ``None`` (the default) the
      notebook defaults are used: the module-level ``quantization_config`` if
      CUDA is available (otherwise no quantization), ``max_length=1024``,
      float16 weights and ``device_map='auto'``.

  Returns:
    A ``(model, tokenizer)`` tuple. The tokenizer is configured for left
    padding and is guaranteed to have a pad token.
  """
  if model_kwargs is None:
    # Built at call time (not in the signature) so the CUDA check and the
    # current `quantization_config` are evaluated on every call, and no dict
    # is shared between calls.
    model_kwargs = {
        'quantization_config': quantization_config if torch.cuda.is_available() else None,
        # NOTE(review): `max_length` is a generation setting rather than a
        # loading option; kept for backward compatibility - confirm it is needed.
        'max_length': 1024,
        # `torch_dtype` is deprecated in the installed transformers release
        # (see the warning this notebook printed); `dtype` is its replacement.
        'dtype': torch.float16,
        'device_map': 'auto',
    }

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Only fall back to EOS when the tokenizer ships without a pad token;
  # overwriting an existing pad token makes padding indistinguishable from EOS.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  # Pad on the left so generated tokens directly follow each prompt in a batch.
  tokenizer.padding_side = 'left'

  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      **model_kwargs,
  )

  return model, tokenizer

In [7]:
# Base checkpoint and its tokenizer.
model_pretrained, tokenizer_pretrained = load_model_and_tokenizer(
    model_name=pretrained_model_name,
)
# Instruction-tuned checkpoint and its tokenizer.
model_instruction_tuned, tokenizer_instruction_tuned = load_model_and_tokenizer(
    model_name=instruction_tuned_model_name,
)

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [8]:
@torch.no_grad()
def run_model(prompt,
              model,
              tokenizer,
              apply_chat_template=False,
              generation_kwargs={
                  'min_new_tokens': 1,
                  'max_new_tokens': 25,
                  'temperature': 0.1,
                  'top_p': 0.95,
                  'do_sample': True,
              }
              ):
  """Generate a completion for a single prompt and return its first line.

  Args:
    prompt: Text to complete.
    model: Model exposing ``generate`` and ``device``.
    tokenizer: Tokenizer matching ``model``.
    apply_chat_template: When true, wrap ``prompt`` as a single user turn with
      the tokenizer's chat template before tokenizing.
    generation_kwargs: Keyword arguments forwarded to ``model.generate``.
      The default dict is only read, never modified.

  Returns:
    The newly generated text with special tokens removed, stripped, and cut at
    the first newline.
  """
  # Reseed on every call so sampling is repeatable for the same prompt.
  set_seed(42)

  tokenizer_kwargs = {}
  if apply_chat_template:
    prompt = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False,
        # End the prompt with the start of the assistant turn so the model
        # produces its reply instead of having to open the turn itself.
        add_generation_prompt=True,
    )
    # The rendered template already carries its special tokens; adding them
    # again while tokenizing would duplicate them (e.g. a second BOS).
    tokenizer_kwargs['add_special_tokens'] = False

  inputs = tokenizer(prompt, return_tensors='pt', **tokenizer_kwargs).to(model.device)
  output_ids = model.generate(
      **inputs,
      **generation_kwargs
  )

  # The returned sequence starts with the full input, so the continuation
  # begins at the input width. Counting non-pad tokens instead miscounts
  # whenever the prompt itself contains the pad token id.
  prompt_len = inputs['input_ids'].shape[1]
  tokens = output_ids[0][prompt_len:]
  output = tokenizer.decode(tokens, skip_special_tokens=True)
  output = output.strip().split("\n")[0]

  return output

@torch.no_grad()
def run_model_batch(prompts,
              model,
              tokenizer,
              apply_chat_template=False,
              generation_kwargs={
                  'min_new_tokens': 1,
                  'max_new_tokens': 25,
                  'temperature': 0.1,
                  'top_p': 0.95,
                  'do_sample': True
              },
              tokenizer_kwargs={
                  'padding':'longest',
              }
            ):
  """Generate completions for a batch of prompts.

  Args:
    prompts: Sequence of prompt strings.
    model: Model exposing ``generate`` and ``device``.
    tokenizer: Tokenizer matching ``model``. The output slicing below assumes
      it pads on the left - NOTE(review): confirm ``padding_side == 'left'``.
    apply_chat_template: When true, wrap each prompt as a single user turn
      with the tokenizer's chat template before tokenizing.
    generation_kwargs: Keyword arguments forwarded to ``model.generate``.
    tokenizer_kwargs: Keyword arguments forwarded to the tokenizer call.
      Neither default dict is modified.

  Returns:
    A list with one string per prompt: the newly generated text with special
    tokens removed, stripped, and cut at the first newline. An empty input
    yields an empty list.
  """
  # Reseed on every call so sampling is repeatable for the same batch.
  set_seed(42)

  if len(prompts) == 0:
    return []

  # Work on a copy so neither the caller's dict nor the shared default changes.
  tok_kwargs = dict(tokenizer_kwargs)

  if apply_chat_template:
    prompts = [
        tokenizer.apply_chat_template(
          [{'role': 'user', 'content': prompt}],
          tokenize=False,
          # End each prompt with the start of the assistant turn so the model
          # produces its reply instead of having to open the turn itself.
          add_generation_prompt=True,
        ) for prompt in prompts
    ]
    # The rendered template already carries its special tokens; do not add
    # them a second time unless the caller explicitly asked for it.
    tok_kwargs.setdefault('add_special_tokens', False)

  inputs = tokenizer(prompts, return_tensors='pt', **tok_kwargs).to(model.device)
  output_ids = model.generate(
      **inputs,
      **generation_kwargs
  )

  # Every output row starts with the padded input row, so the continuation of
  # every sample begins at the shared padded width. Slicing at the per-row
  # non-pad count would leave prompt tokens in the output of shorter prompts.
  prompt_len = inputs['input_ids'].shape[1]

  batch_output = []
  for i in range(len(prompts)):
    tokens = output_ids[i][prompt_len:]

    text = tokenizer.decode(tokens, skip_special_tokens=True)
    text = text.strip().split("\n")[0]

    batch_output.append(text)

  return batch_output

In [None]:
# Prompt template; "{input}" is replaced with the text to classify.
PRETRAINED_PROMPT = "Classify the emotion of this text. Text: {input} Emotion:"

# Label set the model output is mapped onto.
EMOTIONS = [
    "neutral", "joy", "surprise",
    "anger", "sadness", "disgust", "fear"
]

def map_to_emotion_label(model_output, emotions=EMOTIONS, default="neutral"):
    """
    Map free-form model output to one of the known emotion labels.

    The output is lower-cased and searched for each label as a substring
    (so "Joyful" matches "joy"). When several labels occur, the one that
    appears earliest in the output wins; the order of `emotions` is only
    used to break ties at the same position.

    Args:
        model_output: Text produced by the model.
        emotions: Candidate labels, expected in lower case.
        default: Label returned when no candidate occurs in the output.

    Returns:
        The matched label, or `default` if nothing matches.
    """
    text = model_output.lower().strip()

    # (position in text, rank in `emotions`, label) for every label that occurs.
    hits = [
        (position, rank, emotion)
        for rank, emotion in enumerate(emotions)
        for position in [text.find(emotion)]
        if position != -1
    ]
    if not hits:
        return default

    # Ranks are unique, so tuple comparison never reaches the label itself.
    return min(hits)[2]

# Evaluate on a random subset (raise the cap to len(df_all) to use every row)
sample_size = min(100, len(df_all))
df_sample = df_all.sample(n=sample_size, random_state=42)

print(f"LLM Emotion Classification on {len(df_sample)} samples...\n")

predictions, actual_labels, hits = [], [], []
correct = 0

sample_pairs = zip(df_sample['Utterance'], df_sample['Emotion'])
for idx, (utterance, actual_emotion) in enumerate(sample_pairs):
    # Fill the prompt template and query the pretrained model
    prompt = PRETRAINED_PROMPT.replace("{input}", utterance)
    model_output = run_model(prompt, model_pretrained, tokenizer_pretrained)
    predicted_emotion = map_to_emotion_label(model_output)

    is_hit = predicted_emotion == actual_emotion
    predictions.append(predicted_emotion)
    actual_labels.append(actual_emotion)
    hits.append(is_hit)
    correct += int(is_hit)

    # Only echo the first 10 examples
    if idx < 10:
        print(f"Text: {utterance[:60]}...")
        print(f"  Predicted: {predicted_emotion}, Actual: {actual_emotion}\n")

# Overall accuracy as a percentage of the sampled rows
accuracy = (correct / len(df_sample)) * 100
print(f"Accuracy: {correct}/{len(df_sample)} = {accuracy:.2f}%")

# Sampled rows with the prediction and a hit/miss flag attached
results_df = df_sample.copy()
results_df['predicted_emotion'] = predictions
results_df['correct'] = hits

print(f"\nCorrect predictions by emotion:")
per_emotion = results_df.groupby('Emotion')['correct'].agg(['sum', 'count', 'mean'])
print(per_emotion)

print(f"\nPredicted vs Actual Emotion Distribution:")
pair_counts = pd.DataFrame({
    'Predicted': predictions,
    'Actual': actual_labels
}).value_counts()
print(pair_counts.head(10))


Testing emotion classification on 100 samples...



KeyboardInterrupt: 