### Install Dependencies

In [39]:
!pip3 install transformers
!pip3 install datasets


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Import libraries

In [6]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

### Load data

In [46]:
from datasets import load_dataset

# Read the EmoBank CSV from 'data/EMOBANK/emobank.csv'. Every row is reached through
# the 'train' key; the file's own 'split' column marks the partition of each row.
dataset = load_dataset('csv', data_files='data/EMOBANK/emobank.csv')
all_rows = dataset['train']

# Carve the three partitions out of the single table by the value of 'split'.
train_dataset = all_rows.filter(lambda row: row['split'] == 'train')
test_dataset = all_rows.filter(lambda row: row['split'] == 'test')
dev_dataset = all_rows.filter(lambda row: row['split'] == 'dev')


def normalize_values(dataset):
    """Rescale the 'V', 'A' and 'D' entries of one record with (x - 1) / 4.

    The record is modified in place and the same object is returned, so a
    value of 1 becomes 0.0 and a value of 5 becomes 1.0.
    NOTE(review): presumably the raw scores lie on a 1-5 scale -- confirm
    against the dataset documentation.
    """
    for dimension in ('V', 'A', 'D'):
        raw_score = dataset[dimension]
        dataset[dimension] = (raw_score - 1) / 4
    return dataset

# Apply the V/A/D rescaling to every record of each partition (train, test, dev in
# that order), rebinding the three names to the mapped datasets.
train_dataset, test_dataset, dev_dataset = (
    partition.map(normalize_values)
    for partition in (train_dataset, test_dataset, dev_dataset)
)


Filter: 100%|██████████| 10062/10062 [00:00<00:00, 208387.67 examples/s]
Filter: 100%|██████████| 10062/10062 [00:00<00:00, 201248.83 examples/s]
Filter: 100%|██████████| 10062/10062 [00:00<00:00, 193568.17 examples/s]
Map: 100%|██████████| 8062/8062 [00:00<00:00, 13762.61 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 13095.54 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 13255.75 examples/s]


### Preprocess Data and Tokenize input

In [47]:
from transformers import BertTokenizer
# Drop rows whose text is missing, because the tokenizer cannot encode None.
# The run recorded below only lost a row from the dev split, but all three splits
# are cleaned the same way so a gap in train/test cannot crash tokenization.
train_dataset = train_dataset.filter(lambda example: example['text'] is not None)
test_dataset = test_dataset.filter(lambda example: example['text'] is not None)
dev_dataset = dev_dataset.filter(lambda example: example['text'] is not None)

# Cased BERT vocabulary, matching the "bert-base-cased" checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize a batch of records for BERT.

    Args:
        examples: a batch whose 'text' entry holds the strings to encode
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.
    """
    # No return_tensors here: Dataset.map stores the result as Arrow columns anyway,
    # and set_format(type='torch') converts them to tensors when rows are read.
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Tokenize each partition in batches (train, test, dev in that order) and rebind the
# three names to the tokenized datasets.
train_dataset, test_dataset, dev_dataset = (
    partition.map(tokenize_function, batched=True)
    for partition in (train_dataset, test_dataset, dev_dataset)
)




Filter: 100%|██████████| 1000/1000 [00:00<00:00, 126919.36 examples/s]
Map: 100%|██████████| 8062/8062 [00:01<00:00, 4504.02 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4280.00 examples/s]
Map: 100%|██████████| 999/999 [00:00<00:00, 4267.56 examples/s]


### Format Dataset for 3-label training

In [52]:
def format_dataset(example):
    """Attach a 'labels' entry holding the record's [V, A, D] values, in that order.

    The record is modified in place and the same object is returned; the
    original 'V', 'A' and 'D' entries are left untouched.
    """
    example['labels'] = [example[dimension] for dimension in ('V', 'A', 'D')]
    return example

# Add the combined 'labels' column to each partition (train, test, dev in that order).
train_dataset, test_dataset, dev_dataset = (
    partition.map(format_dataset)
    for partition in (train_dataset, test_dataset, dev_dataset)
)

# Expose only the model inputs and the labels, returned as torch tensors.
for formatted_split in (train_dataset, test_dataset, dev_dataset):
    formatted_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 8062/8062 [00:03<00:00, 2631.48 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2714.01 examples/s]
Map: 100%|██████████| 999/999 [00:00<00:00, 2858.81 examples/s]

{'input_ids': tensor([101, 119, 119, 107, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      




### Initialize Model

In [53]:
from transformers import BertConfig, BertForSequenceClassification

# Three outputs, one per emotion dimension (V, A, D). problem_type is set explicitly
# because the labels are floats: with num_labels > 1 and no problem_type, the model
# treats float labels as multi-label classification (BCEWithLogitsLoss) instead of
# regression (MSELoss).
config = BertConfig.from_pretrained("bert-base-cased", num_labels=3, problem_type="regression")
model = BertForSequenceClassification.from_pretrained("bert-base-cased", config=config)

Downloading model.safetensors: 100%|██████████| 436M/436M [00:36<00:00, 11.9MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Setup training and metrics

In [57]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,  # log the training loss every 500 optimizer steps
    # Optimization schedule.
    num_train_epochs=12,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Run both phases; evaluate and checkpoint at the end of every epoch.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # After training, reload the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Train on the train split and evaluate on the dev split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
)

### Train

In [58]:
# Start fine-tuning with the Trainer configured above (12 epochs, evaluated and
# checkpointed once per epoch). The call's return value is shown as the cell output.
trainer.train()

  1%|          | 58/6048 [03:51<6:50:18,  4.11s/it]

KeyboardInterrupt: 

### Evaluate

In [21]:
import numpy as np
from scipy.stats import pearsonr

# Run inference on the test split. The returned object carries the model outputs
# and the matching labels from the same pass, so their rows are aligned.
predictions = trainer.predict(test_dataset)
predicted_values = np.asarray(predictions.predictions)

# Ground truth for V, A, D (columns 0, 1, 2). Taken from the prediction output rather
# than from test_dataset['labels'] so both sides of the correlation are NumPy arrays
# regardless of how the dataset is formatted.
ground_truth = np.asarray(predictions.label_ids)

# Pearson r per dimension; pearsonr returns (statistic, p-value), keep the statistic.
pearson_v, pearson_a, pearson_d = (
    pearsonr(predicted_values[:, column], ground_truth[:, column])[0]
    for column in range(3)
)

print(f"Pearson r values: Valence: {pearson_v}, Arousal: {pearson_a}, Dominance: {pearson_d}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at kangela/Metaphor-FineTuned-BERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

  0%|          | 0/7760 [04:39<?, ?it/s]            

{'loss': 0.1233, 'learning_rate': 4.993556701030928e-05, 'epoch': 0.01}



  0%|          | 0/7760 [06:33<?, ?it/s]            

{'loss': 0.0112, 'learning_rate': 4.987113402061856e-05, 'epoch': 0.01}



  0%|          | 0/7760 [08:33<?, ?it/s]            

{'loss': 0.0073, 'learning_rate': 4.980670103092784e-05, 'epoch': 0.02}




KeyboardInterrupt: 