### Install Dependencies

In [None]:
!pip3 install transformers
!pip3 install datasets
!pip3 install accelerate -U
!pip3 install transformers[torch]

### Import libraries

In [None]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
from transformers import BertTokenizer
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, BertTokenizerFast, BertForSequenceClassification
from transformers.trainer_callback import EarlyStoppingCallback
import torch
import accelerate
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import f1_score


### Load data

In [None]:
# Read the EmoBank CSV. All rows are taken from dataset['train'] and partitioned
# below using the CSV's own 'split' column.
dataset = load_dataset('csv', data_files='data/EMOBANK/emobank.csv')
emobank_rows = dataset['train']


def _in_split(split_name):
    """Return a row predicate that is true when the row's 'split' equals `split_name`."""
    def _matches(example):
        return example['split'] == split_name
    return _matches


train_dataset = emobank_rows.filter(_in_split('train'))
test_dataset = emobank_rows.filter(_in_split('test'))
dev_dataset = emobank_rows.filter(_in_split('dev'))


def normalize_values(dataset):
    """Rescale the 'V', 'A' and 'D' fields of one example with ``(x - 1) / 4``.

    Despite the parameter name, this function is applied through ``Dataset.map``
    without batching, so it receives a single example (a mapping from column name
    to value). With this formula a score of 1 becomes 0.0 and a score of 5
    becomes 1.0.

    The mapping is modified in place and the same object is returned.
    """
    for dimension in ('V', 'A', 'D'):
        raw_score = dataset[dimension]
        dataset[dimension] = (raw_score - 1) / 4
    return dataset

# Apply the V/A/D rescaling to every split, one example at a time.
train_dataset, test_dataset, dev_dataset = (
    split.map(normalize_values)
    for split in (train_dataset, test_dataset, dev_dataset)
)


### Preprocess Data and Tokenize input

In [None]:


def _has_text(example):
    """True when the example's 'text' field is not None."""
    return example['text'] is not None


# Drop dev rows without text before they reach the tokenizer.
dev_dataset = dev_dataset.filter(_has_text)

# Cased BERT vocabulary, matching the "bert-base-cased" model loaded later on.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``; only
            ``examples['text']`` is read.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.

    The tokenizer is asked for plain Python lists rather than PyTorch tensors
    (no ``return_tensors``): ``Dataset.map`` stores the returned columns itself,
    and the datasets are switched to torch output later with ``set_format``.
    This also removes the need for the tensor-to-NumPy workaround that used to
    sit here as commented-out code.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Tokenize every split in batches; the tokenizer outputs are added as new columns.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# The dev split was already filtered for missing text, and the tokenizer already
# loaded, at the top of this cell, so neither step is repeated here.

### Format Dataset for 3-label training

In [None]:
def format_dataset(example):
    """Attach a 'labels' field holding the example's V, A and D values, in that order.

    The example mapping is modified in place and returned; the original 'V', 'A'
    and 'D' fields are left untouched.
    """
    example['labels'] = [example[dimension] for dimension in ('V', 'A', 'D')]
    return example

# Build the 'labels' column on every split.
train_dataset, test_dataset, dev_dataset = (
    split.map(format_dataset)
    for split in (train_dataset, test_dataset, dev_dataset)
)

# Expose only the model inputs and the targets, as torch tensors. set_format is
# called for its effect on the dataset object, so nothing is rebound here.
MODEL_COLUMNS = ('input_ids', 'attention_mask', 'labels')
for _split in (train_dataset, test_dataset, dev_dataset):
    _split.set_format(type='torch', columns=list(MODEL_COLUMNS))


### Initialize Model

In [None]:
from transformers import BertConfig, BertForSequenceClassification

base_checkpoint = "bert-base-cased"

# Three outputs, one per emotion dimension (V, A, D).
# NOTE(review): no problem_type is set here, so transformers picks the loss from
# the label count and dtype at training time — confirm that is the objective
# intended for these continuous targets.
config = BertConfig.from_pretrained(base_checkpoint, num_labels=3)
model = BertForSequenceClassification.from_pretrained(base_checkpoint, config=config)

### Setup training and metrics

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    # Optimisation schedule.
    num_train_epochs=12,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # After training, reload the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# The dev split is used for the per-epoch evaluation; the test split is kept
# for the final evaluation cell.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
)

### Train

In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
trainer.train()

### Evaluate

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Run the fine-tuned model over the test split.
predictions = trainer.predict(test_dataset)
predicted_values = np.asarray(predictions.predictions)

# Take the labels that predict() returns together with the model outputs: they
# are row-aligned with `predicted_values` and do not depend on how the dataset's
# output format makes `test_dataset['labels']` behave.
ground_truth = np.asarray(predictions.label_ids)
assert predicted_values.shape == ground_truth.shape, (
    f"prediction/label shape mismatch: {predicted_values.shape} vs {ground_truth.shape}"
)

# Column order follows format_dataset: 0 = V, 1 = A, 2 = D.
pearson_v, pearson_a, pearson_d = (
    pearsonr(predicted_values[:, column], ground_truth[:, column])[0]
    for column in range(3)
)

print(f"Pearson r values: Valence: {pearson_v}, Arousal: {pearson_a}, Dominance: {pearson_d}")


Pearson r values: Valence: 0.7534930780797127, Arousal: 0.5561622928181004, Dominance: 0.4736017701452015


## Readjust and run model for Metaphor

In [None]:
train_path = 'data/VUA/VUA_formatted_train.csv'
eval_path = 'data/VUA/VUA_formatted_val.csv'
test_path = 'data/VUA/VUA_formatted_test.csv'


def load_vua_splits(train_csv, eval_csv, test_csv, encoding):
    """Read the three VUA CSV files into pandas DataFrames.

    This helper is deliberately not called ``load_dataset``: that name is imported
    from ``datasets`` at the top of the notebook and is used by the EmoBank loading
    cell, which would stop working on a re-run if the name were rebound here.

    Args:
        train_csv: Path to the training CSV.
        eval_csv: Path to the validation CSV.
        test_csv: Path to the test CSV.
        encoding: Text encoding passed to ``pandas.read_csv`` for all three files.

    Returns:
        A tuple ``(train_df, eval_df, test_df)``.
    """
    return tuple(
        pd.read_csv(csv_path, encoding=encoding)
        for csv_path in (train_csv, eval_csv, test_csv)
    )


train_df, eval_df, test_df = load_vua_splits(train_path, eval_path, test_path, encoding='ISO-8859-1')

In [None]:
# Fixed length, in tokens, of every encoded sentence (read by preprocess_data below).
MAX_LEN = 128

# Rebinds `tokenizer` to the "bert-base-cased" tokenizer loaded through AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def preprocess_data(df):
    """Encode a DataFrame of sentences into fixed-length BERT inputs.

    Args:
        df: DataFrame with a 'sentence' column (the text to encode) and a
            'label' column (passed through unchanged).

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of three lists with one
        entry per row of ``df``, in row order. Every ``input_ids`` and
        ``attention_masks`` entry has exactly ``MAX_LEN`` items.

    The tokenizer is called on the whole column at once so that it adds the
    [CLS] and [SEP] special tokens, truncates, and pads with its own padding id.
    The earlier hand-rolled version reserved two positions for the special tokens
    but never inserted them, and padded with a hard-coded 0.
    """
    sentences = df['sentence'].tolist()
    verb_labels = df['label'].tolist()

    # Nothing to encode: keep the "three empty lists" result for an empty frame
    # instead of handing the tokenizer an empty batch.
    if not sentences:
        return [], [], []

    encoded = tokenizer(
        sentences,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )
    return list(encoded['input_ids']), list(encoded['attention_mask']), verb_labels

# Encode the three VUA frames in one pass; each call yields (ids, masks, labels).
(
    (train_encodings, train_masks, train_labels),
    (eval_encodings, eval_masks, eval_labels),
    (test_encodings, test_masks, test_labels),
) = map(preprocess_data, (train_df, eval_df, test_df))


In [None]:
def create_hf_dataset(encodings, masks, labels):
    """Bundle encoded inputs into a ``Dataset`` with the column names the model expects.

    Args:
        encodings: Stored under the 'input_ids' column.
        masks: Stored under the 'attention_mask' column.
        labels: Stored under the 'labels' column.

    Returns:
        The object built by ``Dataset.from_dict`` from those three columns.
    """
    column_names = ('input_ids', 'attention_mask', 'labels')
    columns = dict(zip(column_names, (encodings, masks, labels)))
    return Dataset.from_dict(columns)

# Wrap each split's (ids, masks, labels) triple in a Dataset. This rebinds
# train_dataset and test_dataset, which previously held the EmoBank splits.
train_dataset, eval_dataset, test_dataset = (
    create_hf_dataset(*split_columns)
    for split_columns in (
        (train_encodings, train_masks, train_labels),
        (eval_encodings, eval_masks, eval_labels),
        (test_encodings, test_masks, test_labels),
    )
)

In [None]:
# Start from weights saved under ./results, the output_dir of the first training run.
# NOTE(review): the checkpoint step number is hard-coded — confirm it is the
# checkpoint you intend to transfer from after re-running the first stage.
model = BertForSequenceClassification.from_pretrained('./results/checkpoint-6048', problem_type="single_label_classification")

# Swap the classification head for a freshly initialised 2-way head, sized from
# the model's own hidden size instead of a hard-coded 768.
METAPHOR_NUM_LABELS = 2
model.classifier = torch.nn.Linear(in_features=model.config.hidden_size, out_features=METAPHOR_NUM_LABELS)
model.num_labels = METAPHOR_NUM_LABELS
# Keep the config in step with the new head. Without this the config still
# carries the label count of the loaded checkpoint, so checkpoints saved from
# this model would describe a head of a different size than the one stored.
model.config.num_labels = METAPHOR_NUM_LABELS


from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Score predictions with F1 and Pearson correlation.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with keys ``"f1"`` and ``"pearson_r"``, both computed between the
        labels and the predicted classes.
    """
    scores, gold = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    metrics = {"f1": f1_score(gold, predicted_classes)}
    metrics["pearson_r"] = pearsonr(gold, predicted_classes)[0]
    return metrics

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    learning_rate=2e-5,
    logging_dir='./logs_metaphor',
    # Log, evaluate and checkpoint at the end of each epoch. The recorded run of
    # this notebook finished at global_step=1940, so the previous fixed 2000-step
    # interval was never reached during training.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # The dataset holds only input_ids / attention_mask / labels, so no columns
    # need to be stripped before batches are passed to the model.
    remove_unused_columns=False,
    output_dir="./drive/MyDrive/results/base_metaphor",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune on the metaphor data with the Trainer configured in the previous cell.
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=1940, training_loss=0.5510524474468428, metrics={'train_runtime': 402.2288, 'train_samples_per_second': 38.575, 'train_steps_per_second': 4.823, 'total_flos': 1020607783741440.0, 'train_loss': 0.5510524474468428, 'epoch': 1.0})

In [None]:
# Metrics on the Trainer's own eval_dataset.
results = trainer.evaluate()

# Same metrics on the test split.
results_on_test_data = trainer.evaluate(eval_dataset=test_dataset)
print(results_on_test_data)

{'eval_loss': 0.544163703918457, 'eval_f1': 0.35887096774193544, 'eval_pearson_r': 0.2600936847597024, 'eval_runtime': 47.4869, 'eval_samples_per_second': 123.676, 'eval_steps_per_second': 15.478, 'epoch': 1.0}
