### Install Dependencies

In [3]:
!pip3 install pandas
!pip3 install numpy
!pip3 install torch
!pip3 install transformers
!pip3 install accelerate -U
!pip3 install ray[tune]
!pip3 install hyperopt
!pip3 install sklearn
!pip3 install datasets

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.7 MB/s[0m eta [36m0:00:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import libraries

In [4]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, BertTokenizerFast, BertForSequenceClassification
from transformers.trainer_callback import EarlyStoppingCallback
import pandas as pd
import torch
import accelerate
import numpy as np
from datasets import Dataset

### Load data

In [23]:
train_path = 'drive/MyDrive/data/VUA/VUA_formatted_train.csv'
eval_path = 'drive/MyDrive/data/VUA/VUA_formatted_val.csv'
test_path = 'drive/MyDrive/data/VUA/VUA_formatted_test.csv'

def load_dataset(train, eval, test, encoding):
    train_df = pd.read_csv(train, encoding=encoding)
    eval_df = pd.read_csv(eval, encoding=encoding)
    test_df = pd.read_csv(test, encoding=encoding)

    return train_df, eval_df, test_df

train_df, eval_df, test_df = load_dataset(train_path, eval_path, test_path, encoding='ISO-8859-1')

### Preprocess Data and Tokenize input

In [24]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def preprocess_data(df):
    tokenized_sentences = []
    attention_masks = []
    verb_labels = []

    for _, row in df.iterrows():
        sentence = row['sentence']
        verb_idx = row['verb_idx']
        label = row['label']

        # Tokenize the sentence and get the respective wordpiece token positions
        tokens = tokenizer.tokenize(sentence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        mask = [1] * len(input_ids)

        # Verb may split into multiple wordpiece tokens
        start_token_idx = len(tokenizer.tokenize(sentence[:verb_idx]))
        end_token_idx = start_token_idx + len(tokenizer.tokenize(row['verb'])) - 1

        verb_label = [0] * len(tokens)
        for idx in range(start_token_idx, end_token_idx + 1):
            verb_label[idx] = label

        tokenized_sentences.append(input_ids)
        attention_masks.append(mask)
        verb_labels.append(verb_label)

    return tokenized_sentences, attention_masks, verb_labels

train_encodings, train_masks, train_labels = preprocess_data(train_df)
eval_encodings, eval_masks, eval_labels = preprocess_data(eval_df)
test_encodings, test_masks, test_labels = preprocess_data(test_df)

In [25]:
MAX_LEN = 128

def preprocess_data(df):
    tokenized_sentences = []
    attention_masks = []
    verb_labels = []

    for _, row in df.iterrows():
        sentence = row['sentence']
        label = row['label']

        # Tokenize the sentence and get the respective wordpiece token positions
        tokens = tokenizer.tokenize(sentence)[:MAX_LEN - 2]  # Account for BERT model [CLS] and [SEP]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        mask = [1] * len(input_ids)

        # Padding
        while len(input_ids) < MAX_LEN:
            input_ids.append(0)
            mask.append(0)

        tokenized_sentences.append(input_ids)
        attention_masks.append(mask)
        verb_labels.append(label)  # single label

    return tokenized_sentences, attention_masks, verb_labels

train_encodings, train_masks, train_labels = preprocess_data(train_df)
eval_encodings, eval_masks, eval_labels = preprocess_data(eval_df)
test_encodings, test_masks, test_labels = preprocess_data(test_df)


### Load data for training

In [26]:
def create_hf_dataset(encodings, masks, labels):
    return Dataset.from_dict({
        'input_ids': encodings,
        'attention_mask': masks,
        'labels': labels
    })

train_dataset = create_hf_dataset(train_encodings, train_masks, train_labels)
eval_dataset = create_hf_dataset(eval_encodings, eval_masks, eval_labels)
test_dataset = create_hf_dataset(test_encodings, test_masks, test_labels)

### Training

In [27]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    f1 = f1_score(labels, predictions)
    r, _ = pearsonr(labels, predictions)

    return {"f1": f1, "pearson_r": r}

In [28]:

from sklearn.metrics import f1_score
from scipy.stats import pearsonr

from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)


# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # The actual number of epochs will be set by max_steps
    max_steps=8000,
    logging_dir='./logs',
    logging_steps=2000,  # Log every 500 steps
    save_steps=2000,    # Save the model every 1000 steps
    evaluation_strategy="steps",  # Evaluate the model every logging_steps
    save_total_limit=2,  # Only last 2 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    remove_unused_columns=False,  # Important to set this to keep the 'labels' column
    output_dir="./metaphor_detection_model",
)

from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
     compute_metrics=compute_metrics,  # Pass the metrics function here
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,F1,Pearson R
2000,0.5542,0.504015,0.541799,0.368997
4000,0.4907,0.548893,0.43956,0.323031
6000,0.4379,0.558555,0.515152,0.337666


In [15]:
# Evaluate on Test Data
results = trainer.evaluate()

# After training
results_on_test_data = trainer.evaluate(test_dataset)
print(results_on_test_data)

{'eval_loss': 0.6245772838592529, 'eval_f1': 0.00564652738565782, 'eval_pearson_r': 0.01804082755936968, 'eval_runtime': 44.3416, 'eval_samples_per_second': 132.449, 'eval_steps_per_second': 16.576}


In [17]:
from datasets import load_dataset
import numpy as np
from scipy.stats import pearsonr
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

# Assuming you have the csv data stored in 'emobank.csv'
dataset = load_dataset('csv', data_files='drive/MyDrive/data/EMOBANK/emobank.csv')

train_dataset = dataset['train'].filter(lambda example: example['split'] == 'train')
test_dataset = dataset['train'].filter(lambda example: example['split'] == 'test')
dev_dataset = dataset['train'].filter(lambda example: example['split'] == 'dev')


def normalize_values(dataset):
    for key in ['V', 'A', 'D']:
        dataset[key] = (dataset[key] - 1) / 4
    return dataset

train_dataset = train_dataset.map(normalize_values)
test_dataset = test_dataset.map(normalize_values)
dev_dataset = dev_dataset.map(normalize_values)




Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/10062 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10062 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10062 [00:00<?, ? examples/s]

Map:   0%|          | 0/8062 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
# Preprocess and tokenize your dataset as you've done previously

from transformers import BertTokenizer
# Display the first few samples of the 'dev' split
dev_dataset = dev_dataset.filter(lambda example: example['text'] is not None)

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
#    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
# For Python 3.9
     tokenized = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
     return {key: value.numpy() for key, value in tokenized.items()}

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

from transformers import BertTokenizer
# Display the first few samples of the 'dev' split
dev_dataset = dev_dataset.filter(lambda example: example['text'] is not None)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8062 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [19]:
def format_dataset(example):
    example['labels'] = [example['V'], example['A'], example['D']]
    return example

train_dataset = train_dataset.map(format_dataset)
test_dataset = test_dataset.map(format_dataset)
dev_dataset = dev_dataset.map(format_dataset)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
dev_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/8062 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
from transformers import BertConfig, BertModel, Trainer, TrainingArguments
import torch.nn as nn
from scipy.stats import pearsonr



class VADRegressor(nn.Module):
    def __init__(self, pretrained_model_name, config):
        super(VADRegressor, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name, config=config)
        self.regressor = nn.Linear(config.hidden_size, 3)  # Three outputs for V, A, D

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        logits = self.regressor(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))
        return (loss, logits) if loss is not None else logits

config = BertConfig.from_pretrained("kangela/Metaphor-FineTuned-BERT")
model = VADRegressor("kangela/Metaphor-FineTuned-BERT", config=config)

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=12,
    learning_rate=5e-5,
    output_dir='./metaphor_on_emotion_results2',
    logging_dir='./logs',
    logging_steps=1000,  # Adjust if needed
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

Some weights of BertModel were not initialized from the model checkpoint at kangela/Metaphor-FineTuned-BERT and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

In [22]:
predictions = trainer.predict(test_dataset).predictions
ground_truth = test_dataset['labels']

pearson_v = pearsonr(predictions[:, 0], ground_truth[:, 0])[0]
pearson_a = pearsonr(predictions[:, 1], ground_truth[:, 1])[0]
pearson_d = pearsonr(predictions[:, 2], ground_truth[:, 2])[0]

print(f"Pearson r values: Valence: {pearson_v}, Arousal: {pearson_a}, Dominance: {pearson_d}")


Pearson r values: Valence: -0.05595852549777171, Arousal: 0.009599565013209375, Dominance: -0.002705628206251998
