### Install Dependencies

In [1]:
!pip3 install pandas
!pip3 install numpy
!pip3 install torch
!pip3 install transformers
!pip3 install accelerate -U
!pip3 install ray[tune]
!pip3 install hyperopt
!pip3 install sklearn
!pip3 install tqdm
!pip3 install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Import libraries

In [2]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForTokenClassification, BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback
import pandas as pd
import torch
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [3]:
# CSV files holding the train / validation / test portions of the VUA data.
train_path, eval_path, test_path = (
    'data/VUA/VUA_formatted_train.csv',
    'data/VUA/VUA_formatted_val.csv',
    'data/VUA/VUA_formatted_test.csv',
)

def load_dataset(train, eval, test, encoding):
    """Read the three split CSV files into pandas DataFrames.

    Args:
        train: Location of the training CSV (anything ``pd.read_csv`` accepts).
        eval: Location of the evaluation CSV. The parameter name shadows the
            built-in ``eval`` inside this function only.
        test: Location of the test CSV.
        encoding: Text encoding passed to ``pd.read_csv`` for every file.

    Returns:
        A ``(train_df, eval_df, test_df)`` tuple of DataFrames.
    """
    split_sources = (train, eval, test)
    train_df, eval_df, test_df = (
        pd.read_csv(source, encoding=encoding) for source in split_sources
    )
    return train_df, eval_df, test_df

# Every split file is decoded as ISO-8859-1 (Latin-1).
train_df, eval_df, test_df = load_dataset(
    train=train_path,
    eval=eval_path,
    test=test_path,
    encoding='ISO-8859-1',
)

### Preprocess Data and Tokenize input

In [4]:
# Load the tokenizer published with the "kangela/Metaphor-FineTuned-BERT" checkpoint.
# preprocess_data below reads it through this module-level name.
tokenizer = AutoTokenizer.from_pretrained("kangela/Metaphor-FineTuned-BERT")

def preprocess_data(df):
    """Tokenize each sentence and build per-token labels for its target verb.

    Uses the module-level ``tokenizer``. No special tokens are added and
    nothing is padded, so the inner lists differ in length from row to row.

    NOTE(review): a later cell defines another ``preprocess_data`` that
    replaces this one as soon as that cell runs.

    Args:
        df: DataFrame with the columns ``sentence``, ``verb_idx``, ``verb``
            and ``label``. ``verb_idx`` is used as a character offset into
            ``sentence`` -- TODO confirm it is not a word index.

    Returns:
        ``(tokenized_sentences, attention_masks, verb_labels)``: per row, the
        wordpiece ids, an all-ones mask of the same length, and a label list
        in which the tokens attributed to the verb carry the row's ``label``
        and every other token carries 0.
    """
    tokenized_sentences = []
    attention_masks = []
    verb_labels = []

    # Column-wise iteration avoids building a Series per row as iterrows() does.
    rows = zip(df['sentence'], df['verb_idx'], df['verb'], df['label'])
    for sentence, verb_idx, verb, label in rows:
        tokens = tokenizer.tokenize(sentence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The verb is located by counting the wordpieces of the text before it;
        # the verb itself may be split into several wordpieces.
        start_token_idx = len(tokenizer.tokenize(sentence[:verb_idx]))
        stop_token_idx = start_token_idx + len(tokenizer.tokenize(verb))

        # The prefix and the verb are tokenized separately from the full
        # sentence, so the computed span can run past the end of `tokens`.
        # Clamp it instead of raising IndexError.
        stop_token_idx = min(stop_token_idx, len(tokens))

        verb_label = [0] * len(tokens)
        for idx in range(start_token_idx, stop_token_idx):
            verb_label[idx] = label

        tokenized_sentences.append(input_ids)
        attention_masks.append([1] * len(input_ids))
        verb_labels.append(verb_label)

    return tokenized_sentences, attention_masks, verb_labels

# Run preprocess_data over the three splits, in train / eval / test order.
(
    (train_encodings, train_masks, train_labels),
    (eval_encodings, eval_masks, eval_labels),
    (test_encodings, test_masks, test_labels),
) = [preprocess_data(split_df) for split_df in (train_df, eval_df, test_df)]

[[2640, 1024, 4625, 3210, 2058, 1996, 9121, 4665, 12517, 1024, 2103, 3665, 2071, 2574, 2022, 2067, 2006, 1996, 2157, 2650, 1010, 2758, 5655, 6054, 2100], [2640, 1024, 4625, 3210, 2058, 1996, 9121, 4665, 12517, 1024, 2103, 3665, 2071, 2574, 2022, 2067, 2006, 1996, 2157, 2650, 1010, 2758, 5655, 6054, 2100], [2715, 19216, 1010, 2004, 2087, 6803, 13481, 2113, 1010, 4445, 6073, 4496, 23114, 1010, 4496, 2079, 2027, 4897, 1012], [2715, 19216, 1010, 2004, 2087, 6803, 13481, 2113, 1010, 4445, 6073, 4496, 23114, 1010, 4496, 2079, 2027, 4897, 1012], [2715, 19216, 1010, 2004, 2087, 6803, 13481, 2113, 1010, 4445, 6073, 4496, 23114, 1010, 4496, 2079, 2027, 4897, 1012], [2715, 19216, 1010, 2004, 2087, 6803, 13481, 2113, 1010, 4445, 6073, 4496, 23114, 1010, 4496, 2079, 2027, 4897, 1012], [21185, 1010, 5024, 2135, 2328, 1010, 7132, 2006, 1996, 4044, 1010, 2027, 2024, 2411, 2019, 7812, 2433, 1997, 2103, 3665, 1012], [2027, 2036, 4654, 8743, 1037, 18987, 2200, 2172, 1997, 2037, 2219, 2004, 2027, 18036, 5

In [12]:
# Fixed length every encoded sentence is truncated or padded to, counting the
# [CLS] and [SEP] special tokens.
MAX_LEN = 512


def preprocess_data(df):
    """Encode sentences for sequence classification, one label per sentence.

    Uses the module-level ``tokenizer``. The tokenizer call itself adds the
    [CLS]/[SEP] special tokens, truncates to ``MAX_LEN`` and pads with the
    tokenizer's own pad id, so none of that is done by hand here.

    Args:
        df: DataFrame with a ``sentence`` column (text) and a ``label``
            column (one class value per row).

    Returns:
        ``(tokenized_sentences, attention_masks, verb_labels)``: two lists of
        ``MAX_LEN``-long integer lists (input ids and attention mask) and a
        flat list holding one label per row.
    """
    sentences = df['sentence'].tolist()
    verb_labels = df['label'].tolist()

    # Skip the tokenizer entirely for an empty frame.
    if not sentences:
        return [], [], []

    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
    )

    return encoded['input_ids'], encoded['attention_mask'], verb_labels

# Encode the three splits again; this rebinds the names the earlier
# preprocessing cell assigned.
(
    (train_encodings, train_masks, train_labels),
    (eval_encodings, eval_masks, eval_labels),
    (test_encodings, test_masks, test_labels),
) = [preprocess_data(split_df) for split_df in (train_df, eval_df, test_df)]


### Load data for training

In [10]:
from datasets import Dataset

def create_hf_dataset(encodings, masks, labels):
    """Bundle encoded inputs and labels into a ``datasets.Dataset``.

    Args:
        encodings: Stored under the ``input_ids`` column.
        masks: Stored under the ``attention_mask`` column.
        labels: Stored under the ``labels`` column.

    Returns:
        The object produced by ``Dataset.from_dict`` for those three columns.
    """
    columns = dict(
        input_ids=encodings,
        attention_mask=masks,
        labels=labels,
    )
    return Dataset.from_dict(columns)

# Wrap each split's (ids, masks, labels) triple with create_hf_dataset.
train_dataset, eval_dataset, test_dataset = (
    create_hf_dataset(*split_columns)
    for split_columns in (
        (train_encodings, train_masks, train_labels),
        (eval_encodings, eval_masks, eval_labels),
        (test_encodings, test_masks, test_labels),
    )
)

### Training

In [11]:

# Fine-tune a 2-label sequence classifier starting from the metaphor checkpoint,
# then evaluate it with the Trainer's eval_dataset.
#
# The classification head takes ONE label per example. The run recorded under
# this cell stopped with "Expected input batch_size (8) to match target
# batch_size (4096)"; 4096 = 8 * 512, so the `labels` column apparently held
# 512 values per example at that time -- re-create the datasets with a single
# label per sentence before running this cell.
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('kangela/Metaphor-FineTuned-BERT', num_labels=2)


# Alternative starting point (plain BERT), kept for reference:
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Has no effect while max_steps is set: max_steps decides the training length
    max_steps=8000,
    logging_dir='./logs',
    logging_steps=500,  # Log every 500 steps
    save_steps=1000,    # Save the model every 1000 steps
    evaluation_strategy="steps",  # No eval_steps given, so evaluation follows logging_steps
    save_total_limit=2,  # Only last 2 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    # Keeps every dataset column in the batches. NOTE(review): the earlier
    # comment said this is needed to keep 'labels'; presumably Trainer keeps
    # 'labels' regardless -- confirm, then consider dropping this argument.
    remove_unused_columns=False,
    output_dir="./metaphor_detection_model",
)

# NOTE(review): DataCollatorWithPadding is imported but not used in this cell.
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print(results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kangela/Metaphor-FineTuned-BERT and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/8000 [05:46<?, ?it/s]
  0%|          | 0/8000 [00:00<?, ?it/s]

ValueError: Expected input batch_size (8) to match target batch_size (4096).

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification

# 1. Load the Data
# `load_dataset` here is the `datasets` library function imported in this
# cell, not the pandas helper of the same name defined in the "Load data" cell.
# All three entries carry the .csv extension, matching the paths used by the
# pandas loading cell.
data_files = {
    'train': 'data/VUA/VUA_formatted_train.csv',
    'validation': 'data/VUA/VUA_formatted_val.csv',
    'test': 'data/VUA/VUA_formatted_test.csv',
}
dataset = load_dataset('csv', data_files=data_files, encoding='ISO-8859-1')

# 2. Tokenization & Preprocessing
tokenizer = AutoTokenizer.from_pretrained("kangela/Metaphor-FineTuned-BERT")

def encode(example_batch):
    """Tokenize a batch of sentences and attach per-token labels.

    Uses the module-level ``tokenizer``. Every sentence is truncated or padded
    to 512 tokens. For each token:

    * special and padding tokens (empty character span) get ``-100``, the
      value the cross-entropy loss ignores, so padding does not dominate
      training;
    * the token whose character span contains ``verb_idx`` gets the row's
      ``label`` value;
    * every other token gets ``0``.

    NOTE(review): ``verb_idx`` is compared against character offsets -- TODO
    confirm the column is a character offset and not a word index. Only the
    wordpiece containing that offset is labelled; later wordpieces of the same
    verb are left at 0.

    Args:
        example_batch: Mapping of column name to list of values with the keys
            ``sentence``, ``verb_idx`` and ``label``.

    Returns:
        The tokenizer output (including ``offset_mapping``) with an added
        ``labels`` entry holding one 512-long label list per sentence.
    """
    max_length = 512
    ignore_index = -100

    sentences = example_batch['sentence']
    verb_indices = example_batch['verb_idx']
    labels = example_batch['label']

    encodings = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_offsets_mapping=True,
    )

    label_encodings = []
    for offset_mapping, verb_idx, verb_label in zip(encodings['offset_mapping'], verb_indices, labels):
        token_labels = []
        for start, end in offset_mapping:
            if start == end:
                # Empty span: [CLS], [SEP] or padding.
                token_labels.append(ignore_index)
            elif start <= verb_idx < end:
                # Use the row's own label rather than a constant 1, so a
                # literal verb (label 0) is not tagged as metaphorical.
                token_labels.append(verb_label)
            else:
                token_labels.append(0)
        label_encodings.append(token_labels)

    encodings['labels'] = label_encodings
    return encodings

dataset = dataset.map(encode, batched=True)

# 3. Training
# `encode` produces one label per token (a 512-long list per example), so the
# model needs a token-classification head. A sequence-classification head
# expects a single label per example and fails on these labels with
# "Expected input batch_size (8) to match target batch_size (4096)".
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("kangela/Metaphor-FineTuned-BERT", num_labels=2)

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=200,   # Evaluate on the validation split every 200 steps
    save_steps=200,   # Checkpoint at the same cadence as evaluation
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

trainer.train()

# Evaluate on Test Data
results = trainer.evaluate(dataset["test"])

# Show the test metrics, as the earlier training cell does for its results.
print(results)



Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 25627.11it/s]

Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 732.08it/s]

Generating train split: 0 examples [00:00, ? examples/s]
  0%|          | 0/8000 [10:57<?, ?it/s]


DatasetGenerationError: An error occurred while generating the dataset