Creating a new notebook:
Here's an outline for a Jupyter notebook on Metaphor Detection using Machine Learning and RoBERTa:

* **Data Loading and Preprocessing**
* **Token-Level Data Creation**  
* **Baseline Logistic Regression Model**
* **Context-Enhanced Logistic Regression**
* **Advanced Classical ML Models**
* **RoBERTa Token Classification Dataset**
* **RoBERTa Model Training**
* **Class-Balanced RoBERTa Training**
* **Model Evaluation and Comparison**



## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

In [2]:
!pip install datasketch
from datasketch import MinHash, MinHashLSH
from datasets import Dataset

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5


In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight

In [4]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
import torch
import torch.nn as nn


In [5]:
import warnings
warnings.filterwarnings("ignore")

## Data Loading and Preprocessing

In [6]:
df = pd.read_parquet("0000.parquet") # read data into a pandas df
df.head()

Unnamed: 0,document_name,words,pos_tags,met_type,meta
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[AJS, AJ0, NN1, VVZ, AJ0, NN1, PUN, NP0, NP0, ...","[{'type': 'mrw/met', 'word_indices': [3]}, {'t...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
1,a1e-fragment01,"[By, FRANK, KANE]","[PRP, NP0, NP0-NN1]",[],"[N/A, N/A, N/A]"
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[PNP, VVZ, CJT, NP0, NP0, PUN, AT0, AJS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [16]}, {'...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[PNP, VHZ, XX0, AV0, VVN, AT0, NN1, POS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [6]}]","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[AT0, AJ0, NN1, PRF, NP0, NN2, PUN, PRP, DTQ, ...","[{'type': 'mrw/met', 'word_indices': [2]}, {'t...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."


In [7]:
df = df.iloc[:, :-1]  # dropping the last column
df = df[df['met_type'].apply(lambda x: len(x) > 0)] # keeping only sent with metaphors
df.shape

(8220, 4)

for the dataset description: http://www.vismet.org/metcor/manual/index.php

### detection

## RoBERTa token classification model for detection

## Sentence-level

doc-level dataframe → sentence-level dataframe

In [8]:
# create a copy of df
sent_df = df.copy()
sent_df.head()

Unnamed: 0,document_name,words,pos_tags,met_type
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[AJS, AJ0, NN1, VVZ, AJ0, NN1, PUN, NP0, NP0, ...","[{'type': 'mrw/met', 'word_indices': [3]}, {'t..."
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[PNP, VVZ, CJT, NP0, NP0, PUN, AT0, AJS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [16]}, {'..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[PNP, VHZ, XX0, AV0, VVN, AT0, NN1, POS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [6]}]"
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[AT0, AJ0, NN1, PRF, NP0, NN2, PUN, PRP, DTQ, ...","[{'type': 'mrw/met', 'word_indices': [2]}, {'t..."
5,a1e-fragment01,"[If, he, had, taken, his, own, rule, seriously...","[CJS, PNP, VHD, VVN, DPS, DT0, NN1, AV0, PUN, ...","[{'type': 'mrw/met', 'word_indices': [3]}, {'t..."


In [9]:
# Create a 'labels' column in df with lists of 0s and 1s
sent_df['labels'] = sent_df.apply(lambda row: [1 if any(idx in ann.get('word_indices', []) for ann in row['met_type']) else 0 for idx in range(len(row['words']))], axis=1)
sent_df=sent_df.drop(columns=['met_type'])

sent_df.head()

Unnamed: 0,document_name,words,pos_tags,labels
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[AJS, AJ0, NN1, VVZ, AJ0, NN1, PUN, NP0, NP0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[PNP, VVZ, CJT, NP0, NP0, PUN, AT0, AJS, NN1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[PNP, VHZ, XX0, AV0, VVN, AT0, NN1, POS, NN1, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[AT0, AJ0, NN1, PRF, NP0, NN2, PUN, PRP, DTQ, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."
5,a1e-fragment01,"[If, he, had, taken, his, own, rule, seriously...","[CJS, PNP, VHD, VVN, DPS, DT0, NN1, AV0, PUN, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
def get_simplified_metaphor_pos(pos_tag, label):
    if label == 0:
        return "na"
    else: # label is 1 (metaphorical)
        if pos_tag.startswith("V"):
            return "verb"
        elif pos_tag.startswith("N"):
            return "noun"
        elif pos_tag.startswith("AJ"):
            return "adj"
        else:
            return "other"

# Apply the function element-wise to the lists of pos_tags and labels
sent_df['simple_pos'] = sent_df.apply(
    lambda row: [get_simplified_metaphor_pos(pos, label) for pos, label in zip(row['pos_tags'], row['labels'])],
    axis=1
)

sent_df = sent_df.drop(columns=['pos_tags'])

# Display the first few rows with the new 'simplified_metaphor_pos' column
sent_df.head()

Unnamed: 0,document_name,words,labels,simple_pos
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[na, na, na, verb, na, noun, na, na, na, na, n..."
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[na, na, na, na, na, na, na, na, na, na, na, n..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[na, na, na, na, na, na, noun, na, na, na, na]"
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[na, na, noun, na, na, na, na, other, na, na, ..."
5,a1e-fragment01,"[If, he, had, taken, his, own, rule, seriously...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[na, na, na, verb, na, na, na, na, na, na, na,..."


In [11]:
# average sentence length
print(f"Average sentence length: {sent_df['words'].apply(len).mean():.2f} tokens")
# max and min
print(f"Max sentence length: {sent_df['words'].apply(len).max()} tokens")
print(f"Min sentence length: {sent_df['words'].apply(len).min()} tokens")

Average sentence length: 21.18 tokens
Max sentence length: 127 tokens
Min sentence length: 1 tokens


In [12]:
# Initialize tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("distilroberta-base", add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

In [13]:
# class MetaphorSentenceDataset(Dataset):
#     def __init__(self, df, tokenizer, max_len=64):
#         self.df = df.reset_index(drop=True)
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#         self.encodings = {
#             'input_ids': [],
#             'attention_mask': [],
#             'labels': [],
#             'word_ids': [],
#             'simple_pos': []
#         }

#         # Mapping for simplified POS categories
#         self.simple_pos_mapping = {
#             'na': 0, 'verb': 1, 'noun': 2, 'adj': 3, 'other': 4,
#             "SPECIAL": -1, "SUBWORD": -1, "UNKNOWN_POS": -1, "PAD": -1
#         }

#         for _, row in self.df.iterrows():
#             words = [str(w) for w in row["words"]]
#             labels = [int(l) for l in row["labels"]]
#             simple_pos = [str(p) for p in row["simple_pos"]]

#             # tokenize
#             encoding = self.tokenizer(
#                 words,
#                 is_split_into_words=True,
#                 padding="max_length",
#                 truncation=True,
#                 max_length=self.max_len,
#                 return_tensors="pt"
#             )
#             word_ids = encoding.word_ids(batch_index=0)

#             aligned_labels, aligned_pos = [], []
#             for i, word_id in enumerate(word_ids):
#                 if word_id is None:
#                     aligned_labels.append(-100)
#                     aligned_pos.append(self.simple_pos_mapping["SPECIAL"])
#                 else:
#                     aligned_labels.append(labels[word_id])
#                     aligned_pos.append(self.simple_pos_mapping.get(simple_pos[word_id],
#                                                                   self.simple_pos_mapping["UNKNOWN_POS"]))

#             self.encodings['input_ids'].append(encoding['input_ids'].squeeze(0))
#             self.encodings['attention_mask'].append(encoding['attention_mask'].squeeze(0))
#             self.encodings['labels'].append(torch.tensor(aligned_labels, dtype=torch.long))
#             self.encodings['word_ids'].append(torch.tensor([wid if wid is not None else -1 for wid in word_ids], dtype=torch.long))
#             self.encodings['simple_pos'].append(torch.tensor(aligned_pos, dtype=torch.long))

#     def __len__(self):
#         return len(self.encodings['input_ids'])

#     def __getitem__(self, idx):
#         return {key: self.encodings[key][idx] for key in self.encodings}


In [40]:
from torch.utils.data import Dataset
import torch

class MetaphorSentenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Encodings for Trainer
        self.encodings = {
            'input_ids': [],
            'attention_mask': [],
            'labels': []
        }

        # Separate lists to use later for evaluation
        self.word_ids_list = []
        self.simple_pos_list = []

        # POS mapping (simplified)
        self.simple_pos_mapping = {'na': 'na', 'verb': 'verb', 'noun': 'noun',
                                   'adj': 'adj', 'other': 'other', 'SPECIAL': 'SPECIAL',
                                   'SUBWORD': 'SUBWORD', 'UNKNOWN_POS': 'UNKNOWN_POS', 'PAD': 'PAD'}

        for idx, row in self.df.iterrows():
            words = [str(w) for w in row["words"]]
            labels = [int(l) for l in row["labels"]]
            simple_pos = [str(p) for p in row["simple_pos"]]

            encoding = self.tokenizer(
                words,
                is_split_into_words=True,
                padding="max_length",
                truncation=True,
                max_length=self.max_len,
                return_tensors="pt"
            )

            word_ids = encoding.word_ids(batch_index=0)
            aligned_labels = []
            aligned_pos = []

            prev_word_id = None
            for i, word_id in enumerate(word_ids):
                if word_id is None:
                    aligned_labels.append(-100)
                    aligned_pos.append("SPECIAL")
                elif word_id != prev_word_id:
                    aligned_labels.append(labels[word_id])
                    aligned_pos.append(simple_pos[word_id])
                    prev_word_id = word_id
                else:
                    aligned_labels.append(-100)
                    aligned_pos.append("SUBWORD")

            # Convert to tensors
            self.encodings['input_ids'].append(encoding['input_ids'].squeeze(0))
            self.encodings['attention_mask'].append(encoding['attention_mask'].squeeze(0))
            self.encodings['labels'].append(torch.tensor(aligned_labels, dtype=torch.long))

            # Save for evaluation
            self.word_ids_list.append(word_ids)
            self.simple_pos_list.append(aligned_pos)

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Only keys Trainer expects
        return {key: self.encodings[key][idx] for key in self.encodings}


In [41]:
# # Example alignment:
# Original words: ["running", "quickly", "home"]
# Original labels: [1, 0, 0]  # "running" is metaphorical

# # After tokenization:
# Tokens: ["<s>", "running", "quickly", "home", "</s>", "<pad>", "<pad>"]
# Labels: [-100, 1, 0, 0, -100, -100, -100]
# #        ^     ^  ^  ^   ^     ^      ^
# #        |     |  |  |   |     |      └─ padding
# #        |     |  |  |   |     └─ padding
# #        |     |  |  |   └─ end token (ignore)
# #        |     |  |  └─ "home" (literal)
# #        |     |  └─ "quickly" (literal)
# #        |     └─ "running" (metaphor)
# #        └─ start token (ignore)

Original word: "running" → Subwords: ["run", "ning"]
We need to decide which subword tokens get which labels

Decision Rules:

Special tokens (word_id is None) → -100 (ignored in loss)
First subword of each word → Use original label
Additional subwords of same word → -100 (ignored)

### Train test split

In [42]:
sent_df.head()

Unnamed: 0,document_name,words,labels,simple_pos
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[na, na, na, verb, na, noun, na, na, na, na, n..."
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[na, na, na, na, na, na, na, na, na, na, na, n..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[na, na, na, na, na, na, noun, na, na, na, na]"
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[na, na, noun, na, na, na, na, other, na, na, ..."
5,a1e-fragment01,"[If, he, had, taken, his, own, rule, seriously...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[na, na, na, verb, na, na, na, na, na, na, na,..."


In [43]:
# Split by documents to prevent leakage
doc_ids_all = sent_df["document_name"].unique()

# First split into train and temp (validation + test) document IDs
train_ids, temp_ids = train_test_split(doc_ids_all, test_size=0.3, random_state=42) # e.g., 70% train, 30% temp

# Split temp into validation and test document IDs
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42) # e.g., 15% validation, 15% test

# Create sentence-level DataFrames for train, validation, and test using filtered document IDs
train_df_all = sent_df[sent_df["document_name"].isin(train_ids)].copy().reset_index(drop=True)
val_df_all   = sent_df[sent_df["document_name"].isin(val_ids)].copy().reset_index(drop=True)
test_df_all  = sent_df[sent_df["document_name"].isin(test_ids)].copy().reset_index(drop=True)

print(f"Train size (all sentences): {len(train_df_all)}")
print(f"Val size (all sentences): {len(val_df_all)}")
print(f"Test size (all sentences): {len(test_df_all)}")

Train size (all sentences): 5746
Val size (all sentences): 1419
Test size (all sentences): 1055


In [44]:
# Sample from the filtered and split dataframes
train_df_sampled = train_df_all.sample(n=2000, random_state=42).reset_index(drop=True)
val_df_sampled   = val_df_all.sample(n=500, random_state=42).reset_index(drop=True)
test_df_sampled  = test_df_all.sample(n=500, random_state=42).reset_index(drop=True)

print(f"Train size: {len(train_df_sampled)}")
print(f"Val size: {len(val_df_sampled)}")
print(f"Test size: {len(test_df_sampled)}")

# Create the datasets from the sampled dataframes
train_dataset = MetaphorSentenceDataset(train_df_sampled, tokenizer, max_len=32)
val_dataset = MetaphorSentenceDataset(val_df_sampled, tokenizer, max_len=32)
test_dataset = MetaphorSentenceDataset(test_df_sampled, tokenizer, max_len=32)

Train size: 2000
Val size: 500
Test size: 500


In [56]:
# display test_dataset
test_dataset[0]

{'input_ids': tensor([    0, 18223,  1415,   358, 10468,    10,  1421,   209,   360,  2156,
          1586, 24398,   802,    19,    10,  2842,     9, 32130, 17707,  7040,
           479,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([-100,    0,    0,    0,    0,    0,    1,    1,    0,    0,    0, -100,
            0,    1,    0,    1,    0,    1, -100,    0,    0, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100])}

## Class balancing

In [45]:

# Calculate class weights for imbalanced data
train_labels_flat = [label for labels in train_df_all["labels"] for label in labels]
unique_classes = np.unique(train_labels_flat)
class_weights = compute_class_weight('balanced', classes=unique_classes, y=train_labels_flat)

print(f"Class distribution in training data:")
print(f"Literal (0): {train_labels_flat.count(0)} tokens ({100*train_labels_flat.count(0)/len(train_labels_flat):.1f}%)")
print(f"Metaphor (1): {train_labels_flat.count(1)} tokens ({100*train_labels_flat.count(1)/len(train_labels_flat):.1f}%)")
print(f"Enhanced class weights: Literal={class_weights[0]:.2f}, Metaphor={class_weights[1]:.2f}")


Class distribution in training data:
Literal (0): 100946 tokens (85.0%)
Metaphor (1): 17820 tokens (15.0%)
Enhanced class weights: Literal=0.59, Metaphor=3.33


### Custom Trainer with Weighted Loss

In [46]:
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.FloatTensor(class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Enhanced compute_loss with weighted cross entropy for class imbalance"""
        expected_keys = ["input_ids", "attention_mask", "labels"]
        model_inputs = {key: inputs[key] for key in expected_keys if key in inputs}

        labels = model_inputs.get("labels") # Get labels separately as they are used in loss calculation

        # Pass only expected inputs to the model
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Move class weights to correct device
        device = logits.device
        class_weights_device = self.class_weights.to(device)

        # Create weighted cross entropy loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights_device, ignore_index=-100)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [47]:
# Enhanced compute metrics with detailed class-specific metrics
def compute_metrics_enhanced(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)

    # Remove ignored index (-100 values)
    predictions = predictions[labels != -100].flatten()
    labels = labels[labels != -100].flatten()

    # Calculate metrics for both classes
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None, zero_division=0)
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
        'literal_f1': f1[0] if len(f1) > 0 else 0.0,
        'metaphor_f1': f1[1] if len(f1) > 1 else 0.0,
        'literal_precision': precision[0] if len(precision) > 0 else 0.0,
        'metaphor_precision': precision[1] if len(precision) > 1 else 0.0,
        'literal_recall': recall[0] if len(recall) > 0 else 0.0,
        'metaphor_recall': recall[1] if len(recall) > 1 else 0.0,
    }

1636a7c4bf2237ac398dd011cd9d5f7804e65863


In [48]:
import os
from google.colab import userdata

os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")

In [49]:

# Create model and training configuration
model_balanced = RobertaForTokenClassification.from_pretrained("distilroberta-base", num_labels=2)

training_args_bal_fast = TrainingArguments(
    output_dir="./metaphor_model_fast_balanced",
    eval_strategy="epoch",           # ✅ Skip all evaluation during training
    save_strategy="epoch",           # ✅ Skip saving checkpoints
    learning_rate=2e-5,           # ✅ Higher learning rate (2x faster convergence)
    per_device_train_batch_size=64,  # ✅ Larger batches (if memory allows)
    per_device_eval_batch_size=64,
    num_train_epochs=2,           # ✅ Just 1 epoch
    weight_decay=0.01,
    logging_steps=25,             # ✅ Less frequent logging
    warmup_steps=0,               # ✅ No warmup (saves time)
    dataloader_num_workers=4,     # ✅ More parallel data loading
    fp16=True,                    # ✅ Mixed precision (faster + less memory)
    remove_unused_columns=False,  # ✅ Skip column validation
    dataloader_drop_last=True,    # ✅ Skip partial batches
)

# training_args_balanced = TrainingArguments(
#     output_dir="./metaphor_model_balanced",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_dir="./logs_balanced",
#     logging_steps=50,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_metaphor_f1",
#     greater_is_better=True,
#     warmup_steps=100,
# )

# Initialize weighted trainer
trainer_balanced = WeightedTrainer(
    class_weights=class_weights,
    model=model_balanced,
    args=training_args_bal_fast,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_enhanced,
)

# Train the balanced model
print("🚀 Training class-balanced RoBERTa model...")
trainer_balanced.train()

print("✅ Class-balanced training completed with enhanced metaphor detection")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Training class-balanced RoBERTa model...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Literal F1,Metaphor F1,Literal Precision,Metaphor Precision,Literal Recall,Metaphor Recall
1,0.6011,0.439133,0.795698,0.818976,0.874572,0.795698,0.868097,0.547116,0.95779,0.414002,0.793764,0.806397
2,0.4161,0.387233,0.807806,0.830147,0.888545,0.807806,0.875438,0.579481,0.970387,0.435593,0.797414,0.86532


✅ Class-balanced training completed with enhanced metaphor detection


In [66]:

predictions = trainer_balanced.predict(test_dataset)
y_pred_logits = predictions.predictions
y_pred = y_pred_logits.argmax(axis=-1)
mask = predictions.label_ids != -100  # Only consider non-ignored tokens
y_true_clean = predictions.label_ids[mask].flatten()
y_pred_clean = y_pred[mask].flatten()

print("Overall Performance (Class-Balanced Model):")
print(classification_report(y_true_clean, y_pred_clean, target_names=['Literal', 'Metaphor'], zero_division=0))

aligned_pos_clean = []

for i, mask_row in enumerate(mask):
    word_ids = test_dataset.word_ids_list[i]
    simple_pos = test_dataset.simple_pos_list[i]

    aligned_pos_clean.extend([simple_pos[j] for j, m in enumerate(mask_row) if m])



pos_comparisons = {
    "Noun vs. Literal": ["noun", "na"],
    "Adjective vs. Literal": ["adj", "na"],
    "Verb vs. Literal": ["verb", "na"],
    "Other vs. Literal": ["other", "na"]
}

for comp_name, categories in pos_comparisons.items():
    comparison_mask = np.array([pos in categories for pos in aligned_pos_clean])

    if comparison_mask.sum() > 0:
        y_true_sub = y_true_clean[comparison_mask]
        y_pred_sub = y_pred_clean[comparison_mask]

        print(f"\n Classification Report for '{comp_name}'")
        print(classification_report(y_true_sub, y_pred_sub, target_names=['Literal', 'Metaphor'], zero_division=0))
    else:
        print(f"Comparison '{comp_name}': No relevant tokens found.")

Overall Performance (Class-Balanced Model):
              precision    recall  f1-score   support

     Literal       0.94      0.90      0.92      6575
    Metaphor       0.55      0.65      0.60      1188

    accuracy                           0.87      7763
   macro avg       0.74      0.78      0.76      7763
weighted avg       0.88      0.87      0.87      7763


--- Classification Report for 'Noun vs. Literal' ---
              precision    recall  f1-score   support

     Literal       0.97      0.90      0.94      6575
    Metaphor       0.15      0.38      0.21       282

    accuracy                           0.88      6857
   macro avg       0.56      0.64      0.57      6857
weighted avg       0.94      0.88      0.91      6857


--- Classification Report for 'Adjective vs. Literal' ---
              precision    recall  f1-score   support

     Literal       0.99      0.90      0.94      6575
    Metaphor       0.04      0.24      0.06        98

    accuracy             

The model is very good at catching metaphors (recall 0.89) → it rarely misses them.

But precision is lower (0.52) → it often marks literal tokens as metaphor.

This is expected in imbalanced token-level setups: the model learns a “when in doubt, say metaphor” bias.

## Fast hyperparameter search

In [51]:
def quick_evaluate_balance(weights, train_size=1000, val_size=400):
    """Quickly evaluate class weights on small samples."""

    # Tiny training and validation datasets
    tiny_train_dataset = MetaphorSentenceDataset(
        train_df_sampled.sample(train_size, random_state=42), tokenizer, max_len=32
    )
    tiny_val_dataset = MetaphorSentenceDataset(
        val_df_sampled.sample(val_size, random_state=42), tokenizer, max_len=32
    )

    # Ultra-fast training
    quick_args = TrainingArguments(
        output_dir="./temp_model",
        eval_strategy="no",
        save_strategy="no",
        learning_rate=1e-4,
        per_device_train_batch_size=32,
        num_train_epochs=1,
        logging_steps=999999,
        warmup_steps=0,
        fp16=True
    )

    model = RobertaForTokenClassification.from_pretrained("distilroberta-base", num_labels=2)
    trainer = WeightedTrainer(
        class_weights=weights,
        model=model,
        args=quick_args,
        train_dataset=tiny_train_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Evaluate
    preds = trainer.predict(tiny_val_dataset)
    y_pred = preds.predictions.argmax(axis=-1)
    mask = preds.label_ids != -100
    y_true, y_pred_clean = preds.label_ids[mask], y_pred[mask]

    # Return metaphor F1
    f1_scores = precision_recall_fscore_support(y_true, y_pred_clean, average=None, zero_division=0)[2]
    return f1_scores[1] if len(f1_scores) > 1 else 0


# Test different class weight multipliers
balance_results = {}
for m in [0.3, 0.4, 0.5, 0.6]:
    balance_results[m] = quick_evaluate_balance([class_weights[0], class_weights[1] * m])
    print(f"Multiplier {m}: Metaphor F1 = {balance_results[m]:.3f}")

best_multiplier = max(balance_results, key=balance_results.get)
final_weights = [class_weights[0], class_weights[1] * best_multiplier]
print(f"Best balance multiplier: {best_multiplier}")


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Multiplier 0.3: Metaphor F1 = 0.644


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Multiplier 0.4: Metaphor F1 = 0.660


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Multiplier 0.5: Metaphor F1 = 0.652


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Multiplier 0.6: Metaphor F1 = 0.637
Best balance multiplier: 0.4


In [52]:

# Create model and training configuration
model_balanced = RobertaForTokenClassification.from_pretrained("distilroberta-base", num_labels=2)

training_args_bal_fast = TrainingArguments(
    output_dir="./metaphor_model_fast_balanced",
    eval_strategy="epoch",           # ✅ Skip all evaluation during training
    save_strategy="epoch",           # ✅ Skip saving checkpoints
    learning_rate=2e-5,           # ✅ Higher learning rate (2x faster convergence)
    per_device_train_batch_size=64,  # ✅ Larger batches (if memory allows)
    per_device_eval_batch_size=64,
    num_train_epochs=2,           # ✅ Just 1 epoch
    weight_decay=0.01,
    logging_steps=25,             # ✅ Less frequent logging
    warmup_steps=0,               # ✅ No warmup (saves time)
    dataloader_num_workers=4,     # ✅ More parallel data loading
    fp16=True,                    # ✅ Mixed precision (faster + less memory)
    remove_unused_columns=False,  # ✅ Skip column validation
    dataloader_drop_last=True,    # ✅ Skip partial batches
)

# Initialize weighted trainer
trainer_balanced = WeightedTrainer(
    class_weights=final_weights,
    model=model_balanced,
    args=training_args_bal_fast,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_enhanced,
)

# Train the balanced model
print("🚀 Training class-balanced RoBERTa model...")
trainer_balanced.train()

print("✅ Class-balanced training completed with enhanced metaphor detection")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Training class-balanced RoBERTa model...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Literal F1,Metaphor F1,Literal Precision,Metaphor Precision,Literal Recall,Metaphor Recall
1,0.5559,0.410999,0.865109,0.860557,0.857253,0.865109,0.921766,0.510917,0.910069,0.555556,0.933767,0.472918
2,0.3948,0.35006,0.866916,0.872914,0.882052,0.866916,0.920026,0.603801,0.94149,0.542526,0.899519,0.680679


✅ Class-balanced training completed with enhanced metaphor detection


In [64]:
preds_out = trainer_balanced.predict(test_dataset)
logits = preds_out.predictions           # (N, L, C)
y_pred = logits.argmax(axis=-1)          # (N, L)
y_true = preds_out.label_ids             # (N, L)

rows = []
n_samples = y_pred.shape[0]

for i in range(n_samples):
    word_ids = test_dataset.word_ids_list[i]
    pos_seq = test_dataset.simple_pos_list[i]
    sentence_words = test_dataset.df.loc[i, "words"]
    input_ids = test_dataset.encodings["input_ids"][i].tolist()

    seq_len = min(len(word_ids), y_pred.shape[1], len(input_ids), len(y_true[i]))
    for j in range(seq_len):
        if y_true[i][j] == -100:
            continue

        wid = word_ids[j]
        if wid is None or not (0 <= wid < len(sentence_words)):
            continue

        rows.append({
            "token": tokenizer.decode([int(input_ids[j])]).strip(),
            "original_word": sentence_words[wid],
            "pos": pos_seq[j],
            "true_label": "Metaphor" if int(y_true[i][j]) == 1 else "Literal",
            "pred_label": "Metaphor" if int(y_pred[i][j]) == 1 else "Literal",
            "sentence": " ".join(sentence_words)
        })

df_tokens = pd.DataFrame(rows)

# 🔹 Keep only true metaphors
df_tokens = df_tokens[df_tokens["true_label"] == "Metaphor"].reset_index(drop=True)

print(f"Collected {len(df_tokens)} metaphor tokens.")
display(df_tokens.sample(min(10, len(df_tokens))))

Collected 1188 metaphor tokens.


Unnamed: 0,token,original_word,pos,true_label,pred_label,sentence
55,get,get,verb,Metaphor,Metaphor,From eqn ( 3.26 ) we get [formula ] .
14,war,warily,other,Metaphor,Metaphor,Avon treads warily .
144,light,light,noun,Metaphor,Literal,Oh ! ’ she squealed as light dawned .
715,to,to,other,Metaphor,Metaphor,You might expect them to rush to La Mama 's aid .
366,maintained,maintained,verb,Metaphor,Metaphor,And though some knights might be supplied free...
1173,about,about,other,Metaphor,Metaphor,But there is nothing homely about the general ...
701,that,that,other,Metaphor,Literal,In that kind of free market he thrived and sur...
1057,keen,keen,adj,Metaphor,Literal,"When you 're feeling keen I said , you can do ..."
778,breadth,breadth,noun,Metaphor,Metaphor,Rowing was transformed in the breadth and dept...
977,as,as if,other,Metaphor,Literal,"This trick took hours of practice , up and dow..."


## Evaluate Final Model on Test Set

Now that the final model has been trained, evaluate its performance on the held-out test set (`test_dataset`) to get an unbiased measure of its generalization ability.

Use the evaluation code in cell `b80c325b`.

In [68]:
import numpy as np
from sklearn.metrics import classification_report

# Get predictions
preds_out = trainer_balanced.predict(test_dataset)
logits = preds_out.predictions
y_pred = logits.argmax(axis=-1)
y_true = preds_out.label_ids
mask = y_true != -100

# Cleaned arrays (only real tokens)
y_true_clean = y_true[mask].flatten()
y_pred_clean = y_pred[mask].flatten()

print("Overall Performance (Class-Balanced Model):")
print(classification_report(y_true_clean, y_pred_clean, target_names=['Literal', 'Metaphor'], zero_division=0))

# --- Align POS tags with non-ignored tokens ---
aligned_pos_clean = []
for i, mask_row in enumerate(mask):
    word_ids = test_dataset.word_ids_list[i]
    pos_tags = test_dataset.simple_pos_list[i]

    aligned_pos_clean.extend([pos_tags[j] for j, m in enumerate(mask_row) if m])

# --- POS-Conditioned Classification Reports ---
pos_comparisons = {
    "Noun vs. Literal": ["noun", "na"],
    "Adjective vs. Literal": ["adj", "na"],
    "Verb vs. Literal": ["verb", "na"],
    "Other vs. Literal": ["other", "na"]
}

for comp_name, categories in pos_comparisons.items():
    comparison_mask = np.array([pos in categories for pos in aligned_pos_clean])

    if comparison_mask.sum() > 0:
        y_true_sub = y_true_clean[comparison_mask]
        y_pred_sub = y_pred_clean[comparison_mask]

        print(f"\nClassification Report for '{comp_name}'")
        print(classification_report(y_true_sub, y_pred_sub, target_names=['Literal', 'Metaphor'], zero_division=0))
    else:
        print(f"Comparison '{comp_name}': No relevant tokens found.")


Overall Performance (Class-Balanced Model):
              precision    recall  f1-score   support

     Literal       0.94      0.90      0.92      6575
    Metaphor       0.55      0.65      0.60      1188

    accuracy                           0.87      7763
   macro avg       0.74      0.78      0.76      7763
weighted avg       0.88      0.87      0.87      7763


Classification Report for 'Noun vs. Literal'
              precision    recall  f1-score   support

     Literal       0.97      0.90      0.94      6575
    Metaphor       0.15      0.38      0.21       282

    accuracy                           0.88      6857
   macro avg       0.56      0.64      0.57      6857
weighted avg       0.94      0.88      0.91      6857


Classification Report for 'Adjective vs. Literal'
              precision    recall  f1-score   support

     Literal       0.99      0.90      0.94      6575
    Metaphor       0.04      0.24      0.06        98

    accuracy                           0.

In [65]:
## Evaluate on test set
predictions_balanced = trainer_balanced.predict(test_dataset)
y_pred_logits_balanced = predictions_balanced.predictions
y_pred_balanced = y_pred_logits_balanced.argmax(axis=-1)

# Filter out ignored tokens
mask_balanced = predictions_balanced.label_ids != -100
y_pred_clean_balanced = y_pred_balanced[mask_balanced]
y_true_clean_balanced = predictions_balanced.label_ids[mask_balanced]

print("Overall Performance (Class-Balanced Model):")
print(classification_report(y_true_clean_balanced, y_pred_clean_balanced, target_names=['Literal', 'Metaphor']))

Overall Performance (Class-Balanced Model):
              precision    recall  f1-score   support

     Literal       0.94      0.90      0.92      6575
    Metaphor       0.55      0.65      0.60      1188

    accuracy                           0.87      7763
   macro avg       0.74      0.78      0.76      7763
weighted avg       0.88      0.87      0.87      7763



## No class balancing

1636a7c4bf2237ac398dd011cd9d5f7804e65863

In [None]:
import os
from google.colab import userdata

os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")

In [None]:
model = RobertaForTokenClassification.from_pretrained("distilroberta-base", num_labels=2)

training_args_fast = TrainingArguments(
    output_dir="./metaphor_model_fast",
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",   # Save model at the end of each epoch
    learning_rate=2e-5,   # Higher learning rate for faster convergence
    per_device_train_batch_size=64,  # Increased batch size
    per_device_eval_batch_size=64, # Increased batch size
    num_train_epochs=2, # Increase epochs, as early stopping will handle duration
    weight_decay=0.01,
    logging_dir="./logs_balanced",
    logging_steps=100,    # Less frequent logging
    warmup_steps=50,      # Fewer warmup steps
    dataloader_num_workers=4,  # Increased parallel data loading
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1", # Or eval_metaphor_f1 if using enhanced metrics
    greater_is_better=True,
    fp16=True,            # Mixed precision for speed
    remove_unused_columns=False, # Added to prevent Trainer from trying to remove columns
)

# training_args = TrainingArguments(
#     output_dir="./metaphor_model",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
# #     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=50,
# )

# Note: You also need to define compute_metrics function
def compute_metrics(eval_pred):


    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)

    # Remove ignored index (our -100 values)
    predictions = predictions[labels != -100].flatten()
    labels = labels[labels != -100].flatten()

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


trainer = Trainer( # Use the standard Trainer here
    model=model,
    args=training_args_fast,
    train_dataset=train_dataset, # Use the tokenized dataset
    eval_dataset=test_dataset,   # Use the tokenized dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # from before
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Increased patience slightly
)

trainer.train()

In [None]:
model = RobertaForTokenClassification.from_pretrained("distilroberta-base", num_labels=2)

training_args_bal_fast = TrainingArguments(
    output_dir="./metaphor_model_fast_balanced",
    eval_strategy="epoch",           # ✅ Skip all evaluation during training
    save_strategy="epoch",           # ✅ Skip saving checkpoints
    learning_rate=2e-5,           # ✅ Higher learning rate (2x faster convergence)
    per_device_train_batch_size=64,  # ✅ Larger batches (if memory allows)
    per_device_eval_batch_size=64,
    num_train_epochs=2,           # ✅ Just 1 epoch
    weight_decay=0.01,
    logging_steps=25,             # ✅ Less frequent logging
    warmup_steps=0,               # ✅ No warmup (saves time)
    dataloader_num_workers=4,     # ✅ More parallel data loading
    fp16=True,                    # ✅ Mixed precision (faster + less memory)
    remove_unused_columns=False,  # ✅ Skip column validation
    dataloader_drop_last=True,    # ✅ Skip partial batches
)

# training_args_balanced = TrainingArguments(
#     output_dir="./metaphor_model_balanced",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_dir="./logs_balanced",
#     logging_steps=50,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_metaphor_f1",
#     greater_is_better=True,
#     warmup_steps=100,
# )

# Initialize weighted trainer
trainer_balanced = Trainer(
    model=model,
    args=training_args_bal_fast,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_enhanced,
)

# Train the balanced model
print("🚀 Training class-balanced RoBERTa model...")
trainer_balanced.train()

print("✅ Class-balanced training completed with enhanced metaphor detection")

In [None]:

# labels = [1 if len(mets)>0 else 0 for mets in train_df["labels"]]
# np.mean(labels)  # proportion of metaphor labels

In [None]:
# Calculate the proportion of sentences with metaphors in the full training data (after doc split)
full_sentence_labels = train_df_all['has_metaphor']
# Calculate the proportion of sentences with metaphors in the sampled training data
sample_sentence_labels = train_df_sampled['has_metaphor']


print(f"Full training data (after doc split): {np.mean(full_sentence_labels):.3f} ({sum(full_sentence_labels)}/{len(full_sentence_labels)})")
print(f"Sampled training data ({len(train_df_sampled)} sentences): {np.mean(sample_sentence_labels):.3f} ({sum(sample_sentence_labels)}/{len(sample_sentence_labels)})")

In [None]:
# y_true = true metaphor labels per token (0 or 1)
y_true = [label for labels in train_df_all["labels"] for label in labels]

In [None]:
# Simple overall performance
predictions = trainer.predict(test_dataset)
y_pred_logits = predictions.predictions
y_pred = y_pred_logits.argmax(axis=-1)

# Filter out ignored tokens
mask = predictions.label_ids != -100
y_pred_clean = y_pred[mask]
y_true_clean = predictions.label_ids[mask]

from sklearn.metrics import classification_report
print("Overall Performance:")
print(classification_report(y_true_clean, y_pred_clean, target_names=['Literal', 'Metaphor']))