In [1]:
import os
import json
import torch
import numpy as np
import glob

from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch.nn import CrossEntropyLoss
from transformers import DataCollatorForTokenClassification, RobertaPreTrainedModel, RobertaModel
from transformers.modeling_outputs import TokenClassifierOutput


from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification, # Added this import
    TrainingArguments,             # Added this import
    Trainer,                        # Added this import
)

from torch.utils.data import Dataset
import nltk


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the folder holding the VUA dataset files.
# NOTE(review): the loading cell currently spells this folder name out itself — confirm the two stay in sync.
data_dir = "vua_dataset"
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
model_name = "roberta-base"

In [3]:
# Make sure the NLTK resources used by this notebook are available locally.
# This cell should run successfully before any NLTK tokenizer/tagger call.
def _ensure_nltk_resource(resource_path, package, label):
    """Check that an NLTK resource is installed, downloading it if it is missing.

    Args:
        resource_path (str): Path handed to `nltk.data.find`, e.g. 'tokenizers/punkt'.
        package (str): Package identifier handed to `nltk.download`.
        label (str): Human-readable name used in the progress messages.

    Returns:
        bool: True when the resource is available afterwards, False otherwise.
    """
    try:
        try:
            nltk.data.find(resource_path)
            return True
        except LookupError:  # Raised by nltk.data.find when the resource is not installed
            print(f"NLTK {label} not found. Downloading...")
            # nltk.download reports failure through its return value rather than
            # by raising, so the success message must depend on it.
            if nltk.download(package, quiet=True):
                print(f"NLTK {label} downloaded.")
                return True
            print(f"NLTK {label} could not be downloaded.")
            return False
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(f"An unexpected error occurred during NLTK data check/download: {e}")
        return False


for _resource_path, _package, _label in (
    ('tokenizers/punkt', 'punkt', "'punkt' tokenizer data"),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger', "'averaged_perceptron_tagger'"),
):
    _ensure_nltk_resource(_resource_path, _package, _label)


In [4]:
# Build the tokenizer options once, then load with a single call.
tokenizer_kwargs = {"use_fast": True}
if "roberta" in model_name.lower():
    # RoBERTa checkpoints (e.g. "roberta-base") need a leading space added
    # when they are fed pre-tokenized (word-split) input.
    tokenizer_kwargs["add_prefix_space"] = True

tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)


In [5]:
def load_and_process_data_with_all_features(json_path):
    """
    Loads raw data from a JSONL file, groups it by sentence,
    and processes it to include both POS and FGPOS tags.

    Each non-blank line must be a JSON object with the keys "sentence",
    "w_index", "label", "POS" and "FGPOS". Records are grouped by their
    sentence text and ordered by "w_index". Only the annotated word positions
    are kept in the output, not every word of the sentence.

    Because grouping is done on the sentence text, a sentence that occurs
    several times in the file ends up in a single group. Repeated "w_index"
    values inside a group are therefore collapsed (the first record in file
    order wins) so that a word position is never emitted twice.

    Args:
        json_path (str): The path to the JSONL data file.

    Returns:
        list: A list of dictionaries, each containing "sentence_words", "labels",
              "pos_tags", and "fgpos_tags" (parallel lists, one item per kept word).
        set: A set of all unique POS tags.
        set: A set of all unique FGPOS tags.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
        IndexError: If a "w_index" is out of range for the space-split sentence.
    """
    # Group records while streaming the file; there is no need to buffer it first.
    sentence_groups = defaultdict(list)
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            entry = json.loads(line)
            sentence_groups[entry["sentence"]].append(entry)

    processed_data = []
    all_pos_tags = set()
    all_fgpos_tags = set()
    for sentence, entries in sentence_groups.items():
        # sorted() is stable, so among records sharing a w_index the one that
        # appeared first in the file comes first and is the one kept below.
        seen_indices = set()
        unique_entries = []
        for entry in sorted(entries, key=lambda x: x["w_index"]):
            if entry["w_index"] in seen_indices:
                continue
            seen_indices.add(entry["w_index"])
            unique_entries.append(entry)

        # w_index refers to positions in the sentence split on single spaces.
        original_words = sentence.split(' ')
        words_for_model = [original_words[e["w_index"]] for e in unique_entries]

        current_labels = [e["label"] for e in unique_entries]
        pos_tags_for_sentence = [e["POS"] for e in unique_entries]
        fgpos_tags_for_sentence = [e["FGPOS"] for e in unique_entries]

        all_pos_tags.update(pos_tags_for_sentence)
        all_fgpos_tags.update(fgpos_tags_for_sentence)

        processed_data.append({
            "sentence_words": words_for_model,
            "labels": current_labels,
            "pos_tags": pos_tags_for_sentence,
            "fgpos_tags": fgpos_tags_for_sentence
        })

    return processed_data, all_pos_tags, all_fgpos_tags

In [6]:
# --- Load and process TRAIN data ---
# Paths are derived from the `data_dir` config value so the dataset location
# is defined in exactly one place.
train_json_path = os.path.join(data_dir, "vua20_metaphor_train.json")
processed_train_data, train_pos_tags, train_fgpos_tags = load_and_process_data_with_all_features(train_json_path)

# --- Load and process TEST data ---
test_json_path = os.path.join(data_dir, "vua20_metaphor_test.json")
processed_test_data, test_pos_tags, test_fgpos_tags = load_and_process_data_with_all_features(test_json_path)

# --- Create POS tag vocabulary ---
# The vocabulary covers train and test tags so every tag seen later has an id;
# sorting makes the tag -> id mapping deterministic across runs.
all_pos_tags = sorted(train_pos_tags | test_pos_tags)
pos2id = {tag: i for i, tag in enumerate(all_pos_tags)}
pos_vocab_size = len(pos2id)

# --- Create FGPOS tag vocabulary ---
all_fgpos_tags = sorted(train_fgpos_tags | test_fgpos_tags)
fgpos2id = {tag: i for i, tag in enumerate(all_fgpos_tags)}
fgpos_vocab_size = len(fgpos2id)

print(f"POS vocabulary size: {pos_vocab_size}")
print(f"FGPOS vocabulary size: {fgpos_vocab_size}")
print(f"Number of training samples: {len(processed_train_data)}")
print(f"Number of test samples: {len(processed_test_data)}")

POS vocabulary size: 17
FGPOS vocabulary size: 41
Number of training samples: 10909
Number of test samples: 3601


In [7]:
def pre_tokenize_and_align_data(data, tokenizer, pos2id, fgpos2id, max_length=128):
    """
    Pre-processes the entire dataset by tokenizing and aligning labels,
    POS tags, and FGPOS tags.

    Every sentence is tokenized as pre-split words, truncated/padded to
    `max_length`. Only the first sub-token of each word carries the word's
    label and tag ids; special/padding positions and the remaining sub-tokens
    of a word are set to -100.

    Args:
        data (list): Items with the keys "sentence_words", "labels",
            "pos_tags" and "fgpos_tags" (parallel per-word lists).
        tokenizer: Callable tokenizer whose result offers `.word_ids(batch_index=0)`
            and `.items()`.
        pos2id (dict): Maps a POS tag to its integer id.
        fgpos2id (dict): Maps an FGPOS tag to its integer id.
        max_length (int): Sequence length every example is padded/truncated to.
            Defaults to 128. Words cut off by truncation get no label position.

    Returns:
        list: One dict of tensors per sentence: the tokenizer fields plus
              "labels", "pos_tag_ids" and "fgpos_tag_ids" (all torch.long).
    """
    ignore_index = -100
    tokenized_data = []
    for entry in data:
        sentence_words = entry["sentence_words"]
        word_labels = entry["labels"]
        word_pos_tags = entry["pos_tags"]
        word_fgpos_tags = entry["fgpos_tags"]

        raw_encoding = tokenizer(
            sentence_words,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            is_split_into_words=True,
        )

        labels = []
        pos_ids = []
        fgpos_ids = []
        previous_word_idx = None
        for word_idx in raw_encoding.word_ids(batch_index=0):
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation sub-token of the same word.
                labels.append(ignore_index)
                pos_ids.append(ignore_index)
                fgpos_ids.append(ignore_index)
            else:
                # First sub-token of a new word: carries the word-level annotations.
                labels.append(word_labels[word_idx])
                pos_ids.append(pos2id[word_pos_tags[word_idx]])
                fgpos_ids.append(fgpos2id[word_fgpos_tags[word_idx]])
            previous_word_idx = word_idx

        encoding = {k: torch.tensor(v) for k, v in raw_encoding.items()}
        encoding["labels"] = torch.tensor(labels, dtype=torch.long)
        encoding["pos_tag_ids"] = torch.tensor(pos_ids, dtype=torch.long)
        encoding["fgpos_tag_ids"] = torch.tensor(fgpos_ids, dtype=torch.long)

        tokenized_data.append(encoding)

    return tokenized_data

# --- Tokenize and align both splits once, up front ---
print("Pre-tokenizing datasets...")
train_tokenized_data, test_tokenized_data = [
    pre_tokenize_and_align_data(split, tokenizer, pos2id, fgpos2id)
    for split in (processed_train_data, processed_test_data)
]
print("Pre-tokenizing complete.")

class MetaphorDatasetWithAllFeatures(Dataset):
    """Thin `Dataset` wrapper around an already tokenized sequence of examples."""

    def __init__(self, tokenized_data):
        """Keep a reference to the pre-tokenized examples (no copy is made)."""
        self.tokenized_data = tokenized_data

    def __len__(self) -> int:
        """Number of stored examples."""
        examples = self.tokenized_data
        return len(examples)

    def __getitem__(self, idx):
        """Return the stored example at position `idx` unchanged."""
        examples = self.tokenized_data
        return examples[idx]
        
# --- Wrap the pre-tokenized splits as Dataset objects ---
train_dataset, test_dataset = (
    MetaphorDatasetWithAllFeatures(split)
    for split in (train_tokenized_data, test_tokenized_data)
)

Pre-tokenizing datasets...
Pre-tokenizing complete.
Pre-tokenizing complete.


In [8]:
class RobertaForTokenClassificationWithAllPOS(RobertaPreTrainedModel):
    """RoBERTa token classifier whose head also sees POS and FGPOS tag embeddings.

    For every token, the RoBERTa hidden state is concatenated with a learned
    embedding of its POS tag and of its FGPOS tag; the result goes through
    dropout and a linear classifier. The model never computes a loss itself:
    `forward` always returns `loss=None`, so the loss has to be computed by
    the caller.

    Args:
        config: Model configuration (provides `num_labels`, `hidden_size`
            and `hidden_dropout_prob`).
        pos_vocab_size (int): Number of distinct POS tag ids.
        fgpos_vocab_size (int): Number of distinct FGPOS tag ids.
        pos_embedding_dim (int): Size of the POS tag embedding.
        fgpos_embedding_dim (int): Size of the FGPOS tag embedding.
    """

    def __init__(self, config, pos_vocab_size, fgpos_vocab_size, pos_embedding_dim=50, fgpos_embedding_dim=50):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)

        self.pos_embedding = torch.nn.Embedding(pos_vocab_size, pos_embedding_dim)
        self.fgpos_embedding = torch.nn.Embedding(fgpos_vocab_size, fgpos_embedding_dim)

        # The input to the classifier is the concatenation of RoBERTa's output and all POS embeddings
        classifier_input_size = config.hidden_size + pos_embedding_dim + fgpos_embedding_dim

        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(classifier_input_size, config.num_labels)

        # Initialize weights
        self.post_init()

    @staticmethod
    def _masked_tag_embeddings(embedding, tag_ids, sequence_output):
        """Embed `tag_ids`, producing zero vectors wherever the id is -100.

        When `tag_ids` is None, an all-zero tensor matching the leading
        dimensions of `sequence_output` is returned, which is what an input
        consisting only of -100 would produce.
        """
        if tag_ids is None:
            return torch.zeros(
                *sequence_output.shape[:-1],
                embedding.embedding_dim,
                dtype=embedding.weight.dtype,
                device=sequence_output.device,
            )
        keep = (tag_ids != -100).unsqueeze(-1)
        # -100 is not a valid embedding index: clamp it to 0 for the lookup and
        # zero the looked-up vector afterwards (no data-dependent indexing).
        embedded = embedding(torch.clamp(tag_ids, min=0))
        return embedded * keep.to(embedded.dtype)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        pos_tag_ids=None,
        fgpos_tag_ids=None,
        labels=None, # accepted for interface compatibility; not used here
        **kwargs
    ):
        """Run the encoder and classify each token.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the RoBERTa encoder.
            pos_tag_ids: Per-token POS tag ids with -100 at positions that have
                no tag; None is treated as "no tag anywhere".
            fgpos_tag_ids: Same as `pos_tag_ids`, for FGPOS tags.
            labels: Ignored; no loss is computed in this method.
            **kwargs: Forwarded to the RoBERTa encoder.

        Returns:
            TokenClassifierOutput: `logits` from the classifier, `loss=None`, plus
            the encoder's `hidden_states` and `attentions`.
        """
        roberta_output = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        sequence_output = roberta_output[0]

        pos_embeddings = self._masked_tag_embeddings(self.pos_embedding, pos_tag_ids, sequence_output)
        fgpos_embeddings = self._masked_tag_embeddings(self.fgpos_embedding, fgpos_tag_ids, sequence_output)

        # Concatenate RoBERTa output with both POS embeddings
        combined_output = torch.cat([sequence_output, pos_embeddings, fgpos_embeddings], dim=-1)

        combined_output = self.dropout(combined_output)
        logits = self.classifier(combined_output)

        # The model ONLY returns logits; the loss is left to the caller.
        return TokenClassifierOutput(
            loss=None, # Loss is explicitly None
            logits=logits,
            hidden_states=roberta_output.hidden_states,
            attentions=roberta_output.attentions,
        )

In [9]:
# --- Metrics callback for the Trainer ---
def compute_metrics(p):
    """Score token-level predictions, skipping positions labelled -100.

    Args:
        p: A pair `(predictions, labels)`; `predictions` holds per-token class
            scores (the class axis is axis 2) and `labels` holds the gold ids.

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall', where the last three
              are computed with class 1 as the positive class.
    """
    scores, gold = p
    # Highest-scoring class per token.
    predicted = np.argmax(scores, axis=2)

    # Boolean mask of the positions that actually carry a label; indexing with
    # it flattens both arrays in the same (row-major) order.
    gold = np.asarray(gold)
    scored_positions = gold != -100
    true_labels = gold[scored_positions]
    predicted_labels = predicted[scored_positions]

    # Binary averaging with class 1 as the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary', pos_label=1, zero_division=0
    )
    accuracy = accuracy_score(true_labels, predicted_labels)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [10]:
def get_class_weights(train_dataset):
    """Compute inverse-frequency weights for classes 0 and 1.

    Labels equal to -100 are ignored. The weight of a class is
    `total / count(class)`, where `total` is the number of counted labels.

    Args:
        train_dataset: Iterable of dicts whose 'labels' entry is a tensor
            (anything with `.numpy()`) or an array-like of label ids.

    Returns:
        tuple: `(weights, counts, total)` where `weights` is a float tensor
        `[weight_0, weight_1]`, `counts` is a Counter of label -> occurrences
        and `total` is the number of counted labels. If fewer than two distinct
        labels are present (including an empty dataset), the weights fall back
        to `[1.0, 1.0]`.
    """
    # The dataset returns a dictionary per example, not a tuple
    label_arrays = []
    for example in train_dataset:
        example_labels = example['labels']
        if hasattr(example_labels, 'numpy'):
            example_labels = example_labels.numpy()
        label_arrays.append(np.asarray(example_labels).reshape(-1))

    if not label_arrays:
        # np.concatenate rejects an empty list; an empty dataset has no labels.
        return torch.tensor([1.0, 1.0], dtype=torch.float), Counter(), 0

    labels_flat = np.concatenate(label_arrays)
    labels_filtered = labels_flat[labels_flat != -100]
    counts = Counter(labels_filtered.tolist())
    total = sum(counts.values())

    if len(counts) < 2:
        # Handle case where one class is missing in a fold: no re-weighting,
        # but still report the real number of counted labels.
        return torch.tensor([1.0, 1.0], dtype=torch.float), counts, total

    # A class that is absent from `counts` falls back to a count of 1 to avoid division by zero
    weight_0 = total / counts.get(0, 1)
    weight_1 = total / counts.get(1, 1)

    return torch.tensor(
        [weight_0, weight_1], dtype=torch.float
    ), counts, total

In [11]:
class WeightedLossTrainer(Trainer):
    """Trainer that computes a (optionally class-weighted) cross-entropy loss itself.

    The extra keyword argument `class_weights` is a tensor of per-class weights,
    or None for an unweighted loss; all other arguments go to `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Put the weights on the training device once, up front.
        self.class_weights = (
            None if class_weights is None else class_weights.to(self.args.device)
        )
        # A single loss module, built once and reused for every batch.
        self.loss_fct = CrossEntropyLoss(weight=self.class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Take 'labels' out of `inputs`, run the model and compute the loss on its logits.

        Returns the loss, or `(loss, outputs)` when `return_outputs` is true.
        """
        gold = inputs.pop("labels")
        outputs = model(**inputs)

        # Flatten to (tokens, classes) vs. (tokens,) for the loss module.
        n_classes = self.model.config.num_labels
        flat_logits = outputs.logits.view(-1, n_classes)
        loss = self.loss_fct(flat_logits, gold.view(-1))

        if return_outputs:
            return (loss, outputs)
        return loss

In [12]:
# Cross-validation configuration: K shuffled folds with a fixed split seed.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# One accumulator per validation metric, filled with one value per fold.
fold_f1s, fold_precisions, fold_recalls, fold_losses = [], [], [], []

In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Start every run of this cell from empty accumulators. Otherwise re-running the
# cell would append a second set of fold results and skew the aggregates below.
fold_f1s = []
fold_precisions = []
fold_recalls = []
fold_losses = []

# Parent folder for the per-fold outputs and saved models.
cv_output_root = 'results_with_all_pos_reduced_dim_compiled'

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_tokenized_data)):
    print(f"\n=== Fold {fold_idx + 1}/{K} ===")

    train_split = [train_tokenized_data[i] for i in train_idx]
    val_split = [train_tokenized_data[i] for i in val_idx]

    train_dataset_fold = MetaphorDatasetWithAllFeatures(train_split)
    val_dataset_fold = MetaphorDatasetWithAllFeatures(val_split)

    # Class weights are derived from this fold's training portion only.
    class_weights, _, _ = get_class_weights(train_dataset_fold)

    # Instantiate the custom model for each fold
    model = RobertaForTokenClassificationWithAllPOS.from_pretrained(
        model_name,
        num_labels=2,
        pos_vocab_size=pos_vocab_size,
        fgpos_vocab_size=fgpos_vocab_size,
        pos_embedding_dim=50, # Can be tuned
        fgpos_embedding_dim=20 # Reduced dimension for regularization
    )

    idx_folder = os.path.join(cv_output_root, f'fold_{fold_idx + 1}')
    os.makedirs(idx_folder, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=idx_folder,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="no", # We save manually at the end
        learning_rate=2e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_steps=50,
        seed=42 + fold_idx, # a different training seed per fold
        fp16=True, # Keep mixed-precision for performance
        dataloader_num_workers=0,
        dataloader_pin_memory=True, # Pin memory for faster data transfer
        torch_compile=False, # Disabling compilation due to persistent Windows errors
        remove_unused_columns=False,
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_fold,
        eval_dataset=val_dataset_fold,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        class_weights=class_weights,
    )

    trainer.train()
    metrics = trainer.evaluate()

    fold_f1s.append(metrics["eval_f1"])
    fold_precisions.append(metrics["eval_precision"])
    fold_recalls.append(metrics["eval_recall"])
    fold_losses.append(metrics["eval_loss"])

    trainer.save_model(idx_folder)
    print(f"Saved model for fold {fold_idx + 1} to {idx_folder}")

# Aggregate results
mean_f1 = np.mean(fold_f1s)
std_f1 = np.std(fold_f1s)
mean_precision = np.mean(fold_precisions)
mean_recall = np.mean(fold_recalls)
mean_loss = np.mean(fold_losses)

print(f"\nCross-validated results over {K} folds (with POS features):")
print(f"F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"Precision: {mean_precision:.4f}")
print(f"Recall: {mean_recall:.4f}")
print(f"Validation loss (mean): {mean_loss:.4f}")


=== Fold 1/5 ===


Some weights of RobertaForTokenClassificationWithAllPOS were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'fgpos_embedding.weight', 'pos_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5567,0.308222,0.841088,0.578613,0.426668,0.898635
2,0.3165,0.268383,0.886539,0.648674,0.51972,0.86274
3,0.2471,0.25965,0.892862,0.663907,0.536153,0.871587


Saved model for fold 1 to results_with_all_pos_reduced_dim_compiled\fold_1

=== Fold 2/5 ===


Some weights of RobertaForTokenClassificationWithAllPOS were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'fgpos_embedding.weight', 'pos_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5564,0.311768,0.855953,0.593972,0.449941,0.873631
2,0.321,0.277898,0.890828,0.656017,0.529044,0.863185
3,0.2573,0.266277,0.883177,0.648224,0.508935,0.892484


Saved model for fold 2 to results_with_all_pos_reduced_dim_compiled\fold_2

=== Fold 3/5 ===


Some weights of RobertaForTokenClassificationWithAllPOS were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'fgpos_embedding.weight', 'pos_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5456,0.310416,0.845073,0.584491,0.43347,0.897008
2,0.3139,0.271006,0.870835,0.631424,0.483214,0.910776
3,0.2516,0.267659,0.888331,0.659274,0.52378,0.88933


Saved model for fold 3 to results_with_all_pos_reduced_dim_compiled\fold_3

=== Fold 4/5 ===


Some weights of RobertaForTokenClassificationWithAllPOS were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'fgpos_embedding.weight', 'pos_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5558,0.316193,0.877796,0.612841,0.484231,0.834477
2,0.3206,0.275644,0.872769,0.621772,0.474315,0.902278
3,0.2528,0.263535,0.889568,0.65098,0.513646,0.888553


Saved model for fold 4 to results_with_all_pos_reduced_dim_compiled\fold_4

=== Fold 5/5 ===


Some weights of RobertaForTokenClassificationWithAllPOS were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'fgpos_embedding.weight', 'pos_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5509,0.311197,0.870252,0.611466,0.476541,0.852972
2,0.3207,0.268796,0.877614,0.636606,0.493803,0.895607
3,0.2514,0.263704,0.887325,0.654332,0.517019,0.890956


Saved model for fold 5 to results_with_all_pos_reduced_dim_compiled\fold_5

Cross-validated results over 5 folds (with POS features):
F1: 0.6553 ± 0.0056
Precision: 0.5199
Recall: 0.8866
Validation loss (mean): 0.2642


In [18]:
# --- ENSEMBLE EVALUATION ---

# Load all fold models
model_dirs = sorted(glob.glob(os.path.join("results_with_all_pos_reduced_dim_compiled", "fold_*")))
models = []
for d in model_dirs:
    has_weights = any(
        os.path.exists(os.path.join(d, weights_file))
        for weights_file in ("pytorch_model.bin", "model.safetensors")
    )
    if not has_weights:
        print(f"Warning: Model not found in {d}, skipping.")
        continue
    # The embedding sizes must be passed again here, with the values used for training.
    model = RobertaForTokenClassificationWithAllPOS.from_pretrained(
        d,
        pos_vocab_size=pos_vocab_size,
        fgpos_vocab_size=fgpos_vocab_size,
        pos_embedding_dim=50,
        fgpos_embedding_dim=20
    )
    models.append(model)

print(f"Loaded {len(models)} models for ensemble prediction.")

# Everything below needs at least one model; fail with a clear message instead
# of an unrelated error further down.
if not models:
    raise RuntimeError(
        "No trained fold models were found; run the cross-validation training cell first."
    )

# Use a standard trainer for prediction as we don't need the weighted loss.
# The model never returns a loss, so it must not be handed labels by a plain
# Trainer; the labels are dropped explicitly in `prediction_dataset` below.
args = TrainingArguments(
    output_dir="./inference_tmp",
    per_device_eval_batch_size=8,
    remove_unused_columns=True
)
# We only need one trainer, and we'll swap the model inside it
predictor = Trainer(model=models[0], args=args, data_collator=data_collator)

# Create a version of the test dataset without labels for prediction
prediction_dataset = MetaphorDatasetWithAllFeatures([
    {k: v for k, v in item.items() if k != 'labels'}
    for item in test_tokenized_data
])

# Get predictions
per_model_logits = []
for model in models:
    predictor.model = model.to(predictor.args.device) # Move model to correct device
    pred_out = predictor.predict(prediction_dataset) # Use the dataset without labels
    per_model_logits.append(pred_out.predictions)

# Stack along a new leading "model" axis.
per_model_logits = np.stack(per_model_logits, axis=0)

# --- Analysis by Adjusting Majority Vote Threshold ---
n_models = per_model_logits.shape[0]
per_model_preds = np.argmax(per_model_logits, axis=-1)

labels = np.stack([item['labels'].numpy() for item in test_dataset])
mask = labels != -100
y_true = labels[mask]

# Number of models voting for class 1 at every labelled position. This does not
# depend on the threshold, so it is computed once.
votes_for_positive = per_model_preds.sum(axis=0)[mask]

# NOTE(review): the vote thresholds are compared directly on the test set; a
# threshold picked from this table is tuned on test data.
print("\n--- Evaluating Ensemble Performance by Adjusting Vote Count ---")
print("Required Votes | Precision | Recall    | F1-Score  | Accuracy")
print("---------------------------------------------------------------")

# From a strict majority up to a unanimous vote.
for required_votes in range(n_models // 2 + 1, n_models + 1):
    y_pred_at_threshold = (votes_for_positive >= required_votes).astype(int)

    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred_at_threshold, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(y_true, y_pred_at_threshold)

    print(f"{required_votes} of {n_models}      | {prec:<9.4f} | {rec:<9.4f} | {f1:<9.4f} | {acc:<9.4f}")

Loaded 5 models for ensemble prediction.



--- Evaluating Ensemble Performance by Adjusting Vote Count ---
Required Votes | Precision | Recall    | F1-Score  | Accuracy
---------------------------------------------------------------
3 of 5      | 0.4412    | 0.8004    | 0.5689    | 0.7823   
4 of 5      | 0.4728    | 0.7539    | 0.5811    | 0.8050   
5 of 5      | 0.5169    | 0.6936    | 0.5924    | 0.8288   
