In [23]:
import os
import json
import torch
import numpy as np
import glob

from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch.nn import CrossEntropyLoss
from transformers import DataCollatorForTokenClassification


from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification, # Added this import
    TrainingArguments,             # Added this import
    Trainer,                        # Added this import
)

from torch.utils.data import Dataset
import nltk


In [3]:
# Dataset directory; later cells read the VUA JSONL files from a folder of this name.
data_dir = "vua_dataset"
# Hugging Face checkpoint name used to load both the tokenizer and the token-classification model.
model_name = "roberta-base"

In [4]:
# Make sure NLTK's 'punkt' tokenizer data is available locally.
# This block should be executed successfully before nltk.word_tokenize is used.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError: # LookupError is what the lookup raises when the resource is not found
    print("NLTK 'punkt' tokenizer data not found. Downloading...")
    nltk.download('punkt', quiet=True) # quiet=True suppresses the download progress output
    print("NLTK 'punkt' tokenizer data downloaded.")
except Exception as e:
    # Best-effort: any other error raised by the lookup is reported and the notebook continues.
    # Note: an exception raised by nltk.download(...) above is NOT caught here, because it
    # originates inside the LookupError handler rather than inside the try body.
    print(f"An unexpected error occurred during NLTK data check/download: {e}")


In [5]:
# Also fetch the 'punkt_tab' resource (unconditionally; NLTK reports it as up-to-date if present).
# NOTE(review): presumably needed by nltk.word_tokenize on newer NLTK releases — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aviad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Load the fast tokenizer for `model_name`. RoBERTa-style checkpoints additionally get
# add_prefix_space=True, which is required when feeding pre-tokenized (word-split) input.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    **({"add_prefix_space": True} if "roberta" in model_name.lower() else {}),
)


In [7]:
def load_and_process_data(json_path, dataset_name=""):
    """
    Load per-word annotations from a JSONL file and regroup them into per-sentence examples.

    Every non-blank line must be a JSON object with the keys "sentence", "w_index"
    and "label". Records that share the same sentence text are merged into one
    example: the sentence is tokenized with nltk.word_tokenize, every token starts
    with label 0, and each record overwrites the label at its "w_index".
    Sentences containing a record whose "w_index" falls outside the NLTK token
    list are reported on stdout and dropped entirely.

    NOTE(review): "w_index" is interpreted as an index into NLTK's tokenization.
    If the dataset indexes a different word split (e.g. whitespace), in-range
    indices could silently land on the wrong token — confirm against the data.

    Args:
        json_path (str): The path to the JSONL data file.
        dataset_name (str): A name for the dataset (e.g., "TRAIN", "TEST") used in log messages.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              "sentence_words" (list of str) and "labels" (list of int),
              in order of first appearance of each sentence in the file.
    """
    # Group records by sentence text while streaming the file (no intermediate list needed).
    sentence_groups = defaultdict(list)
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. stray trailing newlines), which json.loads would reject.
            if not line.strip():
                continue
            entry = json.loads(line)
            sentence_groups[entry["sentence"]].append(entry)

    processed_data = []
    for sentence, entries in sentence_groups.items():
        words_from_sentence = nltk.word_tokenize(sentence)
        n_words = len(words_from_sentence)
        # Words without an annotation record keep the default label 0.
        current_labels = [0] * n_words

        w_index_mismatch_found = False
        for entry in sorted(entries, key=lambda x: x["w_index"]):
            word_index = entry["w_index"]
            if 0 <= word_index < n_words:
                current_labels[word_index] = entry["label"]
            else:
                w_index_mismatch_found = True
                print(f"CRITICAL WARNING ({dataset_name}): w_index {word_index} out of bounds for NLTK tokenized sentence (length {len(words_from_sentence)}): '{sentence}'")
                print(f"NLTK Tokens: {words_from_sentence}")

        if w_index_mismatch_found:
            # One bad index makes the whole alignment untrustworthy, so drop the sentence.
            print(f"Skipping problematic {dataset_name} sentence due to w_index mismatch: '{sentence}'")
            continue
        processed_data.append({"sentence_words": words_from_sentence, "labels": current_labels})

    return processed_data

In [8]:
class MetaphorDataset(Dataset):
    """
    Dataset that tokenizes word-split sentences on access and aligns word labels to sub-word tokens.

    Each item of `data` is a dict with "sentence_words" (list of str) and
    "labels" (one int per word). `__getitem__` returns a dict of 1-D tensors:
    every field produced by the tokenizer plus "labels", where only the first
    sub-word token of each word carries the word's label and all other
    positions (special tokens, padding, continuation sub-words) are -100.

    Args:
        data: list-like of example dicts as described above.
        tokenizer: optional tokenizer to use. When None (default), the
            module-level `tokenizer` is looked up at access time, which is the
            original behaviour. The returned encoding must provide `word_ids()`.
        max_length (int): fixed length every example is padded/truncated to.
    """

    def __init__(self, data, tokenizer=None, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        sentence_words = entry["sentence_words"]  # list of words
        word_labels = entry["labels"]  # one label per original word

        # Fall back to the notebook-level tokenizer when none was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        # is_split_into_words=True tells the tokenizer the input is already word-split.
        # No return_tensors here: the plain encoding object is needed for word_ids().
        raw_encoding = active_tokenizer(
            sentence_words,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            is_split_into_words=True,
        )

        # One entry per token: the index of the source word, or None for special/pad tokens.
        word_ids = raw_encoding.word_ids(batch_index=0)

        # Align word labels to tokens: label only the first sub-word of each word.
        labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                labels.append(-100)
            else:
                labels.append(word_labels[word_idx])
            previous_word_idx = word_idx

        # Each encoding field is a flat list for a single example, so it maps directly
        # to a 1-D tensor (the former .squeeze(0) was a no-op for these shapes).
        encoding = {k: torch.tensor(v) for k, v in raw_encoding.items()}
        encoding["labels"] = torch.tensor(labels, dtype=torch.long)

        # Ensure the labels list has the same length as input_ids
        assert len(labels) == len(encoding["input_ids"]), "Labels and input_ids length mismatch!"

        return encoding

In [9]:
# --- Load and process TRAIN data using the function ---
# Build the path from the `data_dir` config constant instead of repeating the folder name.
train_json_path = os.path.join(data_dir, "vua20_metaphor_train.json")
processed_train_data = load_and_process_data(train_json_path, dataset_name="TRAIN")

# Wrap the per-sentence examples; tokenization happens lazily on item access.
train_dataset = MetaphorDataset(processed_train_data)
print(f"Number of training samples: {len(train_dataset)}")

Number of training samples: 10909


In [10]:
# --- Load and process TEST data using the function ---
# Build the path from the `data_dir` config constant instead of repeating the folder name.
test_json_path = os.path.join(data_dir, "vua20_metaphor_test.json")
processed_test_data = load_and_process_data(test_json_path, dataset_name="TEST")
test_dataset = MetaphorDataset(processed_test_data)
print(f"Number of test samples: {len(test_dataset)}")

Number of test samples: 3601


In [11]:
# --- Define compute_metrics function ---
def compute_metrics(p):
    """
    Compute token-level accuracy, precision, recall and F1 for the positive class (label 1).

    Args:
        p: a (predictions, labels) pair. `predictions` holds the logits with the
           class scores on axis 2; `labels` holds the gold label per token, with
           -100 marking positions that must be ignored.

    Returns:
        dict: {"accuracy", "f1", "precision", "recall"} computed over all
              positions whose label is not -100.
    """
    logits, labels = p
    # Predicted class per token = index of the largest logit.
    predictions = np.argmax(logits, axis=2)
    labels = np.asarray(labels)

    # Drop ignored positions (special tokens, padding, continuation sub-words) in one
    # vectorized step; boolean indexing flattens in the same row-major order as a nested loop.
    keep = labels != -100
    true_labels = labels[keep]
    predicted_labels = predictions[keep]

    # Binary metrics with class 1 (figurative/metaphorical) as the positive class.
    # zero_division=0 keeps the value at 0 without a warning when class 1 is never
    # predicted or never present, matching the threshold-analysis cell.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary', pos_label=1, zero_division=0
    )
    accuracy = accuracy_score(true_labels, predicted_labels)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [13]:
# Pretrained encoder with a freshly initialised 2-way token-classification head
# (assuming 0 for literal, 1 for figurative).
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,                 # regularization
    warmup_ratio=0.1,                  # linear warmup for first 10%
    per_device_train_batch_size=64,
    per_device_eval_batch_size=4,
    # --- evaluation / checkpointing: both happen at the end of every epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best eval F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",   # key produced by compute_metrics
    greater_is_better=True,            # F1: higher is better
    # --- output locations / logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset, # Now correctly defined
#     # You might want to add a data_collator and compute_metrics here later
#     # data_collator=DataCollatorForTokenClassification(tokenizer),
#     compute_metrics=compute_metrics,
# )

In [14]:
def get_class_weights(train_dataset):
    """
    Compute inverse-frequency weights for the two classes from a dataset's token labels.

    Iterates over `train_dataset`, collects every example's "labels", ignores
    positions equal to -100 and counts the remaining labels.

    Args:
        train_dataset: iterable of examples, each exposing "labels" as an array-like
            (e.g. a CPU tensor) of class ids with -100 for ignored positions.

    Returns:
        tuple: (weights, counts, total) where `weights` is a float tensor
               [total / count_0, total / count_1], `counts` is a Counter mapping
               class id (plain int) to its frequency, and `total` is the number
               of non-ignored labels.

    Raises:
        ValueError: if the dataset is empty, or if class 0 or class 1 does not
            occur at all (its weight would require a division by zero).
    """
    label_chunks = [np.asarray(example["labels"]).reshape(-1) for example in train_dataset]
    if not label_chunks:
        raise ValueError("Cannot compute class weights from an empty dataset.")

    labels_flat = np.concatenate(label_chunks)
    labels_filtered = labels_flat[labels_flat != -100]
    # .tolist() yields plain Python ints, so the Counter prints and compares cleanly.
    counts = Counter(labels_filtered.tolist())
    total = sum(counts.values())

    # Counter returns 0 for a missing key; fail with a clear message instead of
    # an opaque ZeroDivisionError below.
    missing = [cls for cls in (0, 1) if counts[cls] == 0]
    if missing:
        raise ValueError(
            f"Cannot compute class weights: no labelled tokens for class(es) {missing}."
        )

    return torch.tensor(
        [total / counts[0], total / counts[1]], dtype=torch.float
    ), counts, total

In [15]:
# Inverse-frequency class weights from the current training set; `counts` is printed in a later cell.
class_weights, counts, _ = get_class_weights(train_dataset)


class WeightedLossTrainer(Trainer):
    """
    Trainer whose loss is a class-weighted cross-entropy over the labelled tokens.

    Args:
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
        class_weights: optional 1-D tensor with one weight per class; None
            means an unweighted cross-entropy.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on `inputs` and return the weighted cross-entropy loss.

        "labels" is removed from `inputs` before the forward pass, so the model
        does not compute its own loss. Positions labelled -100 are excluded.
        Extra keyword arguments are accepted and ignored.

        Returns:
            The loss tensor, or (loss, outputs) when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Flatten to (tokens, classes) / (tokens,). The class dimension is taken from the
        # logits themselves rather than model.config, so this also works when `model` is a
        # wrapper (e.g. nn.DataParallel) that does not expose `.config`. reshape() is used
        # instead of view() so non-contiguous tensors are handled too.
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_labels = labels.reshape(-1)

        # Keep only positions that carry a real label.
        active_loss = flat_labels != -100
        active_logits = flat_logits[active_loss]
        active_labels = flat_labels[active_loss]

        weights = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(active_logits, active_labels)

        return (loss, outputs) if return_outputs else loss

In [16]:
# Collator that assembles individual examples into batches for token classification.
# (MetaphorDataset already pads every example to a fixed length.)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): eval_dataset is the TEST split, and training_args has
# load_best_model_at_end=True with metric_for_best_model="eval_f1", so the checkpoint
# is selected on test data — the reported test scores are optimistic. Consider
# evaluating on a held-out validation split instead.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    class_weights=class_weights
)

In [16]:
# Show the label distribution over non-ignored tokens and the resulting
# inverse-frequency weights passed to the weighted loss.
print("Label counts:", counts)
print("Class weights:", class_weights)

Label counts: Counter({0: 159865, 1: 19122})
Class weights: tensor([1.1196, 9.3603])


In [17]:
# Fine-tune the model; per training_args, evaluation and checkpointing run at the end of every epoch.
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3099,0.343671,0.855976,0.439394,0.301574,0.809201
2,0.2386,0.347952,0.860254,0.457676,0.313771,0.8454
3,0.215,0.355157,0.864515,0.466257,0.321459,0.848416


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=513, training_loss=0.29126956774244994, metrics={'train_runtime': 516.3322, 'train_samples_per_second': 63.384, 'train_steps_per_second': 0.994, 'total_flos': 2137864739424768.0, 'train_loss': 0.29126956774244994, 'epoch': 3.0})

In [18]:
# 7. Evaluate
# Runs on the trainer's eval_dataset (the test split) and returns the compute_metrics
# values with an "eval_" prefix, as shown in the output below.
trainer.evaluate()

  return forward_call(*args, **kwargs)


{'eval_loss': 0.35515719652175903,
 'eval_accuracy': 0.8645146584373685,
 'eval_f1': 0.46625682116460593,
 'eval_precision': 0.321459186589199,
 'eval_recall': 0.8484162895927602,
 'eval_runtime': 23.2949,
 'eval_samples_per_second': 154.583,
 'eval_steps_per_second': 38.678,
 'epoch': 3.0}

In [19]:
# --- Load and process TRAIN data using the function ---
# Re-read the raw training examples for the hold-out split / cross-validation below.
# The path is built from the `data_dir` config constant instead of repeating the folder name.
train_json_path = os.path.join(data_dir, "vua20_metaphor_train.json")
processed_train_data = load_and_process_data(train_json_path, dataset_name="TRAIN")

In [20]:


# Hold out part of the training sentences as a validation set.
# The split is done on whole sentences (items of processed_train_data).
train_data_split, val_data_split = train_test_split(
    processed_train_data,
    test_size=0.1, # Using 10% of the training data for validation
    random_state=42 # for reproducibility
)

# Note: this rebinds `train_dataset` (previously the full training set) to the 90% split.
train_dataset = MetaphorDataset(train_data_split)
val_dataset = MetaphorDataset(val_data_split)
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")

Number of training samples: 9818
Number of validation samples: 1091


In [None]:
# Cross-validation configuration: shuffled K-fold with a fixed seed so folds are reproducible.
K = 5  # number of folds
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Per-fold validation metrics, appended to by the cross-validation loop.
fold_f1s = []
fold_precisions = []
fold_recalls = []
fold_losses = []

In [None]:
# K-fold cross-validation over the training sentences: train a fresh model per fold,
# evaluate it on the fold's validation part, save it, and aggregate the metrics.

# Reset the per-fold accumulators so that re-running this cell does not mix results
# from a previous run into the averages.
fold_f1s, fold_precisions, fold_recalls, fold_losses = [], [], [], []

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(processed_train_data)):
    print(f"\n=== Fold {fold_idx + 1}/{K} ===")
    # Per-fold output directory (also where the final fold model is saved).
    idx_folder = os.path.join('results', f'fold_{fold_idx + 1}')
    os.makedirs(idx_folder, exist_ok=True)

    # Split the raw per-sentence examples by the fold indices.
    train_split = [processed_train_data[i] for i in train_idx]
    val_split = [processed_train_data[i] for i in val_idx]

    # Build datasets (the dataset wrapper handles tokenization/alignment on access).
    train_dataset = MetaphorDataset(train_split)
    val_dataset = MetaphorDataset(val_split)

    # Recompute class weights from this fold's train data only.
    class_weights, _, _ = get_class_weights(train_dataset)

    # Fresh model per fold. num_labels is stated explicitly (0 = literal, 1 = figurative,
    # as in the initial model cell) instead of being read from whatever `model` object
    # happens to be left over from an earlier cell or fold. The tokenizer is not reloaded:
    # it does not change between folds and `data_collator` already holds it.
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

    # Per-fold output_dir avoids overwriting; no intermediate checkpoints are kept.
    training_args = TrainingArguments(
        output_dir=idx_folder,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        load_best_model_at_end=False,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        logging_steps=50,
        seed=42 + fold_idx,  # different but reproducible seed per fold
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        class_weights=class_weights,
    )

    trainer.train()
    metrics = trainer.evaluate()

    fold_f1s.append(metrics["eval_f1"])
    fold_precisions.append(metrics["eval_precision"])
    fold_recalls.append(metrics["eval_recall"])
    fold_losses.append(metrics["eval_loss"])

    # Persist the final-epoch model of this fold for the ensemble cells below.
    trainer.save_model(idx_folder)
    print(f"Saved model for fold {fold_idx + 1} to {idx_folder}")

# Aggregate results
mean_f1 = np.mean(fold_f1s)
std_f1 = np.std(fold_f1s)
mean_precision = np.mean(fold_precisions)
mean_recall = np.mean(fold_recalls)
mean_loss = np.mean(fold_losses)

print(f"\nCross-validated results over {K} folds:")
print(f"F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"Precision: {mean_precision:.4f}")
print(f"Recall: {mean_recall:.4f}")
print(f"Validation loss (mean): {mean_loss:.4f}")


=== Fold 1/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3563,0.283828,0.852258,0.572926,0.416628,0.9169
2,0.2522,0.251078,0.894721,0.644269,0.507456,0.882084
3,0.2093,0.247528,0.903318,0.662835,0.531899,0.879288


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



=== Fold 2/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3406,0.261744,0.864882,0.592336,0.437584,0.916432
2,0.2593,0.234544,0.890832,0.641946,0.49481,0.913621
3,0.2213,0.22666,0.903315,0.665973,0.528599,0.899821


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



=== Fold 3/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3638,0.270139,0.888047,0.630025,0.491601,0.876956
2,0.2717,0.239525,0.900585,0.660565,0.525196,0.88995
3,0.2131,0.238742,0.908166,0.67556,0.548355,0.879608


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



=== Fold 4/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3372,0.266568,0.883512,0.611286,0.467404,0.883145
2,0.2568,0.245114,0.892808,0.636777,0.490912,0.905966
3,0.2199,0.240968,0.906867,0.663507,0.530565,0.885345


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



=== Fold 5/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3568,0.266061,0.886819,0.623936,0.48292,0.881273
2,0.2569,0.236949,0.888969,0.636667,0.488717,0.913088
3,0.2255,0.23527,0.907488,0.671815,0.539997,0.888774


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



Cross-validated results over 5 folds:
F1: 0.6679 ± 0.0050
Precision: 0.5359
Recall: 0.8866
Validation loss (mean): 0.2378


In [21]:
# Minimal loader: load all fold models into `trainers`
# Picks up every path matching results/fold_* (the directories the cross-validation
# loop saved its models to), in lexicographic order.
model_dirs = sorted(glob.glob(os.path.join("results", "fold_*")))

trainers = []
for d in model_dirs:
    # Note: this rebinds the notebook-level `model` to the fold model being loaded.
    model = AutoModelForTokenClassification.from_pretrained(d)
    # Inference-only arguments; output_dir points at a scratch folder inside the fold directory.
    args = TrainingArguments(output_dir=os.path.join(d, "inference_tmp"), per_device_eval_batch_size=8)
    # Plain Trainer: only used for predict(), so no datasets or metrics are attached.
    tr = Trainer(model=model, args=args, data_collator=data_collator)
    trainers.append(tr)

print(f"Loaded {len(trainers)} trainers.")

Loaded 5 trainers.


In [22]:
# --- Analysis of Classification Thresholds ---
# Averages the class probabilities of all fold models and reports precision / recall /
# F1 / accuracy of the "metaphor" class for a range of decision thresholds.
# NOTE(review): the sweep is evaluated on the test set; choosing a threshold from this
# table and reporting its test score would be optimistic — tune on validation data.

# 1) Get raw logits from each model in the ensemble
per_model_logits = []
if not trainers:
    print("Trainers list is empty. Please run the model loading cell first.")
else:
    for t in trainers:
        pred_out = t.predict(test_dataset)
        per_model_logits.append(pred_out.predictions)

    per_model_logits = np.stack(per_model_logits, axis=0)  # [n_models, n_samples, seq_len, n_labels]

    # 2) Convert logits to probabilities and average them
    per_model_probs = softmax(per_model_logits, axis=-1)  # Softmax over the label dimension
    ensemble_probs = per_model_probs.mean(axis=0)  # Average probs across models -> [n_samples, seq_len, n_labels]

    # 3) Get the probabilities for the "metaphor" class (class 1)
    ensemble_probs_class1 = ensemble_probs[..., 1]  # Shape: [n_samples, seq_len]

    # 4) Get the ground truth labels and the mask to ignore -100 values.
    # Each example is fetched exactly once: every test_dataset[i] access re-tokenizes
    # the sentence, so indexing it repeatedly per item is wasted work.
    # np.asarray handles both tensor and plain-list labels.
    labels = np.stack([
        np.asarray(test_dataset[i]["labels"])
        for i in range(len(test_dataset))
    ])
    mask = labels != -100
    y_true = labels[mask]

    # 5) Iterate through different thresholds and evaluate metrics
    print("--- Evaluating Ensemble Performance at Different Thresholds ---")
    print("Threshold | Precision | Recall    | F1-Score  | Accuracy")
    print("----------------------------------------------------------")

    for threshold in np.arange(0.5, 1.0, 0.05):
        # Apply threshold to the probabilities of the positive class,
        # keeping only the valid (unmasked) tokens
        y_pred_at_threshold = (ensemble_probs_class1[mask] >= threshold).astype(int)

        # Calculate metrics
        # zero_division=0 prevents warnings when a class is not predicted
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred_at_threshold, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(y_true, y_pred_at_threshold)

        print(f"{threshold:<9.2f} | {prec:<9.4f} | {rec:<9.4f} | {f1:<9.4f} | {acc:<9.4f}")


  return forward_call(*args, **kwargs)


--- Evaluating Ensemble Performance at Different Thresholds ---
Threshold | Precision | Recall    | F1-Score  | Accuracy
----------------------------------------------------------
0.50      | 0.3248    | 0.8457    | 0.4693    | 0.8666   
0.55      | 0.3341    | 0.8243    | 0.4755    | 0.8731   
0.60      | 0.3429    | 0.8009    | 0.4802    | 0.8791   
0.65      | 0.3521    | 0.7735    | 0.4840    | 0.8849   
0.70      | 0.3615    | 0.7443    | 0.4866    | 0.8905   
0.75      | 0.3694    | 0.7064    | 0.4851    | 0.8954   
0.80      | 0.3761    | 0.6599    | 0.4791    | 0.8999   
0.85      | 0.3830    | 0.6006    | 0.4677    | 0.9047   
0.90      | 0.3800    | 0.5055    | 0.4339    | 0.9080   
0.95      | 0.3562    | 0.3477    | 0.3519    | 0.9107   
