In [1]:
import os
import json
import torch
import numpy as np
from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch.nn import CrossEntropyLoss
from transformers import DataCollatorForTokenClassification


from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification, # Added this import
    TrainingArguments,             # Added this import
    Trainer,                        # Added this import
)

from torch.utils.data import Dataset
import nltk


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory containing the VUA dataset files.
data_dir = "vua_dataset"
# Hugging Face checkpoint name used for the tokenizer and the token-classification model.
model_name = "roberta-base"

In [3]:
# Make sure the NLTK 'punkt' tokenizer data is available.
# This block should be executed successfully before nltk.word_tokenize is used.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:  # raised by nltk.data.find when the resource is not installed
    print("NLTK 'punkt' tokenizer data not found. Downloading...")
    # nltk.download() reports failure through its boolean return value instead
    # of raising, so only announce success when it actually returned True.
    if nltk.download('punkt', quiet=True):
        print("NLTK 'punkt' tokenizer data downloaded.")
    else:
        print("NLTK 'punkt' tokenizer data could not be downloaded; nltk.word_tokenize may fail.")
except Exception as e:
    # Best-effort: report anything unexpected from the lookup and keep going.
    print(f"An unexpected error occurred during NLTK data check/download: {e}")


In [4]:
# NOTE(review): presumably needed because newer NLTK releases load 'punkt_tab'
# inside nltk.word_tokenize — confirm against the installed NLTK version.
# Runs unconditionally; the recorded output shows it does nothing when the
# package is already up to date.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aviad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# RoBERTa checkpoints need add_prefix_space=True when they are fed pre-split
# words; every other checkpoint just gets the plain fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    **({"add_prefix_space": True} if "roberta" in model_name.lower() else {}),
)


In [6]:
def load_and_process_data(json_path, dataset_name=""):
    """
    Load word-level annotations from a JSONL file and turn them into
    sentence-level examples suitable for MetaphorDataset.

    Every non-blank line must be a JSON object with the keys "sentence" (str),
    "w_index" (int position of the annotated word) and "label" (int). Records
    are grouped by their exact sentence text, the sentence is tokenized with
    nltk.word_tokenize, and each annotated position receives its label;
    positions without an annotation keep label 0.

    A sentence for which any "w_index" falls outside the NLTK token list is
    reported on stdout and dropped from the result.

    NOTE(review): grouping is keyed on the sentence string, so separate
    occurrences of the same text are merged and a later record overwrites an
    earlier one at the same index — confirm this is acceptable for the corpus.
    NOTE(review): "w_index" is interpreted as an index into the NLTK tokens;
    verify that the dataset's indexing uses the same tokenization.

    Args:
        json_path (str): The path to the JSONL data file.
        dataset_name (str): A name for the dataset (e.g., "TRAIN", "TEST") for logging.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              "sentence_words" (list of str) and "labels" (list of int).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "sentence", "w_index" or "label".
    """
    # Group records by sentence while streaming the file.
    sentence_groups = defaultdict(list)
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON documents; skip them instead of crashing.
            if not line.strip():
                continue
            entry = json.loads(line)
            sentence_groups[entry["sentence"]].append(entry)

    processed_data = []
    for sentence, entries in sentence_groups.items():
        words_from_sentence = nltk.word_tokenize(sentence)
        current_labels = [0] * len(words_from_sentence)

        w_index_mismatch_found = False
        # Sorted so that any warnings are printed in word order.
        for entry in sorted(entries, key=lambda x: x["w_index"]):
            word_index = entry["w_index"]
            label_value = entry["label"]
            if 0 <= word_index < len(words_from_sentence):
                current_labels[word_index] = label_value
            else:
                w_index_mismatch_found = True
                print(f"CRITICAL WARNING ({dataset_name}): w_index {word_index} out of bounds for NLTK tokenized sentence (length {len(words_from_sentence)}): '{sentence}'")
                print(f"NLTK Tokens: {words_from_sentence}")
        if w_index_mismatch_found:
            print(f"Skipping problematic {dataset_name} sentence due to w_index mismatch: '{sentence}'")
            continue
        processed_data.append({"sentence_words": words_from_sentence, "labels": current_labels})

    return processed_data

In [7]:
class MetaphorDataset(Dataset):
    """
    Torch dataset that tokenizes pre-split sentences on access and aligns
    word-level labels with the resulting sub-word tokens.

    Args:
        data (list[dict]): Examples with "sentence_words" (list of str) and
            "labels" (list of int, one per word).
        tokenizer: Optional tokenizer callable. It must accept the keyword
            arguments used in __getitem__ and return a mapping that also
            offers word_ids(batch_index=...). When None (the default), the
            module-level `tokenizer` is looked up at access time.
        max_length (int): Fixed length every example is padded/truncated to.
            Words that do not fit are dropped by the tokenizer's truncation.
    """

    def __init__(self, data, tokenizer=None, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict of 1-D tensors (tokenizer fields plus "labels") for example `idx`."""
        entry = self.data[idx]
        sentence_words = entry["sentence_words"]  # list of words
        word_labels = entry["labels"]  # one label per original word

        # Use the injected tokenizer if there is one, otherwise the global one.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        # is_split_into_words=True tells the tokenizer the input is already
        # word-split. Tensors are built afterwards because word_ids() is read
        # from the raw encoding object.
        raw_encoding = active_tokenizer(
            sentence_words,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            is_split_into_words=True,
        )

        # batch_index=0 since one example is processed at a time.
        word_ids = raw_encoding.word_ids(batch_index=0)

        # Only the first sub-word token of each word carries the word's label.
        # Positions without a word id and continuation sub-words get -100.
        labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                labels.append(-100)
            else:
                labels.append(word_labels[word_idx])
            previous_word_idx = word_idx

        # Ensure the labels list has the same length as input_ids
        assert len(labels) == len(raw_encoding["input_ids"]), "Labels and input_ids length mismatch!"

        # Each field is already a flat list for a single example, so no
        # squeeze is needed (a squeeze would collapse a length-1 sequence).
        encoding = {k: torch.tensor(v) for k, v in raw_encoding.items()}
        encoding["labels"] = torch.tensor(labels, dtype=torch.long)

        return encoding

In [8]:
# --- Load and process TRAIN data using the function ---
# The path is built from the configured data_dir rather than repeating the literal.
train_json_path = os.path.join(data_dir, "vua20_metaphor_train.json")
processed_train_data = load_and_process_data(train_json_path, dataset_name="TRAIN")

# Tokenization happens lazily, inside MetaphorDataset.__getitem__.
train_dataset = MetaphorDataset(processed_train_data)
print(f"Number of training samples: {len(train_dataset)}")

Number of training samples: 10909


In [9]:
# --- Load and process TEST data using the function ---
# The path is built from the configured data_dir rather than repeating the literal.
test_json_path = os.path.join(data_dir, "vua20_metaphor_test.json")
processed_test_data = load_and_process_data(test_json_path, dataset_name="TEST")
test_dataset = MetaphorDataset(processed_test_data)
print(f"Number of test samples: {len(test_dataset)}")

Number of test samples: 3601


In [10]:
# --- Define compute_metrics function ---
def compute_metrics(p):
    """
    Compute token-level accuracy, precision, recall and F1 for the positive class.

    Args:
        p: A (predictions, labels) pair. `predictions` holds logits with the
            class dimension on axis 2; `labels` holds the gold label ids,
            where -100 marks positions to be ignored.

    Returns:
        dict: {'accuracy', 'f1', 'precision', 'recall'}, computed over the
        non-ignored positions with class 1 as the positive class.
    """
    predictions, labels = p
    # predictions are logits, take argmax to get predicted class
    predicted_ids = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Drop ignored positions with one boolean mask. This also flattens both
    # arrays in row-major order, the same order a nested loop would produce.
    keep = labels != -100
    true_labels = labels[keep]
    predicted_labels = predicted_ids[keep]

    # average='binary' with pos_label=1 scores class 1 (figurative/metaphorical).
    # zero_division=0 returns 0 without a warning when a score is undefined,
    # e.g. when no positive is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary', pos_label=1, zero_division=0
    )
    accuracy = accuracy_score(true_labels, predicted_labels)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [11]:
# Token-classification model with two labels: 0 = literal, 1 = figurative.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=4,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,  # regularization
    warmup_ratio=0.1,  # warmup over the first 10%
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest eval F1 (F1 is a score, so greater is better).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset, # Now correctly defined
#     # You might want to add a data_collator and compute_metrics here later
#     # data_collator=DataCollatorForTokenClassification(tokenizer),
#     compute_metrics=compute_metrics,
# )

In [13]:
def get_class_weights(train_dataset, num_labels=2):
    """
    Derive inverse-frequency class weights from a dataset's token labels.

    Args:
        train_dataset: Iterable of examples, each a mapping whose 'labels'
            entry is a 1-D sequence of label ids. -100 marks ignored positions.
        num_labels (int): Number of classes to produce weights for (ids
            0 .. num_labels - 1). Defaults to 2.

    Returns:
        tuple: (weights, counts, total) where `weights` is a float tensor with
        total / count for each class, `counts` is a Counter of the non-ignored
        label ids, and `total` is the number of non-ignored labels.

    Raises:
        ValueError: If the dataset is empty or a class in range(num_labels)
            never occurs, since its weight would be undefined.
    """
    labels_flat = np.concatenate([np.asarray(example['labels']) for example in train_dataset])
    labels_filtered = labels_flat[labels_flat != -100]
    # tolist() yields plain Python ints, so the Counter keys are ordinary ints.
    counts = Counter(labels_filtered.tolist())
    total = sum(counts.values())

    # Fail with a clear message instead of a ZeroDivisionError further down.
    missing = [cls for cls in range(num_labels) if counts[cls] == 0]
    if missing:
        raise ValueError(
            f"Cannot compute class weights: no labelled tokens for class(es) {missing}"
        )

    weights = torch.tensor(
        [total / counts[cls] for cls in range(num_labels)], dtype=torch.float
    )
    return weights, counts, total

In [23]:
class_weights, counts, _ = get_class_weights(train_dataset)


class WeightedLossTrainer(Trainer):
    """
    Trainer that replaces the default loss with a class-weighted cross-entropy.

    Args:
        *args, **kwargs: Forwarded unchanged to Trainer.
        class_weights (torch.Tensor | None): Per-class weights handed to
            CrossEntropyLoss; None means unweighted cross-entropy.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on `inputs` and return the weighted token-level loss.

        "labels" is popped from `inputs` so the model does not compute its own
        loss. Positions labelled -100 do not contribute. Extra keyword
        arguments are accepted and ignored.

        Returns:
            The loss tensor, or (loss, outputs) when return_outputs is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Take the class count from the logits rather than model.config, which
        # is not reachable if `model` arrives wrapped (e.g. torch.nn.DataParallel).
        num_classes = logits.size(-1)

        weights = self.class_weights.to(logits.device) if self.class_weights is not None else None
        # ignore_index=-100 drops the masked positions inside the loss, which
        # is equivalent to filtering them out by hand beforehand.
        loss_fct = CrossEntropyLoss(weight=weights, ignore_index=-100)
        # reshape (not view) so non-contiguous tensors are handled as well.
        loss = loss_fct(logits.reshape(-1, num_classes), labels.reshape(-1))

        return (loss, outputs) if return_outputs else loss

In [15]:
# Batch collator for token classification, built around the shared tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): the test split is used as the evaluation set here and
# training_args enables load_best_model_at_end, so checkpoint selection is
# driven by test-set F1 — confirm this is intended.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    class_weights=class_weights,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [16]:
# Show the label distribution and the class weights derived from it.
print(f"Label counts: {counts}")
print(f"Class weights: {class_weights}")

Label counts: Counter({0: 159865, 1: 19122})
Class weights: tensor([1.1196, 9.3603])


In [31]:
# Fine-tune the model; training_args requests an evaluation and a checkpoint
# at the end of every epoch.
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3038,0.349888,0.844719,0.429014,0.288502,0.83635
2,0.2399,0.358403,0.850698,0.449758,0.302688,0.874811
3,0.2135,0.352108,0.863691,0.464895,0.320095,0.848919


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=513, training_loss=0.29431310174060843, metrics={'train_runtime': 1036.5461, 'train_samples_per_second': 31.573, 'train_steps_per_second': 0.495, 'total_flos': 2137864739424768.0, 'train_loss': 0.29431310174060843, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's eval_dataset; the returned dict holds the eval_* metrics.
trainer.evaluate()

  return forward_call(*args, **kwargs)


{'eval_loss': 0.3521082103252411,
 'eval_accuracy': 0.8636905596857904,
 'eval_f1': 0.4648953744493392,
 'eval_precision': 0.3200947867298578,
 'eval_recall': 0.8489190548014077,
 'eval_runtime': 39.0851,
 'eval_samples_per_second': 92.132,
 'eval_steps_per_second': 23.052,
 'epoch': 3.0}

In [17]:
# --- Load and process TRAIN data using the function ---
# NOTE(review): this repeats the earlier TRAIN loading step with the same file
# and arguments; it looks redundant unless processed_train_data was changed in
# between — confirm before removing.
# The path is built from the configured data_dir rather than repeating the literal.
train_json_path = os.path.join(data_dir, "vua20_metaphor_train.json")
processed_train_data = load_and_process_data(train_json_path, dataset_name="TRAIN")

In [18]:


# Hold out 10% of the processed training sentences for validation; the fixed
# random_state keeps the split reproducible.
train_data_split, val_data_split = train_test_split(
    processed_train_data, test_size=0.1, random_state=42
)

train_dataset, val_dataset = (
    MetaphorDataset(train_data_split),
    MetaphorDataset(val_data_split),
)
print(
    f"Number of training samples: {len(train_dataset)}",
    f"Number of validation samples: {len(val_dataset)}",
    sep="\n",
)

Number of training samples: 9818
Number of validation samples: 1091


In [19]:
K = 5  # number of folds
# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Per-fold metric accumulators, one entry appended per fold.
fold_f1s, fold_precisions, fold_recalls, fold_losses = [], [], [], []
# Trainer object of every fold.
trainers = []

In [None]:
# Start from empty accumulators so that re-running this cell (for example after
# an interrupted run) does not mix in results from a previous execution.
fold_f1s.clear()
fold_precisions.clear()
fold_recalls.clear()
fold_losses.clear()
trainers.clear()

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(processed_train_data)):
    print(f"\n=== Fold {fold_idx + 1}/{K} ===")
    # Per-fold output directory so checkpoints of different folds do not overwrite each other.
    idx_folder = os.path.join('results', f'fold_{fold_idx + 1}')
    os.makedirs(idx_folder, exist_ok=True)
    # Split the processed sentences by the fold's index arrays.
    train_split = [processed_train_data[i] for i in train_idx]
    val_split = [processed_train_data[i] for i in val_idx]

    # Build datasets (the dataset wrapper handles tokenization/alignment on access)
    train_dataset = MetaphorDataset(train_split)
    val_dataset = MetaphorDataset(val_split)

    # Recompute class weights from this fold's train data
    class_weights, _, _ = get_class_weights(train_dataset)

    # Fresh tokenizer and model per fold
    if "roberta" in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Two labels (0 = literal, 1 = figurative), stated explicitly so the fold
    # does not depend on a `model` object left over from an earlier cell.
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

    # Training arguments: per-fold output_dir and seed
    training_args = TrainingArguments(
        output_dir=idx_folder,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        logging_steps=50,
        seed=42 + fold_idx,
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        class_weights=class_weights,
    )

    trainer.train()
    metrics = trainer.evaluate()

    fold_f1s.append(metrics["eval_f1"])
    fold_precisions.append(metrics["eval_precision"])
    fold_recalls.append(metrics["eval_recall"])
    fold_losses.append(metrics["eval_loss"])

    # NOTE(review): every stored trainer keeps its model alive; with K folds
    # this may use a lot of memory — confirm the trainers are needed later.
    trainers.append(trainer)

# Aggregate results
mean_f1 = np.mean(fold_f1s)
std_f1 = np.std(fold_f1s)
mean_precision = np.mean(fold_precisions)
mean_recall = np.mean(fold_recalls)
mean_loss = np.mean(fold_losses)

print(f"\nCross-validated results over {K} folds:")
print(f"F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"Precision: {mean_precision:.4f}")
print(f"Recall: {mean_recall:.4f}")
print(f"Validation loss (mean): {mean_loss:.4f}")


=== Fold 1/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2869,0.272813,0.857669,0.58324,0.426638,0.921474
2,0.2354,0.240904,0.879752,0.623365,0.471193,0.920712
3,0.1812,0.252221,0.91609,0.689501,0.574526,0.862008


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



=== Fold 2/5 ===


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2808,0.246109,0.888232,0.63345,0.488237,0.90161


  return forward_call(*args, **kwargs)
