# Library Imports


In this section, we import all the necessary libraries for data preprocessing, tokenization, model training, and evaluation.


In [1]:
from transformers import BertTokenizer, BertForMaskedLM
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
import random
import nltk
from nltk.tokenize import sent_tokenize
import yaml
from typing import List, Dict, Tuple, Any

import matplotlib.pyplot as plt

import ast
import os
import time

nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Preprocessing

The abstracts are first split into sentences and tokenized with BERT’s tokenizer, adding the special tokens `[CLS]` at the start and `[SEP]` at the end of each sentence.

Since BERT has a maximum sequence length of 512 tokens, we segment longer token sequences into overlapping chunks of 512 tokens, with a stride of 462 tokens (512 - 50) to maintain some context overlap between segments.

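Each chunk is padded to the maximum length, and attention masks and token type IDs are created accordingly. The output is a dictionary of tensors ready for input into the BERT model. Note that in this notebook the `input_ids` and `attention_mask` columns are expected to be already stored in the dataset CSV; `TokenizerSegmenter.process` only converts these columns into tensors.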


In [2]:
class TokenizerSegmenter:
    """
    Turns the pre-tokenized columns of a DataFrame into BERT-compatible tensors.

    The DataFrame must provide two columns, ``input_ids`` and ``attention_mask``,
    each holding one list of integers per row. No tokenization or segmentation is
    performed here: the lists are only stacked into tensors, so every row must
    have the same length (otherwise ``torch.tensor`` raises).

    Attributes:
        df (pd.DataFrame): Source frame with the ``input_ids`` / ``attention_mask`` columns.
        tokenizer (BertTokenizer): Tokenizer kept for reference; not used by ``process``.
        max_length (int): Maximum sequence length (default 512); not used by ``process``.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: BertTokenizer, max_length: int = 512):
        """
        Initializes the TokenizerSegmenter.

        Args:
            dataframe (pd.DataFrame): Frame containing the ``input_ids`` and
                ``attention_mask`` columns (one list of ints per row).
            tokenizer (BertTokenizer): The tokenizer associated with the encoded data.
            max_length (int): Maximum length of tokenized input sequences.
        """
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def process(self) -> Dict[str, Any]:
        """
        Converts the stored token columns into ``torch.long`` tensors.

        Returns:
            Dict[str, Any]: A dictionary containing:
                - 'input_ids' (Tensor): Token IDs, one row per DataFrame row.
                - 'attention_mask' (Tensor): Attention masks, one row per DataFrame row.

        Raises:
            KeyError: If one of the two required columns is missing.
        """
        # Convert via .tolist() so rows are taken in positional order. Handing the
        # Series itself to torch.tensor makes torch index it with series[0],
        # series[1], ..., which is a *label* lookup: a filtered or shuffled index
        # then raises KeyError or silently reorders the rows.
        return {
            column: torch.tensor(self.df[column].tolist(), dtype=torch.long)
            for column in ("input_ids", "attention_mask")
        }


# Dataset Creation

In [3]:
class BERTPretrainingDataset(Dataset):
    """
    PyTorch Dataset wrapping a dictionary of already-prepared tensors for
    BERT Masked Language Modeling (MLM).

    Every value of the dictionary is indexed along its first dimension, so all
    fields (e.g. input_ids, attention_mask, labels) describe the same samples.

    Attributes:
        inputs (Dict[str, torch.Tensor]): Field name -> tensor holding all samples.
    """

    def __init__(self, inputs: Dict[str, torch.Tensor]):
        """
        Stores the tensor dictionary without copying it.

        Args:
            inputs (Dict[str, torch.Tensor]): Must contain 'input_ids' (used to
                determine the dataset size); typically also holds
                'attention_mask' and 'labels' (-100 marks non-masked tokens).
        """
        self.inputs = inputs

    def __len__(self) -> int:
        """
        Returns:
            int: Size of the first dimension of 'input_ids'.
        """
        n_samples = self.inputs['input_ids'].shape[0]
        return n_samples

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """
        Builds one sample by indexing every stored field at ``idx``.

        Args:
            idx (int): Position of the requested sample.

        Returns:
            Dict[str, Any]: Same keys as ``inputs``, each mapped to the
            ``idx``-th entry of the corresponding tensor.
        """
        sample = {}
        for field_name, values in self.inputs.items():
            sample[field_name] = values[idx]
        return sample

# Masking of dataset

In order to evaluate the model’s performance on the Masked Language Modeling (MLM) task, we apply three different masking strategies only during the inference phase. In all cases, special tokens such as [CLS], [SEP], and padding are excluded from masking:

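- Random masking: 15% of the tokens are randomly selected as prediction targets; following the standard BERT procedure, 80% of the selected tokens are replaced with the [MASK] token (ID 103), 10% are replaced with a random vocabulary token, and 10% are left unchanged.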

- Domain-specific masking: only technical or scientific terms—such as named entities or specialized vocabulary—are masked. This allows us to assess how well the model has learned domain-specific knowledge.

- Non-technical masking: only common or non-domain-specific terms (e.g., stop words or general vocabulary) are masked. This strategy helps evaluate the model’s grasp of general language structure and context.

  

In [4]:
class BERTMasker:
    """
    Utility class for applying BERT-style masking to input token IDs
    for the Masked Language Modeling (MLM) objective.

    Each non-special token is selected with probability ``mask_prob``. A selected
    token is then, following the original BERT paper:
        - 80% of the time: replaced with [MASK]
        - 10% of the time: replaced with a random vocabulary token
        - 10% of the time: kept unchanged

    Special tokens ([CLS], [SEP], [PAD]) are never selected.

    Attributes:
        tokenizer (BertTokenizer): Tokenizer providing the special token IDs and vocab size.
        mask_prob (float): Probability of selecting each token (default: 0.15).
    """

    def __init__(self, tokenizer: BertTokenizer, mask_prob: float = 0.15):
        """
        Initializes the BERTMasker with the tokenizer and masking probability.

        Args:
            tokenizer (BertTokenizer): The tokenizer used for accessing token IDs.
            mask_prob (float): Probability of selecting each token (default: 15%).

        Raises:
            ValueError: If ``mask_prob`` is outside [0, 1].
        """
        if not 0.0 <= mask_prob <= 1.0:
            raise ValueError(f"mask_prob must be in [0, 1], got {mask_prob}")
        self.tokenizer = tokenizer
        self.mask_prob = mask_prob

    def apply_masking(self, input_ids: Tensor, seed: int = 42) -> Tuple[Tensor, Tensor]:
        """
        Applies BERT-style masking to the input tensor of token IDs.

        All random draws come from a private generator seeded with ``seed``, so the
        result is reproducible and the global torch / Python RNG state is untouched.

        Args:
            input_ids (Tensor): Integer tensor of token IDs, e.g. (batch_size, seq_length).
            seed (int): Seed of the private random generator (default: 42).

        Returns:
            Tuple[Tensor, Tensor]:
                - masked_input_ids (Tensor): Copy of the input with selected tokens
                  replaced by [MASK], replaced by a random token, or left unchanged.
                - labels (Tensor): Original IDs at selected positions, -100 elsewhere.
        """
        generator = torch.Generator().manual_seed(seed)
        device = input_ids.device

        # Draws are made on CPU (where the generator lives) and moved next to the input.
        selection_draw = torch.rand(input_ids.shape, generator=generator).to(device)
        is_special = (
            (input_ids == self.tokenizer.cls_token_id)
            | (input_ids == self.tokenizer.sep_token_id)
            | (input_ids == self.tokenizer.pad_token_id)
        )
        # Use the configured probability rather than a hard-coded threshold.
        mask_arr = (selection_draw < self.mask_prob) & ~is_special

        labels = input_ids.clone()
        labels[~mask_arr] = -100  # Only compute loss on selected tokens

        # One uniform draw per position decides between the 80/10/10 outcomes.
        action_draw = torch.rand(input_ids.shape, generator=generator).to(device)
        use_mask_token = mask_arr & (action_draw < 0.8)
        use_random_token = mask_arr & (action_draw >= 0.8) & (action_draw < 0.9)
        # Remaining selected positions (action_draw >= 0.9) keep their original token.

        masked_input_ids = input_ids.clone()
        masked_input_ids[use_mask_token] = self.tokenizer.mask_token_id

        random_tokens = torch.randint(
            0,
            self.tokenizer.vocab_size,
            input_ids.shape,
            generator=generator,
            dtype=input_ids.dtype,
        ).to(device)
        masked_input_ids[use_random_token] = random_tokens[use_random_token]

        return masked_input_ids, labels

# Evaluation

In [5]:
class MLMEvaluator:
    """
    Evaluates a BERT model on the Masked Language Modeling (MLM) objective.

    This class provides utility functions to:
      - Compute loss, logits and accuracy on a single batch.
      - Aggregate loss and accuracy over a whole DataLoader.
      - Extract the top-k predicted token probabilities from the model's output.

    Attributes:
        model (BertForMaskedLM): A pretrained or fine-tuned BERT model for MLM evaluation.
    """

    def __init__(self, model: BertForMaskedLM):
        """
        Initializes the evaluator with a BERT MLM model.

        Args:
            model (BertForMaskedLM): The model to evaluate.
        """
        self.model = model
        self.model.eval()  # Set model to evaluation mode

    def evaluate(self, features: Dict[str, Tensor]) -> Dict[str, Any]:
        """
        Evaluates the model on one batch of MLM inputs.

        Only 'input_ids', 'attention_mask' and 'labels' are forwarded to the model;
        a 'token_type_ids' entry, if present, is ignored.

        Args:
            features (Dict[str, Tensor]): A dictionary containing input tensors:
                - 'input_ids': Tensor of token IDs.
                - 'attention_mask': Tensor indicating attention (non-padding).
                - 'labels': Tensor of target token IDs with -100 for ignored positions.

        Returns:
            Dict[str, Any]: A dictionary with:
                - 'loss': Loss returned by the model for this batch.
                - 'logits': Raw prediction scores from the model.
                - 'mlm_accuracy' (float): Fraction of labelled positions predicted
                  correctly (0.0 when the batch has no labelled position).
                - 'num_correct' (int): Number of correctly predicted labelled positions.
                - 'num_masked' (int): Number of labelled positions (labels != -100).
        """
        labels = features["labels"]

        with torch.no_grad():
            outputs = self.model(
                input_ids=features["input_ids"],
                attention_mask=features["attention_mask"],
                labels=labels,
            )

            logits = outputs.logits  # Shape: [batch_size, sequence_length, vocabulary_size]
            predictions = torch.argmax(logits, dim=-1)  # Shape: [batch_size, sequence_length]

            # Positions that carry a real target (everything else is -100)
            mask = labels != -100
            correct = (predictions == labels) & mask

            num_correct = int(correct.sum().item())
            num_masked = int(mask.sum().item())

        # Avoid division by zero on batches without any labelled position
        accuracy = num_correct / num_masked if num_masked > 0 else 0.0

        return {
            "loss": outputs.loss,
            "logits": logits,
            "mlm_accuracy": accuracy,
            "num_correct": num_correct,
            "num_masked": num_masked,
        }

    def evaluate_mlm(self, dataloader: torch.utils.data.DataLoader, device: torch.device) -> Dict[str, float]:
        """
        Evaluate the model on an entire DataLoader for the MLM objective.

        Args:
            dataloader (DataLoader): Iterable of batches (dicts with input_ids,
                attention_mask, labels).
            device (torch.device): CPU or GPU

        Returns:
            dict: {
                'avg_loss': Mean of the per-batch losses, ignoring batches whose loss
                    is NaN (NaN if no batch produced a valid loss),
                'avg_accuracy': Correct predictions divided by labelled positions,
                    counted over all batches (0.0 if nothing was labelled)
            }
        """
        self.model.to(device)
        self.model.eval()

        loss_sum = 0.0
        loss_batches = 0
        correct_total = 0
        masked_total = 0

        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            result = self.evaluate(batch)

            # NaN never compares equal to anything, so it must be detected with
            # isnan(); a NaN batch is excluded from both the sum and the divisor.
            loss_value = result["loss"].item()
            if not np.isnan(loss_value):
                loss_sum += loss_value
                loss_batches += 1

            # Accumulate counts instead of per-batch ratios so that batches with
            # few (or zero) labelled tokens do not distort the overall accuracy.
            correct_total += result["num_correct"]
            masked_total += result["num_masked"]

        avg_loss = loss_sum / loss_batches if loss_batches > 0 else float("nan")
        avg_accuracy = correct_total / masked_total if masked_total > 0 else 0.0

        return {
            "avg_loss": avg_loss,
            "avg_accuracy": avg_accuracy
        }

    def get_token_probabilities(self, logits: Tensor, top_k: int = 5) -> List[List[Tuple[int, float]]]:
        """
        Extracts the top-k token predictions for each position in each sequence.

        Args:
            logits (Tensor): The model's output logits of shape (batch_size, seq_len, vocab_size).
            top_k (int): Number of top predictions to return per token position
                (capped at the vocabulary size).

        Returns:
            List[List[Tuple[int, float]]]: One entry per token position (batch and
            sequence dimensions flattened), each a list of (token_id, probability)
            pairs sorted by decreasing probability.
        """
        k = min(top_k, logits.shape[-1])  # topk raises if k exceeds the last dimension
        probabilities = torch.softmax(logits, dim=-1)
        top_probs, top_indices = torch.topk(probabilities, k=k, dim=-1)

        # Convert once to nested Python lists instead of iterating tensor elements.
        flat_indices = top_indices.reshape(-1, k).tolist()
        flat_probs = top_probs.reshape(-1, k).tolist()

        return [
            list(zip(pos_indices, pos_probs))
            for pos_indices, pos_probs in zip(flat_indices, flat_probs)
        ]
    



In [6]:
# Start timer
start_time = time.time()

# Seed Python's global RNG so that any masking decision drawn from the `random`
# module is repeatable from one run of this cell to the next.
random.seed(42)

# 1. Load config from YAML
with open("../File_Yaml/Esecuzione_Perf_10k_SciBERT.yaml", "r") as f:
    config = yaml.safe_load(f)

# 2. Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained(config["tokenizer_name"])
model = BertForMaskedLM.from_pretrained(config["model_path"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 3. Load dataset
df = pd.read_csv(config["data_path"])
colonne_da_convertire = ["input_ids", "attention_mask"]

# The CSV stores each token list as its string representation; parse it back
# into a Python list (literal_eval only accepts literals, never arbitrary code).
for col in colonne_da_convertire:
    df[col] = df[col].apply(ast.literal_eval)

# 4. Tokenization and masking
segmenter = TokenizerSegmenter(df, tokenizer, max_length=config["max_length"])
features = segmenter.process()
masker = BERTMasker(tokenizer)
features["input_ids"], features["labels"] = masker.apply_masking(features["input_ids"])

# 5. Dataset & Dataloader
dataset = BERTPretrainingDataset(features)
dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=False)

# 6. Evaluation
evaluator = MLMEvaluator(model)
perf = evaluator.evaluate_mlm(dataloader, device)

# 7. Calculate total execution time in minutes
execution_time_min = (time.time() - start_time) / 60

# 8. Extract model name from config["model_path"]
#    (third "/"-separated component, so the path needs at least three components)
model_name = config["model_path"].split("/")[2]

# 9. Path to CSV file
csv_path = "../Datasets/Perf_Model.csv"

# 10. Save results to CSV
df_perf = pd.DataFrame([{
    "model": model_name,
    "avg_loss": perf["avg_loss"],
    "avg_accuracy": round(perf["avg_accuracy"],3)
}])

# Create the target folder first so a long evaluation is not lost on a missing path.
os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

# Append to the results file; emit the header only when the file is new or empty.
write_header = (not os.path.exists(csv_path)) or os.path.getsize(csv_path) == 0
df_perf.to_csv(csv_path, mode='a', index=False, header=write_header)

print(f"Results saved in {csv_path}")
print(f"Execution time: {execution_time_min:.2f} minutes")


Results saved in ../Datasets/Perf_Model.csv
Execution time: 6.44 minutes


In [7]:
# Disabled plotting cell: the whole snippet below is wrapped in a string literal,
# so none of it is executed; as the last expression of the cell, the string is
# only echoed back as the cell output.
# NOTE(review): if this code is re-enabled, the list named `time` would shadow the
# `time` module imported at the top of the notebook — rename it first.
"""# Data
loss = [1.6, 1.7, 1.9]
time = [408, 257, 248]
labels = ['DAPT 100k', 'DAPT 50k', 'DAPT 10k']

# Create folder if it does not exist
output_path = "../Plots"
os.makedirs(output_path, exist_ok=True)

# Create scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(time, loss, color='orange')

# Add labels to the points
for i, label in enumerate(labels):
    plt.annotate(label, (time[i] + 10, loss[i] + 0.01))

# Titles and axes
plt.title('Model Comparison: Loss vs Execution Time')
plt.xlabel('Execution Time (minutes)')
plt.ylabel('Final Loss')

# Axis limits
plt.xlim(min(time) - 10, max(time) + 50)
plt.ylim(min(loss) - 0.05, max(loss) + 0.1)

# Adjust layout
plt.tight_layout()

# Save the plot
output_file = os.path.join(output_path, "loss_vs_time.png")
plt.savefig(output_file, dpi=300)

# Show the plot
plt.show()
"""

'import matplotlib.pyplot as plt\nimport os\n\n# Dati\nloss = [1.6, 1.7, 1.9]\ntempo = [408, 257, 248]\netichette = [\'DAPT 100k\', \'DAPT 50k\', \'DAPT 10k\']\n\n# Crea cartella se non esiste\noutput_path = "../Plots"\nos.makedirs(output_path, exist_ok=True)\n\n# Crea scatter plot\nplt.figure(figsize=(8, 6))\nplt.scatter(tempo, loss, color=\'orange\')\n\n# Aggiungi etichette ai punti\nfor i, label in enumerate(etichette):\n    plt.annotate(label, (tempo[i] + 10, loss[i] + 0.01))\n\n# Titoli e assi\nplt.title(\'Confronto Modelli: Loss vs Tempo di Esecuzione\')\nplt.xlabel(\'Tempo di esecuzione (minuti)\')\nplt.ylabel(\'Loss finale\')\n\n# Limiti assi\nplt.xlim(min(tempo) - 10, max(tempo) + 50)\nplt.ylim(min(loss) - 0.05, max(loss) + 0.1)\n\n# Rimuovi griglia\nplt.tight_layout()\n\n# Salva il grafico\noutput_file = os.path.join(output_path, "loss_vs_tempo.png")\nplt.savefig(output_file, dpi=300)\n\n# Mostra il grafico\nplt.show()\n'