# Library Imports


In this section, we import all the necessary libraries for data preprocessing, tokenization, model training, and evaluation.


In [None]:
import random
from typing import Any, Callable, Dict, List, Optional, Tuple

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForMaskedLM

# Download the 'punkt_tab' NLTK resource at import time.
# NOTE(review): presumably this is the sentence-splitting data used by
# `sent_tokenize` -- confirm against the installed NLTK version.
nltk.download('punkt_tab')


### Preprocessing

The abstracts are first split into sentences and tokenized with BERT’s tokenizer. A single `[CLS]` token is placed at the start of each abstract, and a `[SEP]` token is appended after every sentence.

Since BERT has a maximum sequence length of 512 tokens, we segment longer token sequences into overlapping chunks of 512 tokens, with a stride of 462 tokens (512 - 50) to maintain some context overlap between segments.

Each chunk is padded to the maximum length, and attention masks and token type IDs are created accordingly. The output is a dictionary of tensors ready for input into the BERT model.


In [None]:
class TokenizerSegmenter:
    """
    Turns raw abstracts into fixed-size, overlapping BERT input segments.

    Each abstract is split into sentences and encoded as the token stream
    ``[CLS] sent_1 [SEP] sent_2 [SEP] ...``. Streams longer than ``max_length``
    are covered by a sliding window that advances ``max_length - overlap``
    tokens at a time. Every window is forced to start with ``[CLS]`` and end
    with ``[SEP]``, then right-padded to ``max_length``. Window boundaries are
    purely positional, so a window may begin or end in the middle of a sentence.

    Attributes:
        tokenizer (BertTokenizer): Tokenizer providing ``encode`` and the
            ``cls``/``sep``/``pad`` token IDs.
        max_length (int): Length of every emitted segment (default 512).
        overlap (int): Number of tokens shared by consecutive windows (default 50).
        sentence_splitter (Optional[Callable[[str], List[str]]]): Function used to
            split an abstract into sentences; ``None`` means NLTK's ``sent_tokenize``.
    """

    def __init__(
        self,
        tokenizer: BertTokenizer,
        max_length: int = 512,
        overlap: int = 50,
        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
    ):
        """
        Initializes the TokenizerSegmenter.

        Args:
            tokenizer (BertTokenizer): The tokenizer used to encode the abstracts.
            max_length (int): Length of each tokenized segment.
            overlap (int): Tokens shared between consecutive windows. At least 2
                are required because forcing ``[CLS]``/``[SEP]`` onto a full
                window can displace up to two tokens, which must then reappear
                in the next window.
            sentence_splitter (Optional[Callable[[str], List[str]]]): Custom
                sentence splitter; defaults to NLTK's ``sent_tokenize``.

        Raises:
            ValueError: If ``overlap`` is not in the range ``[2, max_length)``.
        """
        if not 2 <= overlap < max_length:
            raise ValueError(
                f"overlap must satisfy 2 <= overlap < max_length, "
                f"got overlap={overlap}, max_length={max_length}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.overlap = overlap
        self.sentence_splitter = sentence_splitter

    def _add_special_tokens(self, chunk: List[int], cls_id: int, sep_id: int) -> List[int]:
        """Returns ``chunk`` starting with [CLS] and ending with [SEP], at most ``max_length`` long."""
        if chunk[0] != cls_id:
            # Only truncates when the window is already full; shorter windows
            # keep every token, including their closing [SEP].
            chunk = [cls_id] + chunk[: self.max_length - 1]
        if chunk[-1] != sep_id:
            if len(chunk) < self.max_length:
                chunk = chunk + [sep_id]
            else:
                # Full window: the overwritten token is still present in the
                # next window thanks to the overlap.
                chunk[-1] = sep_id
        return chunk

    def process(self, abstracts: List[str]) -> Dict[str, Tensor]:
        """
        Processes abstracts into padded, tokenized BERT-compatible inputs
        using a sliding window strategy.

        Args:
            abstracts (List[str]): Raw abstract texts to be tokenized and segmented.

        Returns:
            Dict[str, Tensor]: A dictionary of ``torch.long`` tensors, each of
            shape ``(num_segments, max_length)``:
                - 'input_ids': Padded token IDs for each segment.
                - 'attention_mask': 1 for real tokens, 0 for padding.
                - 'token_type_ids': Segment token type IDs (all zeros).
        """
        split_sentences = (
            self.sentence_splitter if self.sentence_splitter is not None else sent_tokenize
        )
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        pad_id = self.tokenizer.pad_token_id
        stride = self.max_length - self.overlap

        input_ids_all: List[List[int]] = []
        attention_masks_all: List[List[int]] = []

        for abstract in abstracts:
            # One [CLS] for the whole abstract, one [SEP] after every sentence.
            token_ids = [cls_id]
            for sent in split_sentences(abstract):
                token_ids.extend(self.tokenizer.encode(sent, add_special_tokens=False))
                token_ids.append(sep_id)

            for start in range(0, len(token_ids), stride):
                chunk = self._add_special_tokens(
                    token_ids[start:start + self.max_length], cls_id, sep_id
                )
                pad_len = self.max_length - len(chunk)

                input_ids_all.append(chunk + [pad_id] * pad_len)
                attention_masks_all.append([1] * len(chunk) + [0] * pad_len)

                # This window already reaches the end of the stream; any further
                # window would lie entirely inside it and only duplicate tokens.
                if start + self.max_length >= len(token_ids):
                    break

        # Explicit shape keeps the result 2-D even when there are no segments.
        shape = (len(input_ids_all), self.max_length)
        input_ids = torch.tensor(input_ids_all, dtype=torch.long).reshape(shape)
        attention_mask = torch.tensor(attention_masks_all, dtype=torch.long).reshape(shape)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": torch.zeros_like(input_ids),
        }


# Dataset Creation

In [None]:
class BERTPretrainingDataset(Dataset):
    """
    A PyTorch Dataset that serves pre-tokenized inputs for BERT-style
    Masked Language Modeling (MLM).

    The dataset does no tokenization or masking itself: it wraps a dictionary
    of tensors and returns, for a given index, the row at that index from
    every tensor in the dictionary.

    Attributes:
        inputs (Dict[str, torch.Tensor]): Mapping from field name to tensor.
    """

    def __init__(self, inputs: Dict[str, torch.Tensor]):
        """
        Initializes the dataset with input tensors.

        Args:
            inputs (Dict[str, torch.Tensor]): Dictionary that must contain
                'input_ids' (its first dimension defines the dataset size).
                The documented fields are:
                - 'input_ids': Token IDs tensor of shape (N, seq_len)
                - 'token_type_ids': Segment type IDs tensor
                - 'attention_mask': Attention mask tensor
                - 'labels': MLM labels tensor (-100 for non-masked tokens)
        """
        self.inputs = inputs

    def __len__(self) -> int:
        """
        Returns:
            int: Number of samples, i.e. the first dimension of 'input_ids'.
        """
        return self.inputs['input_ids'].shape[0]

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Fetches the input sample at the specified index.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            Dict[str, torch.Tensor]: One entry per key of ``inputs``, each
            holding that tensor indexed at ``idx``.
        """
        return {key: tensor[idx] for key, tensor in self.inputs.items()}

# Masking of dataset

In order to evaluate the model’s performance on the Masked Language Modeling (MLM) task, we apply three different masking strategies only during the inference phase. In all cases, special tokens such as [CLS], [SEP], and padding are excluded from masking:

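- Random masking: 15% of the tokens are randomly selected for prediction. Following the standard BERT procedure, 80% of the selected tokens are replaced with the [MASK] token (ID 103), 10% are replaced with a random vocabulary token, and 10% are left unchanged.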

- Domain-specific masking: only technical or scientific terms—such as named entities or specialized vocabulary—are masked. This allows us to assess how well the model has learned domain-specific knowledge.

- Non-technical masking: only common or non-domain-specific terms (e.g., stop words or general vocabulary) are masked. This strategy helps evaluate the model’s grasp of general language structure and context.

  

In [None]:
class BERTMasker:
    """
    Utility class for applying BERT-style masking to input token IDs
    for the Masked Language Modeling (MLM) objective.

    Each non-special token is selected with probability ``mask_prob``. A
    selected token is then:
        - replaced with [MASK] 80% of the time,
        - replaced with a random vocabulary token 10% of the time,
        - left unchanged 10% of the time.

    Special tokens ([CLS], [SEP], [PAD]) are never selected.

    All random draws use PyTorch's global RNG, so ``torch.manual_seed`` alone
    is enough to make the masking reproducible.

    Attributes:
        tokenizer (BertTokenizer): Tokenizer providing the special token IDs
            and ``vocab_size``.
        mask_prob (float): Probability of selecting each token (default: 0.15).
    """

    def __init__(self, tokenizer: BertTokenizer, mask_prob: float = 0.15):
        """
        Initializes the BERTMasker with the tokenizer and masking probability.

        Args:
            tokenizer (BertTokenizer): The tokenizer used for accessing token IDs.
            mask_prob (float): Probability of masking each token (default: 15%).
        """
        self.tokenizer = tokenizer
        self.mask_prob = mask_prob

    def apply_masking(self, input_ids: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Applies BERT-style masking to the input tensor of token IDs.

        The input tensor is not modified; both outputs are new tensors on the
        same device as ``input_ids``.

        Args:
            input_ids (Tensor): Integer tensor of token IDs, documented as
                shape (batch_size, seq_length).

        Returns:
            Tuple[Tensor, Tensor]:
                - masked_input_ids (Tensor): Copy of the input where selected
                  tokens are replaced with [MASK], a random token, or kept.
                - labels (Tensor): Original IDs at selected positions and -100
                  everywhere else, so the loss only covers selected tokens.
        """
        masked_input_ids = input_ids.clone()
        labels = input_ids.clone()
        device = input_ids.device

        is_special = (
            (input_ids == self.tokenizer.cls_token_id)
            | (input_ids == self.tokenizer.sep_token_id)
            | (input_ids == self.tokenizer.pad_token_id)
        )

        # Draw on the input's device so the boolean ops below never mix devices.
        selected = (torch.rand(input_ids.shape, device=device) < self.mask_prob) & ~is_special
        labels[~selected] = -100  # Only compute loss on selected tokens

        # A single uniform draw per position decides the 80/10/10 split.
        action = torch.rand(input_ids.shape, device=device)
        use_mask_token = selected & (action < 0.8)
        use_random_token = selected & (action >= 0.8) & (action < 0.9)
        # Remaining selected positions (action >= 0.9) keep their original token.

        masked_input_ids[use_mask_token] = self.tokenizer.mask_token_id

        # Upper bound is exclusive, so IDs span 0 .. vocab_size - 1.
        random_ids = torch.randint(
            0,
            self.tokenizer.vocab_size,
            input_ids.shape,
            dtype=input_ids.dtype,
            device=device,
        )
        masked_input_ids[use_random_token] = random_ids[use_random_token]

        return masked_input_ids, labels

# Evaluation

In [None]:
class MLMEvaluator:
    """
    Evaluates a BERT model on the Masked Language Modeling (MLM) objective.

    This class provides utility functions to:
      - Compute the loss and logits on a single batch (``evaluate``).
      - Compute the average loss over a whole DataLoader (``evaluate_batches``).
      - Extract the top-k predicted token probabilities from the model's output.

    Attributes:
        model (BertForMaskedLM): A pretrained or fine-tuned BERT model for MLM evaluation.
    """

    def __init__(self, model: BertForMaskedLM):
        """
        Initializes the evaluator with a BERT MLM model.

        Args:
            model (BertForMaskedLM): The model to evaluate.
        """
        self.model = model
        self.model.eval()  # Set model to evaluation mode

    def evaluate(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Runs one forward pass, without gradients, on the provided features.

        The tensors are passed to the model as-is; the caller is responsible
        for placing them on the same device as the model.

        Args:
            features (Dict[str, Tensor]): A dictionary containing input tensors:
                - 'input_ids': Tensor of token IDs.
                - 'attention_mask': Tensor indicating attention (non-padding).
                - 'token_type_ids': Tensor of segment IDs.
                - 'labels': Tensor of target token IDs with -100 for ignored positions.

        Returns:
            Dict[str, Tensor]: A dictionary with:
                - 'loss': The ``loss`` attribute of the model output.
                - 'logits': The ``logits`` attribute of the model output.
        """
        with torch.no_grad():
            outputs = self.model(
                input_ids=features["input_ids"],
                attention_mask=features["attention_mask"],
                token_type_ids=features["token_type_ids"],
                labels=features["labels"]
            )
            return {
                "loss": outputs.loss,
                "logits": outputs.logits
            }

    def evaluate_batches(self, dataloader: DataLoader, device: torch.device) -> Dict[str, float]:
        """
        Evaluates the model over every batch and averages the MLM loss.

        The model and each batch are moved to ``device``. Every batch loss is
        weighted by the number of labelled (non -100) positions in that batch,
        so the result is a per-token average that does not depend on how the
        data was batched. Batches without any labelled position are skipped.
        NOTE(review): this weighting assumes the model's loss is a mean over
        labelled positions -- confirm for the model in use.

        Args:
            dataloader (DataLoader): Iterable of feature dictionaries accepted
                by ``evaluate``.
            device (torch.device): Device on which to run the evaluation.

        Returns:
            Dict[str, float]: A dictionary with:
                - 'average_loss': Token-weighted mean loss (NaN if no position
                  was labelled in any batch).
                - 'num_masked_tokens': Total number of labelled positions.
        """
        self.model.to(device)

        weighted_loss_sum = 0.0
        num_masked_tokens = 0

        for batch in dataloader:
            batch = {key: tensor.to(device) for key, tensor in batch.items()}
            masked_in_batch = int((batch["labels"] != -100).sum())
            if masked_in_batch == 0:
                continue  # No targets in this batch, so nothing to average.

            batch_loss = float(self.evaluate(batch)["loss"])
            weighted_loss_sum += batch_loss * masked_in_batch
            num_masked_tokens += masked_in_batch

        if num_masked_tokens:
            average_loss = weighted_loss_sum / num_masked_tokens
        else:
            average_loss = float("nan")

        return {
            "average_loss": average_loss,
            "num_masked_tokens": num_masked_tokens,
        }

    def get_token_probabilities(self, logits: Tensor, top_k: int = 5) -> List[List[Tuple[int, float]]]:
        """
        Extracts the top-k token predictions for each position in each sequence.

        Args:
            logits (Tensor): The model's output logits of shape (batch_size, seq_len, vocab_size).
            top_k (int): Number of top predictions to return per token position.

        Returns:
            List[List[Tuple[int, float]]]: One inner list per token position
            (batch and sequence dimensions flattened together), each holding
            ``top_k`` (token_id, probability) pairs in descending probability.
        """
        probabilities = torch.softmax(logits, dim=-1)
        top_probs, top_indices = torch.topk(probabilities, k=top_k, dim=-1)

        # tolist() converts to plain Python ints/floats in a single pass.
        flat_ids = top_indices.reshape(-1, top_k).tolist()
        flat_probs = top_probs.reshape(-1, top_k).tolist()

        return [
            list(zip(pos_ids, pos_probs))
            for pos_ids, pos_probs in zip(flat_ids, flat_probs)
        ]
    



In [None]:
# 1. Setup
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Keep the model on the same device that is handed to the evaluator below.
model.to(device)

# 2. Preprocessing
segmenter = TokenizerSegmenter(tokenizer)
masker = BERTMasker(tokenizer)

df = pd.read_csv("../Datasets/dataset_20k_val.csv")
# Drop missing abstracts and pass a plain list of strings, as the segmenter expects.
abstracts = df["abstract_clean"].dropna().astype(str).tolist()
features = segmenter.process(abstracts)
# Masking replaces the clean input IDs and adds the MLM labels.
features["input_ids"], features["labels"] = masker.apply_masking(features["input_ids"])

# 3. Dataset and DataLoader
dataset = BERTPretrainingDataset(features)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# 4. Evaluation
evaluator = MLMEvaluator(model)
results = evaluator.evaluate_batches(dataloader, device)

print("Average loss:", results["average_loss"])