# Library Imports

We import all the necessary Python libraries used throughout the analysis.


In [1]:
import pandas as pd
import random
import re
import seaborn as sns
from typing import List, Tuple, Dict, Any
from transformers import BertTokenizer, BertForPreTraining, PreTrainedTokenizer, get_linear_schedule_with_warmup
import torch
import nltk
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt
import os

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
class DataPreparation:
    """
    Prepares data for BERT-style pretraining tasks, including Masked Language Modeling (MLM), 
    from a dataset of text abstracts.

    Attributes:
        df (pd.DataFrame): DataFrame containing abstracts with a column 'abstract_clean'.
        tokenizer (PreTrainedTokenizer): HuggingFace tokenizer compatible with BERT.
        max_length (int): Maximum token length for BERT inputs.
        mask_prob (float): Probability of masking tokens for MLM
    """
    def __init__(self, dataframe: pd.DataFrame, tokenizer: PreTrainedTokenizer,
                 max_length: int = 512, mask_prob: float = 0.15):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask_prob = mask_prob
    

    def tokenize_and_segment_abstracts(self) -> Dict[str, Any]:
        """
        Segments abstracts into overlapping tokenized chunks for MLM training.

        Returns:
            dict: Dictionary containing:
                - input_ids (torch.Tensor): Tensor of token IDs with shape (num_segments, Max_length).
                - attention_mask (torch.Tensor): Attention masks for each segment.
                - token_type_ids (torch.Tensor): Token type IDs for each segment.
        """
        input_ids_all, attention_masks_all, token_type_ids_all = [], [], []

        for abstract in self.df["abstract_clean"]:
            sentences = sent_tokenize(abstract)
            token_ids = [self.tokenizer.cls_token_id]

            for sent in sentences:
                sent_ids = self.tokenizer.encode(sent, add_special_tokens=False)
                token_ids.extend(sent_ids + [self.tokenizer.sep_token_id])
                
            chunk = token_ids[: self.max_length]
            len_chunk = len(chunk)
                
            # Ensure CLS is at the beginning
            if chunk[0] != self.tokenizer.cls_token_id:
                    chunk = [self.tokenizer.cls_token_id] + chunk[:(len_chunk-1)]
            # Ensure SEP is at the end
            if chunk[-1] != self.tokenizer.sep_token_id:
                    chunk[-1]= self.tokenizer.sep_token_id
            pad_len = self.max_length - len_chunk

            chunk_padded = chunk + [self.tokenizer.pad_token_id] * pad_len
            attention_mask = [1] * len_chunk + [0] * pad_len
            token_type_ids = [0] * self.max_length

            input_ids_all.append(chunk_padded)
            attention_masks_all.append(attention_mask)
            token_type_ids_all.append(token_type_ids)
                

        return {
            "input_ids": torch.tensor(input_ids_all, dtype=torch.long),
            "attention_mask": torch.tensor(attention_masks_all, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids_all, dtype=torch.long),
        }
        
    def tokenize_and_segment_abstracts_2(self) -> Dict[str, Any]:
        """
        Segments abstracts into 512-token chunks for MLM training, and tracks how many segments
        each abstract was split into (for downstream use or tracking).
        
        Returns:
            dict: Dictionary containing:
                - input_ids (torch.Tensor)
                - attention_mask (torch.Tensor)
                - token_type_ids (torch.Tensor)
                - segment_index (torch.Tensor): 0 for first segment, 1 for second, etc.
        """
        input_ids_all, attention_masks_all, token_type_ids_all, segment_index_all = [], [], [], []
        stride = 50  
    
        for abstract in self.df["abstract_clean"]:
            sentences = sent_tokenize(abstract)
            token_ids = [self.tokenizer.cls_token_id]
    
            for sent in sentences:
                sent_ids = self.tokenizer.encode(sent, add_special_tokens=False)
                token_ids.extend(sent_ids + [self.tokenizer.sep_token_id])
    
            segment_count = 0
            start = 0
            while start < len(token_ids):
                chunk = token_ids[start:start + self.max_length]
                len_chunk = len(chunk)
    
                # Ensure CLS at start and SEP at end
                if chunk[0] != self.tokenizer.cls_token_id:
                    chunk = [self.tokenizer.cls_token_id] + chunk[:self.max_length - 1]
                if chunk[-1] != self.tokenizer.sep_token_id:
                    if len(chunk) == self.max_length:
                        chunk[-1] = self.tokenizer.sep_token_id
                    else:
                        chunk.append(self.tokenizer.sep_token_id)
    
                # Pad to max_length
                chunk = chunk[:self.max_length]
                pad_len = self.max_length - len(chunk)
                chunk_padded = chunk + [self.tokenizer.pad_token_id] * pad_len
                attention_mask = [1] * len(chunk) + [0] * pad_len
                token_type_ids = [0] * self.max_length
    
                input_ids_all.append(chunk_padded)
                attention_masks_all.append(attention_mask)
                token_type_ids_all.append(token_type_ids)
                segment_index_all.append(segment_count)
    
                segment_count += 1
                start += self.max_length - stride  # move forward with stride
    
        return {
            "input_ids": torch.tensor(input_ids_all, dtype=torch.long),
            "attention_mask": torch.tensor(attention_masks_all, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids_all, dtype=torch.long),
            "segment_index": torch.tensor(segment_index_all, dtype=torch.long),
        }


In [11]:
df = pd.read_csv("../Datasets/cleaned_dataset.csv")


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")


prep = DataPreparation(df, tokenizer)
mlm_encodings = prep.tokenize_and_segment_abstracts()


KeyboardInterrupt: 

In [4]:
# Count the non-zero values per row
non_zero_per_row = (mlm_encodings["input_ids"] != 0).sum(dim=1)
lista = []

# Store the result in a list
for i, count in enumerate(non_zero_per_row):
    lista.append(count.item())

# Print how many rows have more than 256 non-zero values
print("Num_ab", len([x for x in lista if x > 256]))

# Print how many rows have more than 300 non-zero values
print("Num_ab", len([x for x in lista if x > 300]))

# Find the indices where the value is greater than 300
indices = [i for i, x in enumerate(lista) if x > 300]

# save the indices to a CSV file

df_index = pd.DataFrame(indices, columns=['indices'])
#df_index.to_csv('../Datasets/Index_del.csv', index=False)

Num_ab 61465
Num_ab 31601


## Real length abstract

This part allows to identify the real len of the abstract and it is useful to create a boxplot

In [5]:
mlm_encodings_2 = prep.tokenize_and_segment_abstracts_2()

In [6]:
def reconstruct_original_sequences(encodings: Dict[str, torch.Tensor]) -> List[List[int]]:
    """
    Reconstructs the original token sequences (input_ids) from overlapping segments created with a stride.

    Args:
        encodings (dict): Output from the `tokenize_and_segment_abstracts_2()` function. 
                          Should contain "input_ids" and "segment_index" as keys.

    Returns:
        List[List[int]]: A list of reconstructed token sequences, one per abstract.
    """
    input_ids = encodings["input_ids"]
    segment_indices = encodings["segment_index"]

    reconstructed_sequences = []
    current_sequence = []
    last_segment_id = -1
    stride = 50
    max_len = 512

    for i, seg_id in enumerate(segment_indices):
        chunk = input_ids[i].tolist()

        # First segment of a new abstract
        if seg_id == 0:
            if current_sequence:
                reconstructed_sequences.append(current_sequence)
            current_sequence = chunk
        else:
            # Only add new tokens (excluding the overlapping stride)
            new_tokens = chunk[stride:]
            current_sequence += new_tokens

        last_segment_id = seg_id

    # Append the last reconstructed sequence
    if current_sequence:
        reconstructed_sequences.append(current_sequence)

    return reconstructed_sequences


In [7]:

reconstructed = reconstruct_original_sequences(mlm_encodings_2)

# Compute the actual length (i.e., number of tokens ≠ 0) for each reconstructed sequence
non_zero_lengths = [sum(1 for token in seq if token != 0) for seq in reconstructed]

# Convert the list of lengths into a pandas Series
non_zero_series = pd.Series(non_zero_lengths)

# Print descriptive statistics of the sequence lengths
print(non_zero_series.describe())


count    200094.000000
mean        218.001234
std          79.873906
min           4.000000
25%         162.000000
50%         214.000000
75%         271.000000
max         876.000000
dtype: float64


Compute percentage of sequences exceeding token length thresholds




In [8]:

# Total number of reconstructed sequences
total = len(non_zero_lengths)

# Compute the percentage of sequences longer than 512 tokens
percent_over_512 = sum(l > 512 for l in non_zero_lengths) / total * 100

# Compute the percentage of sequences longer than 300 tokens
percent_over_300 = sum(l > 300 for l in non_zero_lengths) / total * 100

# Compute the percentage of sequences longer than 256 tokens
percent_over_256 = sum(l > 256 for l in non_zero_lengths) / total * 100

# Print the results with percentage and absolute count
print(f"Percentage > 512 tokens: {percent_over_512:.2f}% ({sum(l > 512 for l in non_zero_lengths)} sequences)")
print(f"Percentage > 300 tokens: {percent_over_300:.2f}% ({sum(l > 300 for l in non_zero_lengths)} sequences)")
print(f"Percentage > 256 tokens: {percent_over_256:.2f}% ({sum(l > 256 for l in non_zero_lengths)} sequences)")


Percentage > 512 tokens: 0.03% (65 sequences)
Percentage > 300 tokens: 15.79% (31601 sequences)
Percentage > 256 tokens: 30.72% (61465 sequences)
