## 1. Library Imports


In this section, we import all the necessary libraries for data preprocessing, tokenization, model training, and evaluation.

In [2]:
import pandas as pd
import random
import torch
import re

from transformers import BertTokenizer, BertForPreTraining

from typing import List, Tuple, Dict, Any

import nltk
from nltk import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# 2. Dataset Loading

In this section, we load the dataset of scientific abstracts that were previously collected and cleaned

In [3]:
df = pd.read_csv("../Datasets/cleaned_dataset.csv",encoding='utf-8')

In [3]:
index_del = pd.read_csv("../Datasets/Index_del.csv")

# Dataset Creation

The following datasets have been prepared for various stages of the experimental pipeline:

- **Domain-Adaptive Pretraining (DAPT):**  
  Creation of three datasets consisting of:
  - 10K abstracts  
  - 50K abstracts  
  - 100K abstracts  

- **Masked Language Modeling (MLM) Evaluation:**  
  A separate dataset of 20K abstracts, not overlapping with the DAPT datasets, used to evaluate the model's performance on the MLM task.

- **Fine-Tuning:**  
  A residual dataset, composed of all remaining abstracts not used in DAPT or MLM evaluation, reserved for downstream fine-tuning tasks (e.g., classification).


In [4]:
# Parameters
DAPT_SAMPLE_SIZE = 100_000      # Total abstracts to sample for main dataset
SAMPLE_10K = 10_000             # Size of smaller subset from 100K
SAMPLE_50K = 50_000             # Size of larger subset from 100K 
VAL_SAMPLE_SIZE = 20_000        # Validation dataset size from outside the 100K
SEED = 123                     # Random seed for reproducibility

random.seed(SEED)

indices_to_delete = index_del['indices']

# delete indeces from df
df = df.drop(index=indices_to_delete, errors='ignore').reset_index()
N = len(df)  # Total number of abstracts available

# Step 1: Randomly sample 100K abstracts for the main domain-adaptive pretraining dataset
indices_100k = set(random.sample(range(N), DAPT_SAMPLE_SIZE))

# Step 2: From the 100K dataset, randomly select 10K abstracts
indices_10k = set(random.sample(indices_100k, SAMPLE_10K))

# Step 3: From the remaining 90K (100K minus 10K), select 50K abstracts
indices_50k = set(random.sample(indices_100k, SAMPLE_50K))

# Step 4: From the abstracts NOT in the 100K sample, randomly select 20K for validation using MLM
remaining_after_100k = set(range(N)) - indices_100k
indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))

# Step 5: The remaining abstracts not selected in any of the above datasets for Fine-Tuning
indices_remaining = set(range(N)) - indices_100k - indices_10k_val

# Create the five datasets using the selected indices
df_100k = df.loc[list(indices_100k)].reset_index()
df_10k = df.loc[list(indices_10k)].reset_index()
df_50k = df.loc[list(indices_50k)].reset_index()
df_10k_val = df.loc[list(indices_10k_val)].reset_index()
df_remaining = df.loc[list(indices_remaining)].reset_index()

since Python 3.9 and will be removed in a subsequent version.
  indices_10k = set(random.sample(indices_100k, SAMPLE_10K))
since Python 3.9 and will be removed in a subsequent version.
  indices_50k = set(random.sample(indices_100k, SAMPLE_50K))
since Python 3.9 and will be removed in a subsequent version.
  indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))


In [5]:
class TokenizerSegmenter:
    """
    A utility class for segmenting and tokenizing textual abstracts into
    overlapping BERT-compatible input chunks.

    This is especially useful for handling long abstracts that exceed the BERT
    maximum input length (typically 512 tokens). It applies a sliding window
    approach with a defined stride to create multiple segments per abstract,
    ensuring coverage while preserving sentence boundaries.

    Attributes:
        tokenizer (BertTokenizer): Pretrained BERT tokenizer for tokenizing text.
        max_length (int): Maximum sequence length for BERT input (default is 512).
    """

    def __init__(self, tokenizer: BertTokenizer, max_length: int = 512):
        """
        Initializes the TokenizerSegmenter.

        Args:
            tokenizer (BertTokenizer): The tokenizer used to encode the abstracts.
            max_length (int): Maximum length of tokenized input sequences.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def process(self, abstracts: List[str]) -> Dict[str, Any]:
        """
        Processes a list of abstracts into padded, tokenized BERT-compatible inputs
        using a sliding window strategy.

        Args:
            abstracts (List[str]): List of raw abstract texts to be tokenized and segmented.

        Returns:
            Dict[str, Any]: A dictionary containing:
                - 'input_ids' (Tensor): Padded token IDs for each segment.
                - 'attention_mask' (Tensor): Attention masks for each segment.
                - 'token_type_ids' (Tensor): Segment token type IDs (all zeros).
        """
        input_ids_all: List[List[int]] = []
        attention_masks_all: List[List[int]] = []
        token_type_ids_all: List[List[int]] = []
        abstract_ids_all: List[int] = []
        abstract_texts_all: List[str] = []

        for idx, abstract in enumerate(abstracts):
            sentences = sent_tokenize(abstract)
            token_ids: List[int] = [self.tokenizer.cls_token_id]

            # Encode each sentence and add [SEP] token
            for sent in sentences:
                sent_ids = self.tokenizer.encode(sent, add_special_tokens=False)
                token_ids.extend(sent_ids + [self.tokenizer.sep_token_id])

            
            chunk = token_ids

            # Ensure CLS and SEP tokens
            if chunk[0] != self.tokenizer.cls_token_id:
                    chunk = [self.tokenizer.cls_token_id] + chunk[:len(chunk)-1]
            if chunk[-1] != self.tokenizer.sep_token_id:
                    chunk[-1] = self.tokenizer.sep_token_id


            pad_len = self.max_length - len(chunk)

            chunk_padded = chunk + [self.tokenizer.pad_token_id] * pad_len
            attention_mask = [1] * len(chunk) + [0] * pad_len

            input_ids_all.append(chunk_padded)
            attention_masks_all.append(attention_mask)
                

        return {
            "input_ids": torch.tensor(input_ids_all, dtype=torch.long),
            "attention_mask": torch.tensor(attention_masks_all, dtype=torch.long),
        }


In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")

prep = TokenizerSegmenter(tokenizer,max_length = 300)

prep_100k = prep.process(df_100k["abstract_clean"])
prep_10k = prep.process(df_10k["abstract_clean"])
prep_50k = prep.process(df_50k["abstract_clean"])
prep_10k_val = prep.process(df_10k_val["abstract_clean"])
prep_remaining = prep.process(df_remaining["abstract_clean"])

In [7]:
df_100k = pd.DataFrame({
    "abstract_clean": df_100k["abstract_clean"],
    "input_ids": prep_100k["input_ids"].tolist(),
    "attention_mask": prep_100k["attention_mask"].tolist()
})

df_10k = pd.DataFrame({
    "abstract_clean": df_10k["abstract_clean"],
    "input_ids": prep_10k["input_ids"].tolist(),
    "attention_mask": prep_10k["attention_mask"].tolist()
})

df_50k = pd.DataFrame({
    "abstract_clean": df_50k["abstract_clean"],
    "input_ids": prep_50k["input_ids"].tolist(),
    "attention_mask": prep_50k["attention_mask"].tolist()
    #"token_type_ids": prep_50k["token_type_ids"].tolist()
})

df_10k_val = pd.DataFrame({
    "abstract_clean": df_10k_val["abstract_clean"],
    "input_ids": prep_10k_val["input_ids"].tolist(),
    "attention_mask": prep_10k_val["attention_mask"].tolist()
})

df_remaining = pd.DataFrame({
    "abstract_clean": df_remaining["abstract_clean"],
    "primary_category": df_remaining["primary_category"],
    "input_ids": prep_remaining["input_ids"].tolist(),
    "attention_mask": prep_remaining["attention_mask"].tolist()
})

In [8]:
# Save datasets to CSV files with headers
df_100k.to_csv('../Datasets/dataset_100k.csv', header=True, index = False)
df_10k.to_csv('../Datasets/dataset_10k.csv', header=True, index = False)
df_50k.to_csv('../Datasets/dataset_50k.csv', header=True, index = False)
df_10k_val.to_csv('../Datasets/dataset_10k_val.csv', header=True, index = False)
df_remaining.to_csv('../Datasets/dataset_remaining.csv', header=True, index = False)