## 1. Library Imports


In this section, we import all the necessary libraries for data preprocessing, tokenization, model training, and evaluation.

In [1]:
import pandas as pd
import random
import torch
from transformers import AutoTokenizer
import nltk
from nltk import sent_tokenize

# Download punkt for sentence tokenization
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# 2. Dataset Loading

In this section, we load the dataset of scientific abstracts that were previously collected and cleaned

In [2]:
df = pd.read_csv("../Datasets/cleaned_dataset.csv",encoding='utf-8')

In [3]:
index_del = pd.read_csv("../Datasets/Index_del.csv")

# Dataset Creation

The following datasets have been prepared for various stages of the experimental pipeline:

- **Masked Language Modeling (MLM) Evaluation:**  
  A separate dataset of 20K abstracts, not overlapping with the DAPT datasets, used to evaluate the model's performance on the MLM task for SciBERT model.

- **Fine-Tuning:**  
  A residual dataset, composed of all remaining abstracts not used in DAPT or MLM evaluation, reserved for downstream fine-tuning tasks (e.g., classification).


In [4]:

# Parameters
DAPT_SAMPLE_SIZE = 100_000      # Total abstracts to sample for main dataset
SAMPLE_10K = 10_000             # Size of smaller subset from 100K
SAMPLE_50K = 50_000             # Size of larger subset from 100K 
VAL_SAMPLE_SIZE = 20_000        # Validation dataset size from outside the 100K
SEED = 123                     # Random seed for reproducibility

random.seed(SEED)

indeces_to_delete = index_del['indices']

# delete indices from df
df = df.drop(index=indices_to_delete, errors='ignore').reset_index()
N = len(df)  # Total number of abstracts available

# Step 1: Randomly sample 100K abstracts for the main domain-adaptive pretraining dataset
indices_100k = set(random.sample(range(N), DAPT_SAMPLE_SIZE))

# Step 2: From the 100K dataset, randomly select 10K abstracts
indices_10k = set(random.sample(indices_100k, SAMPLE_10K))

# Step 3: From the remaining 90K (100K minus 10K), select 50K abstracts
indices_50k = set(random.sample(indices_100k, SAMPLE_50K))

# Step 4: From the abstracts NOT in the 100K sample, randomly select 20K for validation using MLM
remaining_after_100k = set(range(N)) - indices_100k
indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))

# Step 5: The remaining abstracts not selected in any of the above datasets for Fine-Tuning
indices_remaining = set(range(N)) - indices_100k - indices_10k_val

# Create the five datasets using the selected indices
df_100k = df.loc[list(indices_100k)].reset_index()
df_10k = df.loc[list(indices_10k)].reset_index()
df_50k = df.loc[list(indices_50k)].reset_index()
df_10k_val = df.loc[list(indices_10k_val)].reset_index()
df_remaining = df.loc[list(indices_remaining)].reset_index()

since Python 3.9 and will be removed in a subsequent version.
  indices_10k = set(random.sample(indices_100k, SAMPLE_10K))
since Python 3.9 and will be removed in a subsequent version.
  indices_50k = set(random.sample(indices_100k, SAMPLE_50K))
since Python 3.9 and will be removed in a subsequent version.
  indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))


In [6]:
# 1. Load datasets
df = pd.read_csv("../Datasets/cleaned_dataset.csv", encoding='utf-8')
index_del = pd.read_csv("../Datasets/Index_del.csv")

# Example: take 20k validation samples from remaining_after_100k
VAL_SAMPLE_SIZE = 20_000

# Make sure 'remaining_after_100k' is defined earlier
indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))

# The remaining abstracts not selected in any of the above datasets for Fine-Tuning
indices_remaining = set(range(N)) - indices_100k - indices_10k_val
                  
df_10k_val = df.loc[list(indices_10k_val)].reset_index(drop=True)
df_remaining = df.loc[list(indices_remaining)].reset_index(drop=True)

# 2. Initialize SciBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# 3. Tokenization function
class TokenizerSegmenter:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def process(self, abstracts):
        """
        Tokenize abstracts using SciBERT tokenizer with truncation & padding.
        """
        encodings = self.tokenizer(
            list(abstracts),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return encodings



[nltk_data] Downloading package punkt to /home/carmine-
[nltk_data]     landolfi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
since Python 3.9 and will be removed in a subsequent version.
  indices_10k_val = set(random.sample(remaining_after_100k, VAL_SAMPLE_SIZE))


In [7]:
prep = TokenizerSegmenter(tokenizer, max_length=300)  
prep_10k_val = prep.process(df_10k_val["abstract_clean"])
prep_remaining = prep.process(df_remaining["abstract_clean"])

print(prep_10k_val["input_ids"].shape)
print(prep_10k_val["attention_mask"].shape)

torch.Size([20000, 300])
torch.Size([20000, 300])


In [8]:
# Create a DataFrame containing the cleaned abstracts and tokenized inputs
df_10k_val = pd.DataFrame({
    "abstract_clean": df_10k_val["abstract_clean"],
    "input_ids": prep_10k_val["input_ids"].tolist(),
    "attention_mask": prep_10k_val["attention_mask"].tolist()
})

df_remaining = pd.DataFrame({
    "abstract_clean": df_remaining["abstract_clean"],
    "primary_category": df_remaining["primary_category"],
    "input_ids": prep_remaining["input_ids"].tolist(),
    "attention_mask": prep_remaining["attention_mask"].tolist()
})

In [9]:
# Save datasets to CSV files with headers
df_10k_val.to_csv('../Datasets/dataset_10k_val_SciBERT.csv', header=True, index = False)
df_remaining.to_csv('../Datasets/dataset_remaining_SciBERT.csv', header=True, index = False)