## 1. Library Imports


In this section, we import the libraries used in this notebook: `pandas` for handling the tabular dataset and `random` for reproducible sampling.

In [1]:
import pandas as pd
import random


## 2. Dataset Loading

In this section, we load the dataset of scientific abstracts that was previously collected and cleaned.

In [2]:
# Read the collected abstracts CSV (UTF-8 encoded) into a pandas DataFrame.
df = pd.read_csv(
    "../Datasets/cs_papers_api.csv",
    encoding="utf-8",
)

## 3. Dataset Creation

The following datasets have been prepared for various stages of the experimental pipeline:

- **Domain-Adaptive Pretraining (DAPT):**  
  Creation of three datasets consisting of:
  - 10K abstracts  
  - 50K abstracts  
  - 100K abstracts  

- **Masked Language Modeling (MLM) Evaluation:**  
  A separate dataset of 20K abstracts, not overlapping with the DAPT datasets, used to evaluate the model's performance on the MLM task.

- **Fine-Tuning:**  
  A residual dataset, composed of all remaining abstracts not used in DAPT or MLM evaluation, reserved for downstream fine-tuning tasks (e.g., classification).


In [None]:

# Parameters
DAPT_SAMPLE_SIZE = 100_000      # Total abstracts to sample for the main DAPT dataset
SAMPLE_10K = 10_000             # Size of the smaller subset drawn from the 100K
SAMPLE_50K = 50_000             # Size of the larger subset drawn from the 100K
VAL_SAMPLE_SIZE = 20_000        # MLM validation set size, drawn from outside the 100K
SEED = 123                      # Random seed for reproducibility

random.seed(SEED)

N = len(df)  # Total number of abstracts available

# Fail early with an explicit message instead of a generic sampling error.
if N < DAPT_SAMPLE_SIZE + VAL_SAMPLE_SIZE:
    raise ValueError(
        f"Dataset has {N} rows, but at least "
        f"{DAPT_SAMPLE_SIZE + VAL_SAMPLE_SIZE} are required for the DAPT and validation splits."
    )
if SAMPLE_10K + SAMPLE_50K > DAPT_SAMPLE_SIZE:
    raise ValueError("The 10K and 50K subsets together must fit inside the DAPT sample.")

# NOTE: random.sample() no longer accepts a set (TypeError on Python >= 3.11), and
# set iteration order is an implementation detail. Every population is therefore
# converted to a sorted list so the draw depends only on SEED.

# Step 1: Randomly sample 100K abstracts for the main domain-adaptive pretraining dataset
indices_100k = set(random.sample(range(N), DAPT_SAMPLE_SIZE))

# Step 2: From the 100K dataset, randomly select 10K abstracts
indices_10k = set(random.sample(sorted(indices_100k), SAMPLE_10K))

# Step 3: From the remaining 90K (100K minus 10K), select 50K abstracts,
# so that the 10K and 50K subsets do not share any abstract
indices_50k = set(random.sample(sorted(indices_100k - indices_10k), SAMPLE_50K))

# Step 4: From the abstracts NOT in the 100K sample, randomly select 20K for validation using MLM
remaining_after_100k = set(range(N)) - indices_100k
indices_20k_val = set(random.sample(sorted(remaining_after_100k), VAL_SAMPLE_SIZE))

# Step 5: The remaining abstracts not selected in any of the above datasets for Fine-Tuning
indices_remaining = remaining_after_100k - indices_20k_val

# Sanity checks on the split invariants
assert indices_10k <= indices_100k and indices_50k <= indices_100k
assert indices_10k.isdisjoint(indices_50k)
assert indices_20k_val.isdisjoint(indices_100k)
assert len(indices_100k) + len(indices_20k_val) + len(indices_remaining) == N

# Create the five datasets using the selected indices.
# The indices are row positions in range(N), hence positional selection with .iloc;
# sorting keeps the rows in their original file order.
df_100k = df.iloc[sorted(indices_100k)]
df_10k = df.iloc[sorted(indices_10k)]
df_50k = df.iloc[sorted(indices_50k)]
df_20k_val = df.iloc[sorted(indices_20k_val)]
df_remaining = df.iloc[sorted(indices_remaining)]

# Save datasets to CSV files with headers (index=False keeps the pandas row index out of the files)
output_frames = {
    '../Datasets/dataset_100k.csv': df_100k,
    '../Datasets/dataset_10k.csv': df_10k,
    '../Datasets/dataset_50k.csv': df_50k,
    '../Datasets/dataset_20k_val.csv': df_20k_val,
    '../Datasets/dataset_remaining.csv': df_remaining,
}
for output_path, frame in output_frames.items():
    frame.to_csv(output_path, header=True, index=False)

    