#### Import required packages

In [1]:
import os
import pandas as pd
from datasets import Dataset
from transformers import PegasusTokenizer

#### Define Paths

In [2]:
# Build every path with os.path.join so the notebook also runs on Linux/macOS,
# where a backslash is an ordinary filename character rather than a separator.
# On Windows the resulting strings are identical to the previous literals.
PROCESSED_DIR = os.path.join('..', '..', 'Data', 'Processed')

PROCESSED_TRAIN_PATH = os.path.join(PROCESSED_DIR, 'english_train_cleaned.csv')
PROCESSED_TEST_PATH = os.path.join(PROCESSED_DIR, 'english_test_cleaned.csv')
TOKENIZED_OUT_DIR = os.path.join(PROCESSED_DIR, 'tokenized_pegasus_english')

#### Load Cleaned Data

In [3]:

# Read the cleaned train CSV first, then the test CSV, each into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (PROCESSED_TRAIN_PATH, PROCESSED_TEST_PATH)
)

# Report (rows, columns) for both splits as a quick sanity check.
print(f"Loaded train: {train_df.shape} | test: {test_df.shape}")

Loaded train: (28225, 2) | test: (2889, 2)


#### Initialize PEGASUS Tokenizer and Define the Tokenization Function

In [None]:

# Load the tokenizer that belongs to the "google/pegasus-xsum" checkpoint.
tokenizer = PegasusTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/pegasus-xsum",
)

# Define Tokenization Function
def preprocess_function(batch, max_input_length=512, max_target_length=64):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Mapping with "Article" and "Summary" keys, each holding a list
            of strings (what ``Dataset.map(..., batched=True)`` passes in).
        max_input_length: Token budget per article. Longer articles are
            truncated and shorter ones padded to exactly this length.
        max_target_length: Token budget per summary, applied the same way.

    Returns:
        The tokenizer output for the articles, with an added "labels" entry
        holding the token ids of the summaries.
    """
    model_inputs = tokenizer(
        batch["Article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["Summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): because of padding="max_length" the labels still contain
    # pad token ids. Presumably the training code masks them (e.g. with -100)
    # so they do not contribute to the loss -- confirm.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#### Convert to Hugging Face Dataset

In [5]:
# Wrap each DataFrame (train first, then test) in a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

#### Tokenize (Batched)

In [None]:
# Options shared by both splits: feed the function whole batches and drop the
# raw text columns from the result.
tokenize_options = dict(batched=True, remove_columns=["Article", "Summary"])

print("Tokenizing train set...")
tokenized_train = train_dataset.map(preprocess_function, **tokenize_options)

print("Tokenizing test set...")
tokenized_test = test_dataset.map(preprocess_function, **tokenize_options)

print("Tokenization complete!")
# List the column names of the tokenized train split.
print(f"Sample keys: {list(tokenized_train.features.keys())}")

#### Save Tokenized Datasets

In [None]:
# Make sure the output directory exists, then write each split to its own
# sub-directory ("train" first, then "test").
os.makedirs(TOKENIZED_OUT_DIR, exist_ok=True)
for split_name, split_dataset in (("train", tokenized_train), ("test", tokenized_test)):
    split_dataset.save_to_disk(os.path.join(TOKENIZED_OUT_DIR, split_name))