In [8]:
from datasets import load_dataset
from transformers import PegasusXForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
import torch
import numpy as np
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

# Checkpoint identifier passed to `from_pretrained`.
model_name = "google/pegasus-x-base"

# Input files with the source texts and their target summaries.
text_path = "data/texts.txt"
label_path = "data/labels.txt"

# Where training results are written, and how many epochs to train for.
output_dir = "./results"
num_epochs = 450

In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed the NumPy, ``random`` and PyTorch (CPU and CUDA) generators.

    Besides seeding, this puts cuDNN into deterministic mode with
    benchmarking turned off, stores ``seed`` in the ``PYTHONHASHSEED``
    environment variable and prints a confirmation line.

    Args:
        seed: Value handed to every seeding call. Defaults to 42.
    """
    # Seed each random number generator in turn with the same value.
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # cuDNN needs both switches for repeatable results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this presumably only matters for processes launched afterwards - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

    print(f"Random seed set as {seed}")

In [66]:
# Load the sequence-to-sequence model and its matching tokenizer from the
# checkpoint named by `model_name`; the two loads do not depend on each other.
model = PegasusXForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Load XSum and keep the first 1000 training examples.
dataset = load_dataset('xsum')

# Slice the rows first and pick the columns afterwards: indexing a column
# first (dataset['train']['document']) reads that whole column before the
# [:1000] slice is applied, whereas a row slice only reads the requested rows.
train_subset = dataset['train'][:1000]
train_texts, train_labels = train_subset['document'], train_subset['summary']

In [10]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without line endings.

    Iterating the file (rather than splitting the whole content on newlines)
    means a newline at the very end of the file does not produce a spurious
    empty last entry; empty lines inside the file are kept.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return [line.rstrip('\n') for line in fp]


# One example per line. These assignments replace any existing
# train_texts / train_labels.
train_texts = _read_lines(text_path)
train_labels = _read_lines(label_path)

# Texts and labels are paired by position, so the files must have equally
# many lines; fail here instead of training on misaligned pairs.
if len(train_texts) != len(train_labels):
    raise ValueError(
        f"{text_path} has {len(train_texts)} lines but "
        f"{label_path} has {len(train_labels)} lines"
    )

In [9]:
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values (tensor or nested list); must contain
            ``'input_ids'``.
        labels: Mapping whose ``'input_ids'`` entry holds the target token
            ids, indexed the same way as ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
        # clone().detach() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, targets under ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of examples (entries under ``'input_ids'``)."""
        # Key access works for plain dicts as well as for objects that also
        # expose the field as an attribute.
        return len(self.encodings['input_ids'])

In [13]:
# Tokenize the source texts and the target summaries as padded, truncated
# PyTorch tensors.
inputs = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)
labels = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True)

# Replace padding positions in the targets with -100 so that they are ignored
# by the loss; otherwise the model is also trained to predict padding.
# (padding=True already requires the tokenizer to define a pad token.)
labels['input_ids'] = labels['input_ids'].masked_fill(
    labels['input_ids'] == tokenizer.pad_token_id, -100
)

dataset = PegasusDataset(inputs, labels)

In [None]:
# Place the model's parameters on the device selected in `torch_device`.
model = model.to(device=torch_device)

In [None]:
# Settings for the fine-tuning run.
training_args = TrainingArguments(
    # Optimisation.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    # Checkpointing: written under `output_dir`, limited to two saved checkpoints.
    output_dir=output_dir,
    save_total_limit=2,
    # Logging: every 100 steps, into ./logs.
    logging_dir='./logs',
    logging_steps=100,
)

# Bundle the model, the arguments, the training data and the tokenizer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
)

In [None]:
# Run training with the configured Trainer. This is the last expression of
# the cell, so its return value is what the notebook displays.
trainer.train()