In [8]:
from datasets import load_dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import evaluate
import torch
import evaluate
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run configuration: checkpoint to fine-tune, where results go, and the
# line-aligned source/target text files.
model_name = 'google/pegasus-large'
output_dir = './results'
text_path, label_path = 'texts.txt', 'labels.txt'

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

In [12]:
# Fetch the pretrained seq2seq checkpoint named by `model_name`, together
# with the tokenizer published under the same name. The two loads are
# independent of each other.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file, one list entry per line.

    The content is split on the newline character only, so interior blank
    lines are kept and line numbers stay aligned between files. The single
    empty entry produced when the file ends with a newline is dropped so it
    does not turn into an empty training example.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: The file's lines without their line terminators.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines


train_texts = _read_lines(text_path)
train_labels = _read_lines(label_path)

# Inputs and targets are paired by line number, so both files must contain
# the same number of lines; fail early instead of mis-pairing examples.
if len(train_texts) != len(train_labels):
    raise ValueError(
        f'{text_path} has {len(train_texts)} lines but '
        f'{label_path} has {len(train_labels)} lines'
    )

In [5]:
# Sanity check: display one loaded source document (index 106).
train_texts[106]

'The wooden building is at Abersoch on the Llŷn Peninsula in Gwynedd. Measuring just 13ft by 9ft, it has no electricity or water - and you are banned from sleeping in it overnight. For the same price just a few miles away you could snap-up a two-bedroom house in the village of Llanbedrog - or even a seven-bedroom terraced house at Tywyn across Cardigan Bay. <mask_1> "It\'s quite incredible. We had two very determined bidders, both from the Cheshire area, who were bidding separately. They were very determined to buy it." The auctioneers said the hut is "in need of some TLC" - but does include part of the beach in front of the hut into the sea. The previous record for the beach huts on the Abersoch sands was £70,000 in 2008.'

In [6]:
# Sanity check: display the target text loaded for the same sample
# (index 106). The targets are stored in `train_labels`.
train_labels[106]

'"It\'s certainly the highest price ever achieved for a beach hut in Abersoch," remarked Tony Webber, auction surveyor at Beresford Adams Countrywide Auctions. '

In [9]:
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from feature name to per-example values (lists
            or tensors indexed by example). Must contain ``'input_ids'``,
            which also determines the dataset length.
        labels: Mapping whose ``'input_ids'`` entry holds the target token
            ids, indexed by example in the same order as ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        Existing tensors are copied with ``clone().detach()``; calling
        ``torch.tensor`` on a tensor works but emits a UserWarning for
        every item fetched. Anything else goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with targets under ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``encodings['input_ids']``)."""
        # Key access (not attribute access) so plain dicts work as well as
        # tokenizer outputs, matching how __getitem__ reads the mapping.
        return len(self.encodings['input_ids'])

In [13]:
# Tokenize sources and targets into padded, truncated PyTorch tensors.
inputs = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)
labels = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True)

# Replace padding positions in the target ids with -100, the label index
# the model's loss ignores, so training is not spent on predicting padding.
labels['input_ids'] = labels['input_ids'].masked_fill(
    labels['input_ids'] == tokenizer.pad_token_id, -100
)

dataset = PegasusDataset(inputs, labels)

In [None]:
# Move the model onto the device chosen in `torch_device`
# ('cuda' when available, otherwise 'cpu').
model = model.to(torch_device)

In [None]:
# Training hyperparameters and output/logging locations.
training_args = TrainingArguments(
    output_dir=output_dir,            # where checkpoints are written
    num_train_epochs=30,              # passes over the training set
    per_device_train_batch_size=4,    # examples per device per step
    save_total_limit=5,               # cap on the number of kept checkpoints
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                 # log every 10 optimizer steps
)

# The tokenized examples are built under the name `dataset`, so that is the
# object handed to the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)