In [8]:
from datasets import load_dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import evaluate
import torch
import evaluate
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Checkpoint to start from, and the directory handed to the trainer as its output location.
model_name = 'google/pegasus-large'
output_dir = './results'

# Plain-text files that the training texts and their labels are read from.
text_path = 'texts.txt'
label_path = 'labels.txt'

In [12]:
# Load the pretrained weights and the matching tokenizer for the checkpoint
# named in `model_name`; the two loads are independent of each other.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Rebind `model` to the 'google/pegasus-xsum' checkpoint.
model = PegasusForConditionalGeneration.from_pretrained(
    'google/pegasus-xsum'
)

Downloading config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 20.2kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 2.28G/2.28G [03:42<00:00, 10.2MB/s]
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading generation_config.json: 100%|██████████| 259/259 [00:00<?, ?B/s] 


In [16]:
# Rebind `tokenizer` to the tokenizer shipped with 'google/pegasus-xsum'.
tokenizer = PegasusTokenizer.from_pretrained(
    'google/pegasus-xsum'
)

Downloading tokenizer_config.json: 100%|██████████| 87.0/87.0 [00:00<?, ?B/s]
Downloading spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 5.57MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 8.17kB/s]
Downloading tokenizer.json: 100%|██████████| 3.52M/3.52M [00:00<00:00, 6.13MB/s]


In [66]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the same checkpoint through the Auto* factories; the model and the
# tokenizer are fetched independently of each other.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/pegasus-xsum"
)
tokenizer = AutoTokenizer.from_pretrained(
    "google/pegasus-xsum"
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Load the XSum dataset (all splits).
dataset = load_dataset('xsum')

# Take the first 1000 training rows *before* picking columns. Slicing rows
# returns a dict of column lists limited to those rows, whereas selecting a
# column first (`dataset['train']['document']`) can materialize every document
# in the split just to keep 1000 of them.
train_head = dataset['train'][:1000]
train_texts, train_labels = train_head['document'], train_head['summary']

In [67]:
# Tokenize one article, truncated to at most 512 tokens, as a PyTorch batch of one.
inputs = tokenizer(
    train_texts[200],
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

# Beam search (4 beams) for a summary with a generation max_length of 50.
summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=4,
    max_length=50,
)

# Decode back to text; as the last expression, this is what the cell displays.
tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

['Plans to give English MPs a veto over laws affecting England would be "irresponsible", Ed Miliband has warned.']

In [58]:
# Re-tokenize the same article with a larger truncation limit (1024 tokens).
inputs = tokenizer(
    train_texts[200],
    max_length=1024,
    truncation=True,
    return_tensors='pt',
)

In [62]:
# Show the dimensions of the token-id tensor for the current `inputs`.
inputs['input_ids'].shape

torch.Size([1, 1024])

In [55]:
# Display the raw source text at index 200, the same example that is passed to
# the tokenizer in the cells that build `inputs`.
train_texts[200]

'In a Commons debate, the ex-Labour leader said Conservative plans to limit the voting powers of Scottish MPs on Commons laws would "rip up" hundreds of years of parliamentary procedure.\nThe SNP said the Conservatives wanted to create a "quasi-English Parliament".\nBut ministers said it was vital England was treated fairly as further powers were devolved to other parts of the UK.\nAt the end of the debate, Labour staged and won a vote in which the government abstained. And Conservative MP David Davis raised a point of order to urge the government to allow more time for the matter to be considered.\nThe government believes bills applying exclusively to England should not become law without the explicit consent of MPs from English constituencies and it wants to change Commons rules known as standing orders to give them a "decisive say" during their passage.\nMinisters say this will address the longstanding anomaly by which Scottish MPs can vote on issues such as health and education aff

In [44]:
# Inspect the token ids stored under 'input_ids' in the current `inputs`.
inputs["input_ids"]

tensor([[15458,  2893,   115,  ...,   111,   109,     1]])

In [10]:
# Read one example per line. Iterating the file and stripping only the line
# terminator keeps interior blank lines (so texts and labels stay aligned by
# position) but, unlike `read().split('\n')`, does not append a spurious empty
# final example when the file ends with a newline.
with open(text_path, 'r', encoding='utf-8') as fp:
    train_texts = [line.rstrip('\n') for line in fp]

with open(label_path, 'r', encoding='utf-8') as fp:
    train_labels = [line.rstrip('\n') for line in fp]

# Each text is paired with the label at the same index, so the counts must
# agree; fail here with a clear message rather than later during training.
if len(train_texts) != len(train_labels):
    raise ValueError(
        f'{text_path} has {len(train_texts)} lines but {label_path} has '
        f'{len(train_labels)}; expected one label per text.'
    )

In [9]:
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example sequence; it must
            contain an ``'input_ids'`` entry, which defines the dataset length.
            Values may be tensors or nested lists.
        labels: Mapping whose ``'input_ids'`` entry holds the tokenized
            targets, indexed the same way as ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share storage with it.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call, so tensors are copied with ``detach().clone()`` and
        only non-tensor values go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` encoding."""
        # Key access works for plain dicts as well as for objects that also
        # expose `.input_ids` as an attribute.
        return len(self.encodings['input_ids'])

In [13]:
# Tokenize sources and targets into padded, truncated PyTorch tensors.
inputs = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)
labels = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True)

# Replace padding positions in the targets with -100, the index the model's
# loss ignores, so padded positions do not contribute to the training loss.
labels['input_ids'] = labels['input_ids'].masked_fill(
    labels['input_ids'] == tokenizer.pad_token_id, -100
)

# Wrap both encodings so each example can be served as a dict of tensors.
dataset = PegasusDataset(inputs, labels)

In [None]:
# Move the model to the device selected in `torch_device` ('cuda' or 'cpu').
model = model.to(torch_device)

In [None]:
# Number of training epochs; passed to TrainingArguments as `num_train_epochs`.
num_epochs = 450

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=4,
    save_total_limit=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# The tokenized training set is bound to `dataset` (the PegasusDataset built
# from `inputs` and `labels`); no `train_dataset` variable exists in the visible
# cells, so passing that name would raise NameError.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)