In [None]:
import pandas as pd

# Path of the Excel workbook to load (placeholder value).
filename = "filename"

# The first spreadsheet column becomes the index; the 'data' and 'summary'
# columns are renamed to the 'source' / 'target' names read by the training cell.
df = (
    pd.read_excel(filename, index_col=0)
    .rename(columns={'data': 'source', 'summary': 'target'})
)
len(df)

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example sequence
            (every value is indexed with the example index).
        labels: Mapping that must contain an ``'input_ids'`` entry holding
            one token-id sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label ids."""
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field in ``encodings`` is converted to a tensor, and the
        target token ids are added under the ``'labels'`` key.
        """
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        example['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return example

def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """Tokenize the text/summary splits and wrap them in PegasusDataset objects.

    Args:
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        train_texts, train_labels: Training inputs and their targets.
        val_texts, val_labels: Optional validation split; a dataset is built
            only when both are given.
        test_texts, test_labels: Optional test split; a dataset is built
            only when both are given.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, tokenizer)``. The
        validation / test entries are ``None`` when that split was not
        fully supplied.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def build_dataset(sources, summaries):
        # Inputs are truncated to 512 tokens, targets to 256.
        # NOTE(review): the padded target ids are used as labels as-is; pad
        # positions are not masked here -- confirm this is intended.
        source_enc = tokenizer(sources, truncation=True, padding=True, max_length=512)
        summary_enc = tokenizer(summaries, truncation=True, padding=True, max_length=256)
        return PegasusDataset(source_enc, summary_enc)

    has_val = val_texts is not None and val_labels is not None
    has_test = test_texts is not None and test_labels is not None

    train_dataset = build_dataset(train_texts, train_labels)

    val_dataset = None
    if has_val:
        val_dataset = build_dataset(val_texts, val_labels)

    test_dataset = None
    if has_test:
        test_dataset = build_dataset(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset, tokenizer

def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """Build a ``Trainer`` for fine-tuning a Pegasus model.

    Args:
        model_name: Name or path passed to
            ``PegasusForConditionalGeneration.from_pretrained``.
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Optional evaluation dataset. When given, evaluation
            runs every 100 steps; when ``None``, no evaluation is configured.
        freeze_encoder: When true, the encoder's parameters are excluded
            from gradient updates.
        output_dir: Directory for checkpoints written by the ``Trainer``.

    Returns:
        A configured ``Trainer``; the caller is responsible for calling
        ``train()`` on it.
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    # Previously this flag was accepted but ignored.
    if freeze_encoder:
        for param in model.get_encoder().parameters():
            param.requires_grad = False

    # Settings shared by both the with-validation and training-only setups.
    training_kwargs = dict(
        output_dir=output_dir,
        num_train_epochs=2,
        per_device_train_batch_size=1,
        save_steps=500,
        save_total_limit=5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
    )

    # Evaluation settings only make sense when there is something to evaluate on.
    if val_dataset is not None:
        training_kwargs.update(
            per_device_eval_batch_size=1,
            evaluation_strategy='steps',
            eval_steps=100,
        )

    training_args = TrainingArguments(**training_kwargs)

    # The trainer is always built, so a call without a validation set no longer
    # fails with an unbound ``trainer`` at the return statement.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    return trainer


In [None]:
# Checkpoint to fine-tune; the same name is used to load the tokenizer.
model_name = 'nsi319/legal-pegasus'

# Every row of the frame is used for training; no validation or test texts
# are passed, so those two returned datasets are discarded.
train_texts = list(df['source'])
train_labels = list(df['target'])

train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)

trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()

In [None]:
import os
# Directory the fine-tuned weights are written to (spelling kept as-is so the
# on-disk location does not change).
output_model_dir = './ouput_model/'

# exist_ok=True replaces the separate existence check, so re-running the cell
# is safe and there is no gap between checking and creating.
os.makedirs(output_model_dir, exist_ok=True)

# Only the model is saved here; the tokenizer is not written to this directory.
trainer.model.save_pretrained(output_model_dir)