In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr 

In [None]:
from transformers import pipeline, set_seed
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from tqdm import tqdm


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
model_link = "google/pegasus-cnn_dailymail"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_link)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_link).to(device)

In [None]:
dataset_samsum = load_dataset("Samsung/samsum")

In [8]:
def generate_batch_size(data, batch_size):
    """
    split dataset into smaller batches
    """
    
    for i in range(0, len(data), batch_size):
        yield data[i: i+batch_size]

In [None]:
rouge_name = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_metric = load_metric('rouge')

In [47]:
def calculate_metric_on_testds(dataset, metric, model, tokenizer, batch_size, device, text, target):
    """
    Calculate rouge metric on test dataset
    Args:
        dataset (Dataset): the dataset to evaluate
        metric (Metric): the metric to calculate
        model (nn.Module): the model to evaluate
        tokenizer (Tokenizer): tokenizer using for text processing
        batch_size (int): the batch size for evaluate
        device : cuda 
        text (str): name of text column in the dataset
        target (str): name of summary column in the dataset
    
    Returns:
        The calculated metric scores
    """
    
    text_batch = list(generate_batch_size(dataset[text], batch_size))
    target_batch = list(generate_batch_size(dataset[target], batch_size))
    for text, target in tqdm(zip(text_batch, target_batch), total= len(text_batch)):
        inputs = tokenizer(text, max_length= 1024, truncation= True, padding="max_length", return_tensors="pt")
        summary = model.generate(input_ids = inputs["input_ids"].to(device),
                                 attention_mask = inputs['attention_mask'].to(device),
                                 max_length = 128)
        decoded_summary = [tokenizer.decode(s, skip_special_tokens= True, clean_up_tokenization_spaces=True) for s in summary]
        decoded_summary = [d.replace("<n>", " ") for d in decoded_summary]
        
        metric.add_batch(predictions = decoded_summary, references = target)
        
    score = metric.compute()
    return score

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model= model_pegasus)

In [11]:
def data_to_vector(data_batch):
    input_encodings = tokenizer(data_batch['dialogue'], max_length=1024, truncation= True, padding = "max_length")
    target_encodings = tokenizer(data_batch['summary'], max_length = 128, truncation= True, padding = "max_length")
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels' : target_encodings['input_ids']
    }

In [None]:
data = dataset_samsum.map(data_to_vector, batched= True)

In [13]:
from transformers import TrainingArguments, Trainer

In [14]:
trainer_args = TrainingArguments(
    output_dir = 'finetune-samsum', num_train_epochs = 1, warmup_steps = 200,
    per_device_train_batch_size = 1, per_device_eval_batch_size = 1,
    weight_decay = 0.01, evaluation_strategy = 'steps', 
    eval_steps = 500, save_steps = 100000,
    logging_steps = 10,
    gradient_accumulation_steps = 16,
    #remove_unused_columns = False
)


In [None]:
trainer = Trainer(model = model_pegasus, args = trainer_args,
                 tokenizer = tokenizer, data_collator = data_collator,
                 train_dataset = data['train'],
                 eval_dataset = data['validation'])

In [None]:
trainer.train()

# The score of model on test dataset

In [None]:
score = calculate_metric_on_testds(
    dataset_samsum['test'], rouge_metric, model_pegasus, tokenizer, batch_size = 2, device = device, text = 'dialogue', target= 'summary'
)



In [49]:
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_name )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.426963,0.199139,0.339815,0.339946


# Push to huggingface

In [None]:
!pip install huggingface_hub --q

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model_pegasus.push_to_hub(...)

In [None]:
tokenizer.push_to_hub(...)