In [None]:
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
#Importing the libraries

from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
# Hub identifier of the checkpoint that is loaded below.
model_ckpt = "google/pegasus-cnn_dailymail"

# Pre-trained summarization model: load its tokenizer and its seq2seq weights.

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# .to(device) moves the loaded model onto the device selected in the earlier cell.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
# Download & unzip data

!wget https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
!unzip summarizer-data.zip

# SAMSum dataset containing about 16k dialogues and their summaries

In [None]:
# Load the dataset saved under the relative directory 'samsum_dataset'
# (presumably extracted from summarizer-data.zip by the download cell — confirm).
dataset_samsum = load_from_disk('samsum_dataset')
dataset_samsum

# Contains train, test and validation splits

In [None]:



# Number of examples in every split, in the dataset's own split order.
split_lengths = [len(split_data) for split_data in dataset_samsum.values()]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

# Show one test example: the dialogue first, then its reference summary.
print("\nDialogue:", dataset_samsum["test"][1]["dialogue"], sep="\n")
print("\nSummary:", dataset_samsum["test"][1]["summary"], sep="\n")
     

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into training features.

    Uses the module-level `tokenizer`.

    Args:
        example_batch: Mapping with a 'dialogue' entry and a 'summary' entry
            (the call site maps this function with `batched=True`).

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenized
        dialogues (truncated to 1024 tokens) and 'labels' holding the
        'input_ids' of the tokenized summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(
        example_batch['dialogue'], max_length=1024, truncation=True
    )

    # `text_target=` tokenizes the summaries as targets. It replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=example_batch['summary'], max_length=128, truncation=True
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }



Tokenization is performed on both the 'dialogue' and 'summary' texts.
The tokenizer converts the text sequences into numerical representations
that can be fed into a model for training or inference.

In [None]:
# batched=True: convert_examples_to_features is called with batches of examples.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

The `map` function applies `convert_examples_to_features` to the dataset (batch by batch, because `batched=True`) and returns a new dataset whose examples carry the tokenized features.

In [None]:
dataset_samsum_pt["train"]


In [None]:
# Training

from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and model; it is handed to the Trainer
# as `data_collator` in a later cell.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters. A per-device batch of 1 combined with 16
# gradient-accumulation steps gives 16 examples per optimizer update per device.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=10, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500,
    # Integer step count instead of the float 1e6; presumably set this high so
    # that no intermediate checkpoint is written during the run.
    save_steps=1_000_000,
    gradient_accumulation_steps=16
)
     

- `output_dir`: Specifies the output directory where the trained model and other outputs will be saved.

- `num_train_epochs`: Specifies the number of training epochs, i.e., the number of times the entire training dataset will be passed through the model during training.

- `warmup_steps`: Specifies the number of warmup steps. Warmup steps gradually increase the learning rate from 0 to the specified learning rate over a certain number of steps at the beginning of training.

- `per_device_train_batch_size`: Specifies the batch size for training. This argument defines the number of training examples processed together in each forward and backward pass during training. The batch size is specified per device (GPU or TPU).

- `per_device_eval_batch_size`: Specifies the batch size for evaluation. Similar to `per_device_train_batch_size`, this argument defines the batch size for evaluation examples.

- `weight_decay`: Specifies the weight decay for regularization during training. Weight decay is a technique used to prevent overfitting by adding a penalty term to the loss function based on the magnitude of the model's weights.

- `logging_steps`: Specifies the number of steps before logging training metrics. Training metrics such as loss, learning rate, and evaluation metrics will be logged at these intervals.

- `evaluation_strategy`: Specifies the evaluation strategy during training. It can take values like `"steps"` or `"epoch"`. In this case, the evaluation is performed at specified intervals based on the `eval_steps` argument.

- `eval_steps`: Specifies the number of steps before evaluation. Evaluation will be performed at these intervals during training.

- `save_steps`: Specifies the number of steps before saving the model. The model will be saved at these intervals during training.

- `gradient_accumulation_steps`: Specifies the number of steps for gradient accumulation. Gradient accumulation allows updating the model's parameters after accumulating gradients over a certain number of steps, which can be useful when the batch size is limited due to memory constraints.


In [None]:
# Bundle the model, training arguments, tokenizer, collator and the tokenized
# train/validation splits into a Trainer.
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"], 
                  eval_dataset=dataset_samsum_pt["validation"])

In [None]:
trainer.train()

In [None]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut a sequence into consecutive slices of at most batch_size items.

    The slices are yielded in order; the final one is shorter when the length
    of list_of_elements is not a multiple of batch_size.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in chunk_starts
    )



def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    """Generate summaries for `dataset` in batches and score them with `metric`.

    The texts and reference summaries are split into batch-sized chunks with
    generate_batch_sized_chunks. Each text batch is tokenized, summarized with
    `model.generate`, decoded, and added to `metric` together with the
    matching references.

    Args:
        dataset: Object indexable by column name; `dataset[column_text]` and
            `dataset[column_summary]` must be sliceable sequences of strings.
        metric: Object providing `add_batch(predictions=..., references=...)`
            and `compute()`.
        model: Model providing `generate(...)`.
        tokenizer: Tokenizer used for both encoding inputs and decoding outputs.
        batch_size: Number of examples per generation batch.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the input texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever `metric.compute()` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # A length_penalty below 1.0 tilts beam search towards shorter outputs,
        # which keeps the generated summaries from growing too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated token ids back into text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # PEGASUS writes line breaks as the literal token "<n>"; turn it into a
        # space. (Replacing the empty string instead would insert a space
        # between every character and corrupt the predictions.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score


In [None]:
# ROUGE variants that are reported in the results table.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# ROUGE is a set of evaluation metrics commonly used for assessing the quality
# of automatic summaries generated by text summarization systems. It measures
# the similarity between the generated summary and one or more reference
# summaries, based on n-gram overlap between them.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
rouge_metric = load_metric('rouge')

In [None]:
# Score the trained model on the first ten test examples only.
score = calculate_metric_on_test_ds(
    dataset_samsum['test'][0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='dialogue',
    column_summary='summary',
)

# Keep the mid-point F-measure of each ROUGE variant.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

pd.DataFrame(rouge_dict, index=['pegasus'])

In [None]:
## Save model
# Written to the relative directory "pegasus-samsum-model", the same name the
# prediction cell passes to pipeline(model=...).
model_pegasus.save_pretrained("pegasus-samsum-model")

In [None]:
## Save tokenizer
# Written to the relative directory "tokenizer".
tokenizer.save_pretrained("tokenizer")

In [None]:
# Load
# Reload from the same relative directory the tokenizer was saved to, instead of
# a hard-coded absolute Colab path that only resolves when the working
# directory is /content.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
#Prediction

# Generation settings forwarded to the pipeline call at the end of this cell.
gen_kwargs = {
    "length_penalty": 0.8,
    "num_beams": 8,
    "max_length": 128,
}

# First test example: the dialogue to summarize and its reference summary.
sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]

# Summarization pipeline built from the saved model directory and the tokenizer.
pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

print("Dialogue:", sample_text, sep="\n")
print("\nReference Summary:", reference, sep="\n")

# The header is printed before the model runs, so these stay separate calls.
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])