In [None]:
9!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:0

### Importing necessary libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from datasets import load_dataset, load_from_disk
from tqdm.auto import tqdm
from transformers import pipeline, set_seed


In [None]:
# Device-agnostic setup: run on the GPU when PyTorch can see one, else on the CPU.
import torch

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

### Using Pegasus for Sequence-to-Sequence Tasks
This section uses the **Pegasus** model for sequence-to-sequence tasks such as text summarization. We will load the necessary model and tokenizer, and prepare the model for inference.




In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the seq2seq model.
model = "google/pegasus-cnn-dailymail"

# Load the matching tokenizer first, then the model weights.
tokenizer = AutoTokenizer.from_pretrained(model)
model_pegas = AutoModelForSeq2SeqLM.from_pretrained(model)

# Place the model on the device selected earlier in the notebook.
model_pegas = model_pegas.to(device)

### Loading and tokenizing the dataset

In [None]:
# Load the "samsum" dataset by name; later cells index the result by split
# ('train', 'validation', 'test').
samsum_ds = load_dataset("samsum")

In [None]:
# Display the loaded dataset object.
samsum_ds

In [None]:
# Show one training dialogue. The column is named 'dialogue' (as used by the
# tokenization step); indexing the row first avoids reading the whole column.
samsum_ds['train'][1]['dialogue']

In [None]:
# Preview a handful of reference summaries instead of dumping the entire column
# into the notebook output.
samsum_ds['train'][:5]['summary']

In [None]:
# Number of examples in each split, in the order the splits are stored.
split_ds = [len(subset) for subset in samsum_ds.values()]
print(f"Split lengths: {split_ds}")
print(f"Features: {samsum_ds['train'].column_names}")

# Print one test example: first the dialogue, then its reference summary.
for heading, column in (("\nDialogue:", "dialogue"), ("\nSummary:", "summary")):
    print(heading)
    print(samsum_ds["test"][1][column])

In [None]:
def convert_to_features(batch):
    """Tokenize a batch of dialogues and summaries into model features.

    Args:
        batch: Mapping with a 'dialogue' list (source texts) and a 'summary'
            list (target texts), as passed by a batched ``map`` call.

    Returns:
        A dict holding the tokenizer's outputs for the dialogues plus a
        "labels" entry containing the token ids of the summaries.

    Sequences are truncated (1024 source tokens, 128 target tokens) but NOT
    padded here. Padding to a fixed length at this stage would fill "labels"
    with the pad token id, which the loss would then treat as real targets.
    Padding is left to the data collator used at batch time.
    """
    input_encodings = tokenizer(batch['dialogue'], truncation=True, max_length=1024)

    # `text_target` tokenizes the summaries as targets; it supersedes the
    # deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=batch['summary'], truncation=True, max_length=128)

    return {**input_encodings, "labels": target_encodings["input_ids"]}

In [None]:
# Apply convert_to_features to every split; batched=True hands the function
# batches of examples rather than single rows.
samsum_ds_pt = samsum_ds.map(convert_to_features, batched = True)

In [None]:
# Display the tokenized training split.
samsum_ds_pt["train"]

In [None]:
# Token ids of the second training example (row selected first, then the column).
samsum_ds_pt["train"][1]["input_ids"]

In [None]:
# Attention mask of the second training example (row selected first, then the column).
samsum_ds_pt["train"][1]["attention_mask"]

### Padding data with `DataCollatorForSeq2Seq`

In [None]:
# Build the data collator that is handed to the Trainer (as `data_collator`)
# to assemble batches for the Pegasus model.
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegas)

### Training the model

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for one fine-tuning epoch. With a per-device batch size of 1
# and 16 gradient-accumulation steps, each optimizer update sees 16 examples.
trainer_args = TrainingArguments(
    output_dir = 'pegasus-samsum',
    num_train_epochs = 1,
    warmup_steps = 500,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    weight_decay = 0.01,
    logging_steps = 10,
    # NOTE(review): recent transformers releases spell this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy = 'steps',
    eval_steps = 500,
    # Step counts are integers, so pass an int rather than the float 1e6.
    # The value is far larger than one epoch, so presumably it is meant to
    # suppress intermediate checkpoints.
    save_steps = 1_000_000,
    gradient_accumulation_steps = 16,
)

In [None]:
# Fine-tune on the training split. The test split is used for evaluation later
# in the notebook, so training on it would leak the evaluation data.
# For a quick smoke test, train on a small slice of the training split instead.
trainer = Trainer(model = model_pegas,
                  args = trainer_args,
                  tokenizer = tokenizer,
                  data_collator = seq2seq_data_collator,
                  train_dataset = samsum_ds_pt["train"],
                  eval_dataset = samsum_ds_pt["validation"])

In [None]:
# Run the fine-tuning loop configured by `trainer_args`.
trainer.train()

In [None]:
def generate_batch_size_chunks(input_text,batch_size):
    """Yield consecutive slices of `input_text`, each at most `batch_size` long.

    The final slice is shorter when the length of `input_text` is not a
    multiple of `batch_size`; an empty input yields nothing.
    """
    offsets = range(0, len(input_text), batch_size)
    yield from (input_text[lo : lo + batch_size] for lo in offsets)

### Evaluating the model

In [None]:
def calculate_metric_test(dataset,
                          metric,
                          model,
                          tokenizer,
                          device = device,
                          batch_size = 16,
                          column_text = 'article',
                          column_summary = 'highlights'):
    """Generate summaries for `dataset` in batches and score them with `metric`.

    Args:
        dataset: Mapping from column name to a sequence of strings.
        metric: Object exposing `add_batch(predictions=..., references=...)`
            and `compute()`.
        model: Model whose `generate` method produces summary token ids.
        tokenizer: Tokenizer used to encode the source texts and decode the
            generated ids.
        device: Device the encoded inputs are moved to before generation.
        batch_size: Number of examples summarized per `generate` call.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever `metric.compute()` returns after all batches were added.
    """
    article_batches = list(generate_batch_size_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_size_chunks(dataset[column_summary], batch_size))

    # Walk source and reference batches in lockstep. The loop variables must
    # not reuse the names of the lists they iterate over.
    for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total = len(article_batches)):

        inputs = tokenizer(article_batch,
                           max_length = 1024,
                           truncation = True,
                           padding = 'max_length',
                           return_tensors = 'pt')

        # Beam search; each generated summary is capped at 128 tokens.
        summaries = model.generate(input_ids = inputs['input_ids'].to(device),
                                   attention_mask = inputs['attention_mask'].to(device),
                                   length_penalty = 0.8,
                                   num_beams = 8,
                                   max_length = 128)

        decoded_summaries = [tokenizer.decode(s,
                                              skip_special_tokens = True,
                                              clean_up_tokenization_spaces = True)
                             for s in summaries]

        # Pegasus marks line breaks with the literal "<n>" token; turn it into
        # a space so it does not count as a word when scoring.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions = decoded_summaries,
                         references = target_batch)

    return metric.compute()


In [None]:
# `load_metric` comes from the already-installed `datasets` package; it was
# never imported in this notebook.
# NOTE(review): `load_metric` was deprecated and later removed from `datasets`
# (3.0 — TODO confirm). On such versions the `evaluate` package's
# `load("rouge")` is the replacement.
from datasets import load_metric

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

In [None]:
# Score the fine-tuned model on the first ten test examples.
# The helper is defined as `calculate_metric_test` and the dataset is bound to
# `samsum_ds` in this notebook.
score = calculate_metric_test(
    samsum_ds['test'][0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size = 2,
    column_text = 'dialogue',
    column_summary = 'summary',
)
score

### Saving the model

In [None]:
# Persist the fine-tuned model; the model object is bound to `model_pegas`
# in this notebook.
model_pegas.save_pretrained("pegasus-samsum-model")

In [None]:
# Save the tokenizer files to the local "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

In [None]:
# Reload the tokenizer from the same relative directory it was saved to, so the
# cell does not depend on the working directory being /content.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
# Prediction: summarize one test dialogue with the saved model and compare the
# result with the reference summary.
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

# The dataset is bound to `samsum_ds` in this notebook.
sample_text = samsum_ds["test"][0]["dialogue"]

reference = samsum_ds["test"][0]["summary"]

pipe = pipeline("summarization", model="pegasus-samsum-model",tokenizer=tokenizer)

print("Dialogue:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])