In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk

from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch

# Download the NLTK "punkt" resource; presumably the data sent_tokenize (imported above) relies on — confirm.
nltk.download("punkt")

In [None]:

# Use a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifier handed to both from_pretrained calls below.
model_ckpt = "google/pegasus-cnn_dailymail"

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Sequence-to-sequence model loaded from the same checkpoint and moved to `device`.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
!wget https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
!unzip summarizer-data.zip

In [None]:
# Load the dataset stored in the local "samsum_dataset" directory
# (presumably produced by extracting the downloaded archive — confirm).
dataset_samsum = load_from_disk("samsum_dataset")
# Bare expression: the notebook displays the loaded object.
dataset_samsum

In [None]:
# Number of records in every split, in the order the dataset yields its split names.
split_lengths = [len(dataset_samsum[name]) for name in dataset_samsum]
print(f"Split lengths: {split_lengths}")

# Column names as reported by the "train" split.
column_names = dataset_samsum.get("train").column_names
print(f"Features: {column_names}")

# Show one record of the "test" split (index 1): first its dialogue, then its summary.
test_example = dataset_samsum["test"][1]

print("\nDialogues:")
print(test_example["dialogue"])

print("\nSummary:")
print(test_example["summary"])

In [None]:
def convert_examples_to_features(example_batch, max_input_length=1024, max_target_length=128):
  """Tokenize a batch of dialogue/summary pairs into model inputs and labels.

  Relies on the module-level `tokenizer`.

  Args:
    example_batch: Mapping with a 'dialogue' entry and a 'summary' entry, each
      holding the batch of texts to tokenize.
    max_input_length: Truncation limit passed as `max_length` when tokenizing
      the dialogues (default 1024).
    max_target_length: Truncation limit passed as `max_length` when tokenizing
      the summaries (default 128).

  Returns:
    A dict with "input_ids" and "attention_mask" taken from the tokenized
    dialogues, and "labels" taken from the "input_ids" of the tokenized
    summaries.
  """
  input_encodings = tokenizer(
      example_batch['dialogue'], max_length=max_input_length, truncation=True
  )

  # `text_target=` is the replacement for the deprecated
  # `tokenizer.as_target_tokenizer()` context manager: it tokenizes the texts
  # as targets without switching tokenizer state around the call.
  target_encodings = tokenizer(
      text_target=example_batch['summary'], max_length=max_target_length, truncation=True
  )

  return {
      "input_ids": input_encodings["input_ids"],
      "attention_mask": input_encodings["attention_mask"],
      "labels": target_encodings["input_ids"],
  }

In [None]:
# Run convert_examples_to_features over the dataset in batches (batched=True);
# the result is the tokenized dataset handed to the Trainer later on.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

In [None]:
# Display the "train" split of the mapped dataset to inspect the result of the mapping step.
dataset_samsum_pt["train"]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and the Pegasus model; it is passed to the
# Trainer as `data_collator` further down.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run.
trainer_args = TrainingArguments(
  output_dir="pegasus-samsum",
  num_train_epochs=1,
  warmup_steps=500,
  # Batch size 1 per device, combined with 16 accumulation steps below.
  per_device_train_batch_size=1,
  per_device_eval_batch_size=1,
  weight_decay=0.01,
  logging_steps=10,
  # Evaluate on a step schedule, every `eval_steps` steps.
  evaluation_strategy="steps",
  eval_steps=500,
  # Step counts are integers; the float literal 1e6 is replaced by the
  # equivalent int so the argument matches its documented type.
  save_steps=1_000_000,
  gradient_accumulation_steps=16,
)

In [None]:
# Wire the model, arguments, data and collator into a Trainer.
trainer = Trainer(
  model = model_pegasus,
  args = trainer_args,
  # Fit on the "train" split: training on "test" would leak the held-out data
  # into the model and make any later test-set score meaningless.
  train_dataset = dataset_samsum_pt["train"],
  eval_dataset = dataset_samsum_pt["validation"],
  data_collator = seq2seq_data_collator,
  tokenizer = tokenizer,
)

In [13]:
# Start the fine-tuning run with the Trainer configured above.
trainer.train()