In [None]:
!pip install datasets
!pip uninstall accelerate transformers[torch] -y
!pip install accelerate transformers[torch] -U
!pip install rouge_score py7zr
import pandas as pd



In [None]:
# load and prepare the cnn_dailymail dataset
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Features: {dataset['train'].column_names}")
print(f"{len(dataset['test'])}")
# Index the row first (split[i][column]) so only that single example is decoded.
# Indexing the column first (split[column][i]) reads the whole training column
# into memory just to look at one element.
first_train_example = dataset['train'][0]
print(f"{first_train_example['article'][:500]}")
print(f"Summary: {first_train_example['highlights']}")
sample_text = dataset['train'][1]['article'][:2000]


In [None]:
# defining baseline summary function
import nltk
from nltk.tokenize import sent_tokenize
# Older NLTK releases load the sentence tokenizer from the "punkt" resource,
# newer ones look for "punkt_tab" instead. Fetch both so sent_tokenize works
# whichever NLTK version the environment provides.
nltk.download("punkt")
nltk.download("punkt_tab")

def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Sentences are split with ``sent_tokenize``; when the text has fewer than
    three sentences, all of them are returned.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

# ROUGE metric object; it is passed to the baseline evaluation below.
from datasets import load_metric
rouge_metric = load_metric("rouge", trust_remote_code=True)

def evaluate_summaries_baseline(dataset, metric, column_text='article', column_summary='highlights'):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` must
            be iterable over the source texts and ``dataset[column_summary]``
            must hold the matching references.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the texts to summarise.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after the single batch is added.
    """
    predictions = list(map(three_sentence_summary, dataset[column_text]))
    metric.add_batch(predictions=predictions, references=dataset[column_summary])
    return metric.compute()

# Score the baseline on 1000 shuffled test examples; the fixed seed keeps the
# shuffle, and therefore the selected sample, the same on every run.
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))
score = evaluate_summaries_baseline(test_sampled, rouge_metric)
print(score)


In [None]:
# evaluating summaries using pegasus
from tqdm import tqdm
import torch
# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` of length ``batch_size``.

    The final slice is shorter when the length is not a multiple of
    ``batch_size``. Any sliceable sequence with a length works.
    """
    total = len(list_of_elements)
    starts = range(0, total, batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in starts)

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device,
                               column_text="article", column_summary="highlights"):
    """Generate summaries batch by batch and score them against the references.

    Args:
        dataset: Object indexable by column name, providing the source texts
            under ``column_text`` and the references under ``column_summary``.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``; assumed to be a
            ``torch.nn.Module`` (its ``training`` flag is read and restored).
        tokenizer: Tokenizer used both to encode the texts and to decode the
            generated ids.
        batch_size: Number of texts generated per forward pass.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the texts to summarise.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns for the batches that were scored.

    Raises:
        RuntimeError: Re-raised when generation fails for any reason other
            than running out of memory; out-of-memory batches are skipped.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    skipped_batches = 0

    # generate() does not change the module's mode, so a model that has just
    # been trained would still have dropout active. Switch to eval mode for the
    # duration of the evaluation and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
            # Pad only up to the longest text in the batch (still truncated at
            # 1024 tokens). Padded positions are masked out by attention_mask,
            # so this saves memory and compute without changing the inputs
            # the model actually attends to.
            inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="longest", return_tensors="pt")
            # Release cached GPU memory before generating each batch to lower
            # the chance of running out of memory on a standard GPU.
            torch.cuda.empty_cache()
            try:
                summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                           attention_mask=inputs["attention_mask"].to(device),
                                           length_penalty=0.8, num_beams=8, max_length=128)
            except RuntimeError as e:
                # Only out-of-memory failures are tolerated; anything else is a
                # real error and must not be hidden.
                if 'out of memory' not in str(e):
                    raise
                print("Out of memory error occurred. Skipping this batch.")
                skipped_batches += 1
                torch.cuda.empty_cache()
                continue
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
            # "<n>" is replaced with a space so it does not reach the metric as a literal token.
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        model.train(was_training)

    if skipped_batches:
        # Skipped batches are missing from the score, so make that visible.
        print(f"Skipped {skipped_batches} of {len(article_batches)} batches; the score excludes them.")
    score = metric.compute()
    return score

# Load the Pegasus checkpoint fine-tuned on CNN/DailyMail and move the model to the selected device.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# Use the same arguments as the first load_metric("rouge", ...) call in this
# notebook. Without trust_remote_code=True some datasets 2.x releases stop to
# ask for confirmation before running the metric script.
rouge_metric = load_metric("rouge", trust_remote_code=True)



In [None]:
# loading the samsum (conversations and their summaries dataset by Samsung)
dataset_samsum = load_dataset("samsum", trust_remote_code=True)
# Look at the first test example: the raw dialogue, then its reference summary.
samsum_test_example = dataset_samsum["test"][0]
print(samsum_test_example["dialogue"])
print(samsum_test_example["summary"])

# Summarise the same dialogue with the CNN/DailyMail checkpoint before any fine-tuning.
from transformers import pipeline
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(samsum_test_example["dialogue"])
print("Summary:")
# Turn each " .<n>" marker in the generated text into a full stop followed by a line break.
generated_summary = pipe_out[0]["summary_text"]
print(generated_summary.replace(" .<n>", ".\n"))


In [None]:
# visualizing the conversations and their summary token distribution
import matplotlib.pyplot as plt

# Token counts per training example, as produced by tokenizer.encode.
d_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"]["dialogue"]]
s_len = [len(tokenizer.encode(s)) for s in dataset_samsum["train"]["summary"]]

# Two side-by-side histograms sharing the y axis: dialogues on the left, summaries on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 3.5), sharey=True)
panels = zip(axes, (d_len, s_len), ("Dialogue Token Length", "Summary Token Length"))
for hist_ax, token_lengths, panel_title in panels:
    hist_ax.hist(token_lengths, bins=20, color="C0", edgecolor="C0")
    hist_ax.set_title(panel_title)
    hist_ax.set_xlabel("Length")
# The y axis is shared, so only the left panel carries the label.
axes[0].set_ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# converting the tokens into features
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogues and summaries into model inputs and labels.

    Args:
        example_batch: Mapping with a "dialogue" entry (texts to encode as
            inputs) and a "summary" entry (texts to encode as labels).

    Returns:
        Dict with "input_ids" and "attention_mask" from the dialogues
        (truncated to 1024 tokens) and "labels" holding the input ids of the
        summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024, truncation=True)
    # text_target= tokenizes the summaries as targets. It replaces the
    # deprecated tokenizer.as_target_tokenizer() context manager, which newer
    # transformers releases warn about.
    target_encodings = tokenizer(text_target=example_batch["summary"], max_length=128, truncation=True)
    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

# Run the feature conversion over the samsum data in batches (batched=True
# hands convert_examples_to_features a batch of examples per call).
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)
# Return only these three columns, as torch tensors, when the dataset is indexed.
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type="torch", columns=columns)

In [None]:
# defining the data collator which will automatically handle padding to ensure uniformity of the input and output
import dataclasses
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases name this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. The notebook installs transformers with -U,
# so look up which field the installed TrainingArguments declares instead of
# hard-coding one spelling.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

# defining the training arguments
training_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    eval_steps=500,
    # A step count is an integer; one million is far beyond the steps of this
    # run, so no intermediate checkpoint is written.
    save_steps=1_000_000,
    # With a per-device batch size of 1, gradients are accumulated over 16
    # steps before each optimizer update, giving an effective batch of 16 per device.
    gradient_accumulation_steps=16,
    **{_eval_strategy_key: 'steps'},
)


In [None]:
# training the model
# NOTE(review): newer transformers releases deprecate `tokenizer=` here in favour
# of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"]
)
trainer.train()
# Write the fine-tuned model (and the tokenizer given to the Trainer) into
# training_args.output_dir ('pegasus-samsum'). save_steps is set to one million,
# so training itself saves no checkpoint. Without this call the later
# pipeline("summarization", model="pegasus-samsum") has no model files to load.
trainer.save_model()

In [None]:
# evaluating the model
score = evaluate_summaries_pegasus(
    dataset_samsum["test"],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)

# Collect the `.mid.fmeasure` value of each ROUGE variant into a one-row table.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
rouge_table = pd.DataFrame(rouge_dict, index=["pegasus"])
print(rouge_table)

In [None]:
# testing the model on the samsum dataset
# Generation settings forwarded to the pipeline call.
gen_kwargs = dict(length_penalty=0.8, num_beams=8, max_length=128)
first_test_example = dataset_samsum["test"][0]
sample_text = first_test_example["dialogue"]
reference = first_test_example["summary"]
pipe = pipeline("summarization", model="pegasus-samsum") # this is the name of your trained model
print("Dialogue:", sample_text, sep="\n")
print("\nReference Summary:", reference, sep="\n")
print("\nModel Summary:")
model_summary = pipe(sample_text, **gen_kwargs)[0]["summary_text"]
print(model_summary)


In [None]:
# testing the model on custom dialogue
# Hand-written group chat between four speakers (Person A-D). The backslash
# right after the opening quotes drops the newline that would otherwise start
# the string.
my_message="""\

Person A: Hey everyone! How's it going?

Person B: Hi! I'm doing well, just finished a big project at work. What about you?

Person C: Hey! Congrats on finishing your project, B. I've been busy with some personal stuff, but it's all good now. How about you, A?

Person A: Thanks, C! I'm doing great. Just got back from a short trip to the mountains. It was so refreshing. How are you doing, D?

Person D: Hi all! I'm doing fine, just dealing with some house renovations. It's a bit chaotic, but I'm excited to see the final result.

Person B: That sounds exciting, D. What kind of renovations are you doing?

Person D: We're redoing the kitchen and adding a small patio in the backyard. It's a lot of work, but I think it'll be worth it.

Person C: That sounds amazing! I love spending time outdoors, so a patio sounds perfect. Maybe we can have a get-together there once it's done?

Person D: Absolutely! I'd love that. We could have a barbecue or something.

Person A: Count me in! Speaking of get-togethers, we haven't had one in a while. Maybe we should plan something soon.

Person B: Yes, we should. How about a game night? We could all bring our favorite games and snacks.

Person C: I love that idea! I'm always up for a good game night. Let's set a date.

Person D: How about next Saturday? Does that work for everyone?

Person A: Next Saturday works for me. What about you, B and C?

Person B: Works for me too!

Person C: Same here! Looking forward to it.

Person D: Great! It's a plan then. I'll make sure to have the patio ready by then.

Person A: Awesome! This is going to be so much fun. Can't wait to see everyone.

Person B: Me too! It's going to be a blast.

Person C: Definitely. See you all next Saturday!

Person D: See you then!
"""
# Summarise the chat with the `pipe` pipeline and `gen_kwargs` generation
# settings defined in the previous cell, and print the generated text.
print(pipe(my_message, **gen_kwargs)[0]["summary_text"])