In [None]:
import evaluate
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import torch
from datasets import load_dataset, load_from_disk, load_metric
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer

# Download NLTK's "punkt" sentence-tokenizer data.
# NOTE(review): presumably required by `sent_tokenize` (imported above); that
# function is not called in the cells visible here — confirm this download is
# still needed.
nltk.download("punkt")


In [None]:
# Pegasus model for text summarization

In [None]:
# Set device for computation (GPU preferred)
device = "cuda" if torch.cuda.is_available() else "cpu"
# A bare `device` expression is only displayed when it is the last line of a
# cell, so report the selection explicitly instead.
print(f"Using device: {device}")

# Load PEGASUS model and tokenizer from the same checkpoint, then move the
# model weights onto the selected device.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


In [None]:
# Read the dataset that was previously saved under this local path
samsum_path = 'samsum_dataset'
dataset_samsum = load_from_disk(samsum_path)

# Leave the loaded object as the cell's last expression so it is displayed
dataset_samsum

In [None]:
# Preprocessing: tokenize dialogues and summaries

In [None]:
def preprocess_batch(batch, max_input_length=1024, max_target_length=128):
    """
    Tokenizes a batch of dialogue–summary pairs for PEGASUS.
    Converts raw text into input IDs and labels that the model can process.

    Args:
        batch: Mapping with a 'dialogue' and a 'summary' entry, each a list of
            strings (the form `Dataset.map(..., batched=True)` passes in).
        max_input_length: Length every dialogue is truncated/padded to.
        max_target_length: Length every summary is truncated/padded to.

    Returns:
        dict with 'input_ids' and 'attention_mask' for the dialogues and
        'labels' for the summaries. Padding positions in 'labels' are set to
        -100 so that a training loss using the default ignore index skips
        them instead of learning to predict pad tokens.
    """
    # Tokenize dialogues (model inputs)
    input_encodings = tokenizer(
        batch['dialogue'],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize summaries (model targets). `text_target` replaces the
    # deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=batch['summary'],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Mask out padding in the labels; the inputs keep their pad ids because
    # the attention mask already marks them.
    pad_token_id = tokenizer.pad_token_id
    labels = [
        [(token_id if token_id != pad_token_id else -100) for token_id in label_ids]
        for label_ids in target_encodings['input_ids']
    ]

    # Return tokenized input and labels
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels
    }


In [None]:
# Tokenize data: apply preprocessing to all splits, one batch at a time
tokenized_dataset = dataset_samsum.map(
    preprocess_batch,
    batched=True
)

# Check a sample to confirm encoding. The encoded fields are padded to a fixed
# length, so show each field's length and first few ids rather than dumping
# thousands of mostly-padding integers into the cell output.
sample_encoding = tokenized_dataset["train"][0]
{
    field: {"length": len(sample_encoding[field]), "head": sample_encoding[field][:10]}
    for field in ("input_ids", "attention_mask", "labels")
}


In [None]:
# Inference with the pretrained PEGASUS checkpoint (no training is performed in this section)

In [None]:
# Initialize summarization pipeline with our PEGASUS model.
# (`pipeline` is already imported in the notebook's first cell.)
summarizer = pipeline(
    "summarization",
    model=model_pegasus,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test summarization on one dialogue
sample_text = dataset_samsum["test"][1]["dialogue"]
print("Original Dialogue:\n", sample_text)
# truncation=True clips over-long dialogues to the tokenizer's maximum length,
# matching the truncation applied in preprocessing, instead of feeding the
# model more tokens than it accepts.
summary = summarizer(
    sample_text,
    min_length=30,
    max_length=100,
    truncation=True
)[0]['summary_text']
print("\nGenerated Summary:\n", summary)


In [None]:
# evaluation

In [None]:
# Load ROUGE metric
rouge = evaluate.load("rouge")

# Generation settings used for every sample. truncation=True clips over-long
# dialogues to the tokenizer's maximum length instead of passing the model
# more tokens than it accepts.
generation_kwargs = {"min_length": 30, "max_length": 100, "truncation": True}

# Compute ROUGE scores on a few samples
for idx in range(3):
    # Fetch the test row once and read both fields from it
    example = dataset_samsum["test"][idx]
    dialogue = example["dialogue"]
    reference = example["summary"]
    prediction = summarizer(dialogue, **generation_kwargs)[0]["summary_text"]

    print(f"\nDialogue {idx+1}:")
    print("Generated Summary:", prediction)
    print("Reference Summary:", reference)

    # Score this single prediction against its single reference
    scores = rouge.compute(predictions=[prediction], references=[reference])
    print("ROUGE Scores:", scores)
