In [1]:
!nvidia-smi

Sat Jun 21 14:24:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.07                 Driver Version: 566.07         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   55C    P8              3W /   80W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install -U transformers[sentencepiece] datasets fsspec evaluate huggingface_hub sacrebleu rouge_score py7zr -q

In [3]:
#Just to disable the weights and biases
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
"""
!pip install --upgrade accelerate
!pip uninstall -v transformers accelerate
!pip install transformers accelerate
"""

'\n!pip install --upgrade accelerate\n!pip uninstall -v transformers accelerate\n!pip install transformers accelerate\n'

In [5]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/pegasus-cnn_dailymail"

# Loading Tokeinzer and Model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Load Dataset [Samsum conversations dataset]
dataset = load_dataset("knkarthick/samsum")

# Remove Empty conversations
def is_valid_example(example):
    return isinstance(example["dialogue"], str) and isinstance(example["summary"], str)

valid_dataset = dataset.filter(is_valid_example)


# Tokenize
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'],max_length=1024,truncation=True)

    target_encodings = tokenizer(example_batch['summary'],max_length=128,truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings["input_ids"]
    }

tokenized_dataset = valid_dataset.map(convert_examples_to_features, batched=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [6]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})

In [7]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [8]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir = 'pegasus-samsum', num_train_epochs =1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    eval_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
trainer = Trainer(model=model, args=trainer_args, tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=tokenized_dataset["test"],
                  eval_dataset = tokenized_dataset["validation"])

  trainer = Trainer(model=model, args=trainer_args, tokenizer=tokenizer,


In [10]:
trainer.train()



Step,Training Loss,Validation Loss




TrainOutput(global_step=52, training_loss=3.0221930192067075, metrics={'train_runtime': 1918.3008, 'train_samples_per_second': 0.427, 'train_steps_per_second': 0.027, 'total_flos': 314017624350720.0, 'train_loss': 3.0221930192067075, 'epoch': 1.0})

In [11]:
# Evaluate

def generate_batch_sized_chunks(list_of_elements, batch_size):
  """split the dataset into smaller batches that we can process simultaneously
  Yield successive batch-sized chunks from the list_of_elements."""

  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                                batch_size=16, device=device,
                                column_text="article",
                                column_summary="highlights"):
  article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
  target_batches =  list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

  for article_batch, target_batch in tqdm(
      zip(article_batches, target_batches), total=len(article_batches)):

      inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                         padding = "max_length", return_tensors="pt")

      summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                 attention_mask=inputs["attention_mask"].to(device),
                                 length_penalty=0.8, num_beams=8, max_length=128)

      '''parameter for length penalty ensures that the model does not generate sequences'''

      # Finally, we decode the generated texts, replace the token and add the decoded texts with the references to the metric.
      decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True)
          for s in summaries]

      decoded_summaries = [d.replace(""," ") for d in decoded_summaries]

      metric.add_batch(predictions=decoded_summaries, references=target_batch)

      #Finally compute and return the ROGUE scores
      score = metric.compute()
  return score

In [13]:
rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_metric = evaluate.load("rouge")

In [16]:
# Evaluate ROUGE scores on a subset of test set
score = calculate_metric_on_test_ds(
    tokenized_dataset['test'][:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary"
)

# Extract F1 (mid.fmeasure) scores into a dictionary
rouge_dict = {rn: score[rn] for rn in rouge_names}

# Display in DataFrame
pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 5/5 [04:45<00:00, 57.19s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.035278,0.0,0.035278,0.035278


In [17]:
## Save Model
model.save_pretrained("pegasus-samsum-model")

##save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\spiece.model',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [19]:
# Load
tokenizer = AutoTokenizer.from_pretrained("D:/GenAI/Proj-1_Text_Summarizer/tokenizer")

In [20]:
# Prediction
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
sample_text = tokenized_dataset['test'][0]['dialogue']
reference = tokenized_dataset['test'][0]['summary']

pipe = pipeline("summarization", model='pegasus-samsum-model', tokenizer=tokenizer)

##
print("Dialogue")
print(sample_text)

print("\nReference Summary")
print(reference)

print("\nModel Summary")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])



Device set to use cpu


Dialogue
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary


Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him .
