In [None]:
#!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

**The purpose of accelerate in transformers is:**

* **Efficient Model Training & Inference**

  Helps run models on multiple GPUs, TPUs, or CPUs efficiently.
  Optimizes memory usage for large models.

* **Seamless Device Management**

  Automatically moves models and data to the best available hardware (CPU/GPU/TPU).

* **Simplified Multi-GPU & Distributed Training**

  Makes it easier to train large models across multiple devices without complex code changes.

In [None]:
# Optional environment commands, kept commented out; uncomment only when needed.
# !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Fetch NLTK's "punkt" package into the local nltk_data directory
# (reported as already up-to-date when it is present).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Basic Functionality of the Hugging Face model

In [None]:
from transformers import AutoTokenizer,PegasusForConditionalGeneration

# Load the XSum Pegasus checkpoint together with its matching tokenizer.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

ARTICLE_TO_SUMMARIZE = ("PG&E stated it scheduled the blackouts in response to forecasts for high winds "
          "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
          "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.")

# `max_length` only takes effect as a cap when truncation is requested; passing
# `truncation=True` makes that explicit and silences the tokenizer warning.
inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="pt")

# Generate the summary token ids and decode the single result back to text.
summary_ids = model.generate(inputs["input_ids"])
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


"California's largest electricity provider has turned off power to hundreds of thousands of customers."

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

device

'cuda'

# Fine Tuning

In [None]:
# Checkpoint to fine-tune. NOTE: `model` holds the checkpoint *name* (a string)
# from here on; the loaded network lives in `model_pegasus`.
model = "google/pegasus-cnn_dailymail"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model)

# Load the seq2seq weights, then place them on the selected device.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model)
model_pegasus = model_pegasus.to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#downloading and unzipping the data
!wget https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
!unzip summarizer-data.zip

--2025-02-27 12:28:04--  https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/entbappy/Branching-tutorial/master/summarizer-data.zip [following]
--2025-02-27 12:28:04--  https://raw.githubusercontent.com/entbappy/Branching-tutorial/master/summarizer-data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘summarizer-data.zip.1’


2025-02-27 12:28:05 (305 MB/s) - ‘summarizer-data.zip.1’ saved [7903594/7903594]

Archive:  summarizer-data.zip
replace samsum-test.csv? [y]es, [n]o

In [None]:
# Read the fine-tuning dataset from the local `samsum_dataset` directory
# and show its summary as the cell output.
dataset_samsum = load_from_disk(dataset_path="samsum_dataset")
dataset_samsum

In [None]:
# Quick look at the data: number of rows per split, the column names, and
# one dialogue/summary pair (index 1) from the test split.
split_lengths = [len(split_ds) for split_ds in dataset_samsum.values()]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

print("\nDialogue: ", dataset_samsum["test"][1]["dialogue"], sep="\n")
print("\nSummary: ", dataset_samsum["test"][1]["summary"], sep="\n")

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into model inputs and labels.

    Uses the module-level ``tokenizer``.

    Args:
        example_batch: mapping with a ``'dialogue'`` entry (source texts) and a
            ``'summary'`` entry (target texts).

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        tokenized dialogues (truncated to 1024 tokens) and ``'labels'`` taken
        from the tokenized summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize labels; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

In [None]:
# Tokenize every split; `batched=True` hands the converter whole batches of rows.
dataset_samsum_pt = dataset_samsum.map(
    function=convert_examples_to_features,
    batched=True,
)

In [None]:
# Display the tokenized training split to check the columns added by the mapping step.
dataset_samsum_pt["train"]

In [None]:
#Training
from transformers import DataCollatorForSeq2Seq

# Collator that assembles seq2seq training batches for the Trainer.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pegasus,
)

In [None]:
from transformers import TrainingArguments, Trainer

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
trainer_args = TrainingArguments(
    # --- where results go / how the run is labelled ---
    # Directory to save model checkpoints
    output_dir='pegasus-samsum',
    # Name attached to this run
    run_name='pegasus-samsum-run1',

    # --- optimisation schedule ---
    # Number of times to iterate over the entire dataset
    num_train_epochs=10,
    # Steps for learning rate warmup before training starts
    warmup_steps=500,
    # L2 regularization to prevent overfitting
    weight_decay=0.01,

    # --- batching ---
    # Batch size during training (per device/GPU)
    per_device_train_batch_size=1,
    # Batch size during evaluation (per device/GPU)
    per_device_eval_batch_size=1,
    # Accumulate gradients over 16 steps before updating weights
    gradient_accumulation_steps=16,

    # --- logging, evaluation and checkpoint cadence ---
    # Log metrics every 10 steps
    logging_steps=10,
    # Evaluation occurs at specific steps (not after each epoch)
    evaluation_strategy='steps',
    # Run evaluation every 500 steps
    eval_steps=500,
    # Save model checkpoint after 1 million steps (effectively never during training)
    save_steps=1e6,
)


In [None]:
# Fine-tune on the *train* split and monitor on the validation split.
# The test split must stay out of training: the ROUGE evaluation later in this
# notebook scores on test examples, so training on them would leak the answers
# and inflate the reported scores.
# For a quick smoke run, subsample the train split instead, e.g.
# `dataset_samsum_pt['train'].select(range(1000))`.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt['train'],
    eval_dataset=dataset_samsum_pt['validation'],
)


In [None]:
# Start fine-tuning with the settings in `trainer_args`; this is the long-running cell.
trainer.train()

In [None]:
from codecs import decode
# evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut ``list_of_elements`` into consecutive slices of ``batch_size``.

    Every yielded slice holds ``batch_size`` items except possibly the last
    one, which holds whatever remains. Nothing is yielded for an empty input.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in chunk_starts
    )

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: mapping-like object indexable by column name; ``dataset[column_text]``
            and ``dataset[column_summary]`` must be sliceable sequences of strings.
        metric: object exposing ``add_batch(predictions=..., references=...)`` and
            ``compute()``.
        model: model exposing ``generate``.
        tokenizer: tokenizer used both to encode the source texts and to decode
            the generated ids.
        batch_size: number of examples tokenized and generated per step.
        device: device the input tensors are moved to before generation.
        column_text: name of the column holding the source texts.
        column_summary: name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # A model that was just trained may still be in training mode, which keeps
    # dropout active and makes the generated summaries noisy. Switch to eval
    # mode for scoring and put the model back afterwards so the caller's state
    # is left as it was found.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # Beam search with 8 beams; summaries are capped at 128 tokens.
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)

            # Decode the whole batch of generated ids back to text in one call.
            decoded_summaries = tokenizer.batch_decode(
                summaries, skip_special_tokens=True,
                clean_up_tokenization_spaces=True)

            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        if was_training:
            model.train()

    score = metric.compute()
    return score



In [None]:
!pip install evaluate

In [None]:
import evaluate

# ROUGE scorer loaded through `evaluate`, plus the score keys reported at the end.
rouge_metric = evaluate.load("rouge")
rouge_names = [f"rouge{suffix}" for suffix in ("1", "2", "L", "Lsum")]

In [None]:
# Score the fine-tuned model on the first 10 test dialogues, two per batch.
score = calculate_metric_on_test_ds(
    dataset_samsum["test"][:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)

In [None]:
import pandas as pd

# Use the values in `score` directly, without accessing `fmeasure` or `mid`.
rouge_dict = dict((name, score[name]) for name in rouge_names)

# One-row table (indexed by model name) for easy visualization.
pd.DataFrame(rouge_dict, index=["pegasus"])