![](https://i.postimg.cc/rm2vJ3FD/Screenshot-2025-06-25-212115.png)

# Youtube Video Link -->> https://youtu.be/Dous6pBrYbc

In [1]:
# Install required libraries

# !pip install -q datasets transformers accelerate transformers[sentencepiece] sacrebleu rouge_score py7zr


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.4/96.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.3/141.3 kB[0m [31m10.2 

# 📦 1. Import Libraries & Suppress Warnings

In [14]:
from datasets import load_dataset  # Load dataset
import torch  # PyTorch tensors & GPU
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # Automatically picks the right tokenizer & model
from transformers import DataCollatorForSeq2Seq  # Dynamic padding and batching
from transformers import TrainingArguments, Trainer  # Training setup & loop
from transformers import pipeline  # High-level API for easy inference
import warnings  # Handle warnings
warnings.filterwarnings("ignore")  # Suppress warnings

# 🤖 2. Load Model & Tokenizer

In [15]:

model_checkpoint = "t5-small"  # ✅ You can also use "google/flan-t5-base", "facebook/bart-base", etc.

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")

# ===== Popular Summarization Models Sorted by Parameters (Smallest → Largest) =====

# "t5-small" (60M params) (add "summarize:" before each dialogue) — ✅ fastest to train
# "google/flan-t5-small" (80M params) (add "summarize:" before each dialogue) — ⚡ fast, generalizes well
# "facebook/bart-base" (139M params) (no "summarize:" needed) — 📈 solid performance
# "sshleifer/distilbart-cnn-12-6" (139M params) (no "summarize:" needed) — 🔁 faster BART variant
# "t5-base" (220M params) (add "summarize:" before each dialogue) — 🧠 good quality, moderate speed
# "google/flan-t5-base" (250M params) (add "summarize:" before each dialogue) — 🚀 better than t5-base in low-data settings
# "facebook/bart-large" (406M params) (no "summarize:" needed) — 💎 strong quality, slower
# "google/pegasus-cnn_dailymail" (568M params) (no "summarize:" needed) — 🦾 very good for abstractive summarization
# "t5-large" (770M params) (add "summarize:" before each dialogue) — 🐢 slow, high-quality

# ===== Dataset Prep Summary =====
# T5 / FLAN-T5 → add "summarize: " before each dialogue
# BART / DistilBART / Pegasus → use raw dialogue (no prefix)
# All → tokenize dialogue (max_length=1024), tokenize summary (max_length=128)



# 📚 3. Load SAMSum Dataset


In [16]:
# Load SAMSum Dataset
dataset = load_dataset("knkarthick/samsum")

# ✂️ 4. Tokenize the Dataset

In [17]:
# Tokenize the Dataset
def tokenize_content(data):
    dialogues = data["dialogue"]
    summaries = data["summary"]

    inputs = ["summarize: " + d if d else "summarize: " for d in dialogues]
    targets = [s if s else "" for s in summaries]

    input_encoding = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": target_encoding["input_ids"],
    }

tokenized_dataset = dataset.map(tokenize_content, batched=True)


Map:   0%|          | 0/818 [00:00<?, ? examples/s]

# 🧱 5. Setup Data Collator

In [18]:
# Setup Data Collator
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ⚙️ 6. Define Training Arguments

In [19]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="t5-samsum-model",              # Where to save the model
    num_train_epochs=1,                        # Number of training passes over data
    per_device_train_batch_size=1,             # Samples per GPU during training
    per_device_eval_batch_size=1,              # Samples per GPU during evaluation
    warmup_steps=500,                          # Gradually increase LR for first 500 steps
    weight_decay=0.01,                         # Regularization to prevent overfitting
    logging_steps=10,                          # Log training metrics every 10 steps
    eval_steps=500,                            # Run evaluation every 500 steps
    save_steps=1e6,                            # Disable auto-saving during training
    gradient_accumulation_steps=16,            # Accumulate gradients for larger batch effect
    report_to="none"                           # Disable logging to external tools
)


# 🏋️ 7. Initialize Trainer

In [20]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)


# 🚀 8. Train the Model

In [21]:
# Train the Model
trainer.train()


Step,Training Loss
10,11.7585
20,11.838
30,11.8413
40,11.1904
50,10.4158
60,9.7484
70,9.274
80,8.2405
90,6.7008
100,5.6987


TrainOutput(global_step=920, training_loss=1.6631328080011452, metrics={'train_runtime': 1144.6114, 'train_samples_per_second': 12.871, 'train_steps_per_second': 0.804, 'total_flos': 3984462635335680.0, 'train_loss': 1.6631328080011452, 'epoch': 0.9991854466467553})

# 💾 9. Save Model & Tokenizer

In [22]:
#  Save Model & Tokenizer
model.save_pretrained("t5_samsum_finetuned_model")
tokenizer.save_pretrained("t5_samsum_tokenizer")


('t5_samsum_tokenizer/tokenizer_config.json',
 't5_samsum_tokenizer/special_tokens_map.json',
 't5_samsum_tokenizer/spiece.model',
 't5_samsum_tokenizer/added_tokens.json',
 't5_samsum_tokenizer/tokenizer.json')

# 🔁 10. Reload & Setup for Inference

In [23]:
#  Reload & Setup for Inference
tokenizer = AutoTokenizer.from_pretrained("t5_samsum_tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained("t5_samsum_finetuned_model").to("cuda")
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
# No manual tokenization, no manual model.generate() — it abstracts all that under the hood.

Device set to use cuda:0


# 🎭 11. Test Sample Dialogue (Luffy & Naruto)

In [24]:
# 🎭 11. Test Sample Dialogue (Luffy & Naruto)
sample_text = '''Luffy: Naruto! You won the ramen eating contest again?! That’s your fifth win this month!

Naruto: Believe it, Luffy! Ichiraku’s secret menu is my new training ground. Gotta keep up the chakra and the appetite!

Luffy: Haha! I like that! I trained by eating 20 meat-on-the-bone last night. Zoro thought I was insane.

Naruto: Bro, I’ve fought Akatsuki, and even I think that’s dangerous. What’s next? Competing with Goku?

Luffy: Maybe! But first I wanna become the Pirate King. Then I’ll eat ramen on the moon!

Naruto: You sure talk big, rubber boy. But I respect that. Becoming Hokage wasn’t easy either.

Luffy: We’re kinda the same, huh? Chasing dreams, fighting crazy villains, making loyal friends.

Naruto: True that. Though I don’t have a reindeer doctor or a skeleton with an afro.

Luffy: And I don’t have a giant fox inside me. We’re even!

Naruto: Hey, wanna team up for a mission? I heard there’s a lost treasure in the Hidden Mist village.

Luffy: Treasure?! I’m in! Let’s go find it, and maybe snack along the way.

Naruto: Deal. I’ll bring the kunai, you bring the appetite.

Luffy: This is gonna be epic! Let's GO!!!

Naruto: Dattebayo!!!'''


# 📄 12. Show the Summary Output

In [25]:
# 📄 12. Show the Summary Output
from IPython.display import Markdown, display
result = summarizer(sample_text, max_length=100, min_length=30, do_sample=False) ## do_sampilng = False means Use greedy decoding (no randomness); always returns same result
display(Markdown(f"**Summary:** {result[0]['summary_text']}"))
# result format -->> [{'summary_text': 'Here is the generated summary.'}]


**Summary:** Luffy won the ramen eating contest again this month. Luffy is training with 20 meat-on-the-bone. Naruto has fought Akatsuki, and he will compete with Goku.