In [None]:
!pip install transformers evaluate rouge-score




In [None]:
import kagglehub

# Download the latest version of the CNN/DailyMail summarization dataset and
# remember where its files live; later cells build file paths from `path`.
dataset_handle = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'newspaper-text-summarization-cnn-dailymail' dataset.
Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


In [None]:
import os

# Show which CSV splits ship inside the dataset's cnn_dailymail folder.
os.listdir(os.path.join(path, 'cnn_dailymail'))

['validation.csv', 'train.csv', 'test.csv']

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the three dataset splits from the downloaded cnn_dailymail folder.
train, test, val = (
    pd.read_csv(f"{path}/cnn_dailymail/{split_name}.csv")
    for split_name in ("train", "test", "validation")
)

In [None]:
# (rows, columns) of the train, test and validation frames, in that order.
tuple(frame.shape for frame in (train, test, val))

((287113, 3), (11490, 3), (13368, 3))

In [None]:
# Peek at the first training row to see the available columns.
train.iloc[:1]

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."


In [None]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import evaluate




In [None]:
# Keep only the columns needed for summarization: the source article and its
# reference summary ("highlights").
test_df = test.loc[:, ['article', 'highlights']]


In [None]:
# Pretrained DistilBART checkpoint used for summarization in this notebook.
model_name = "sshleifer/distilbart-cnn-12-6"

tokenizer = BartTokenizer.from_pretrained(model_name)

# Load the weights and place the model on the CPU.
model = BartForConditionalGeneration.from_pretrained(model_name).to("cpu")

# Switch to evaluation mode for inference.
model.eval()


In [None]:
def generate_summary(text, max_input_len=512, max_summary_len=128):
    """Summarize one article with the notebook's BART model using beam search.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Article text to summarize.
        max_input_len: Maximum number of input tokens; longer inputs are truncated.
        max_summary_len: Maximum length, in tokens, of the generated summary.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # A single sequence needs no padding. Padding to max_input_len would only
    # make the model process masked pad tokens for every short article.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_input_len,
        return_tensors="pt",
    )

    # Put the inputs on whatever device the model lives on, so the function
    # keeps working if the model is moved off the CPU.
    device = model.device
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_summary_len,
            num_beams=4,
            early_stopping=True,
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Summarize the first test article and show it next to its reference summary.
first_example = test_df.iloc[0]
sample_article = first_example['article']
reference_summary = first_example['highlights']

generated_summary = generate_summary(sample_article)

print("Generated Summary:\n", generated_summary)
print("\nReference Summary:\n", reference_summary)


Generated Summary:
  U.S consumer advisory group set up by the Department of Transportation said that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches .

Reference Summary:
 Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .


In [None]:
# ROUGE evaluation of generated summaries against the reference highlights.

rouge = evaluate.load("rouge")

# Only the first 20 test rows are scored: a small batch keeps CPU runtime short.
eval_rows = [test_df.iloc[row_idx] for row_idx in range(20)]

predictions = [generate_summary(row['article']) for row in eval_rows]
references = [row['highlights'] for row in eval_rows]

scores = rouge.compute(predictions=predictions, references=references)
print(scores)


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.44145738324481265), 'rouge2': np.float64(0.21925908709269631), 'rougeL': np.float64(0.34004047539344784), 'rougeLsum': np.float64(0.38686968727016036)}


In [None]:
'''
You are not using a different model.
You are using the same pretrained model, but through two different APIs:

Method	What it is
pipeline("summarization")	High-level abstraction (easy, less control)
AutoTokenizer + AutoModelForSeq2SeqLM	Low-level API (full control)


Pipeline way (High-level, simple)
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6"
)

What happens internally?

The pipeline automatically:
Downloads the tokenizer
Downloads the model
Tokenizes input text

Calls model.generate()

Decodes output tokens to text

You don't see these steps, but they happen.
Use when:
You want quick results
You don't need fine control
You are doing inference only


Tokenizer + Model way (Low-level, explicit)
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

What YOU control here:

✔ Tokenization
✔ Padding & truncation
✔ Device (CPU/GPU)
✔ Decoding strategy
✔ Batch processing
✔ Training / fine-tuning

You explicitly call:

model.generate(...)


######

3. Why the code looks different
Pipeline:
summarizer(text)

Manual:
inputs = tokenizer(text, return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"])
summary = tokenizer.decode(summary_ids[0])


📌 Same model weights
📌 Same architecture
📌 Different level of abstraction



Why AutoModelForSeq2SeqLM?
Because BART is:

Encoder–Decoder

Sequence-to-Sequence

So Hugging Face maps it to:

AutoModelForSeq2SeqLM


If it were:

BERT → AutoModel

GPT → AutoModelForCausalLM

T5 → AutoModelForSeq2SeqLM

📌 Auto* classes automatically load the correct architecture.

“The pipeline API is a high-level wrapper that internally loads the tokenizer
and model, while using AutoTokenizer and AutoModel gives explicit control over
tokenization and generation, although both use the same pretrained model.”
'''