# BART-Base Fine-tuning for News Summarization (Kaggle GPU)

This notebook fine-tunes the BART-Base model for news summarization and is designed to run on Kaggle with GPU acceleration.

In [None]:
# Install required packages
!pip install -q torch transformers datasets rouge-score bert-score numpy tqdm accelerate sentencepiece

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import Dataset
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import json
from tqdm import tqdm
import os

# Check GPU availability
# Prints the installed PyTorch version and whether CUDA is usable. When it is,
# the name and total memory of device index 0 are printed as well; any further
# GPUs are not reported.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9, so the figure is in decimal gigabytes.
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## Dataset Class

The dataset class below tokenizes each article/summary pair for BART, padding or truncating both to fixed lengths.

### Dataset Class Definition

In [None]:
class NewsSummarizationDataset(Dataset):
    """Map-style dataset that tokenizes (article, summary) pairs for seq2seq training.

    Items are tokenized lazily in ``__getitem__``: the article becomes the
    encoder input and the summary becomes the training labels. Both are padded
    or truncated to a fixed length, so items can be stacked into a batch as-is.

    Args:
        texts: Indexable collection of source articles.
        summaries: Indexable collection of reference summaries, aligned with
            ``texts`` by position.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_input_length: Fixed token length of the encoder input.
        max_target_length: Fixed token length of the labels.
    """

    # Label value that marks positions to be excluded from the training loss.
    IGNORE_INDEX = -100

    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_target_length=128):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            Dict with 1-D tensors ``input_ids`` and ``attention_mask`` (length
            ``max_input_length``) and ``labels`` (length ``max_target_length``).
            Padding positions in ``labels`` are set to ``IGNORE_INDEX``.
        """
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])

        # BART needs no task prompt, so the raw article is the encoder input.
        inputs = self.tokenizer(
            text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize targets
        targets = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension; a bare squeeze()
        # would also collapse the sequence dimension if its length were 1.
        target_padding = targets['attention_mask'].squeeze(0) == 0
        # Padding must not contribute to the loss, otherwise the model is
        # trained to emit pad tokens. masked_fill returns a new tensor, so the
        # tokenizer output itself is left untouched.
        labels = targets['input_ids'].squeeze(0).masked_fill(target_padding, self.IGNORE_INDEX)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels
        }


In [None]:
## Load Dataset

In [None]:
def load_cnn_dailymail(split='test', num_samples=None):
    """Load article/summary pairs from the CNN/DailyMail (3.0.0) dataset.

    Args:
        split: Split name forwarded to ``load_dataset``.
        num_samples: Maximum number of leading examples to keep. ``None``
            keeps the whole split; ``0`` yields empty lists.

    Returns:
        Tuple ``(texts, summaries)`` of equal-length lists read from the
        ``article`` and ``highlights`` columns.
    """
    print(f"Loading CNN/DailyMail {split} dataset...")
    dataset = load_dataset("cnn_dailymail", "3.0.0", split=split)
    # Compare against None explicitly: a truthiness test would treat
    # num_samples=0 as "no limit" and silently load the entire split.
    if num_samples is not None:
        dataset = dataset.select(range(min(num_samples, len(dataset))))
    # Read the two needed columns directly instead of iterating row by row,
    # which would build a dict of every field for every example.
    texts = list(dataset['article'])
    summaries = list(dataset['highlights'])
    print(f"Loaded {len(texts)} samples")
    return texts, summaries

# Per-split sample budgets; tune these to the available GPU memory.
# A 16GB Kaggle GPU can handle more samples than configured here.
TRAIN_SAMPLES = 5000
VAL_SAMPLES = 500
TEST_SAMPLES = 200

print("Loading datasets...")
train_texts, train_summaries = load_cnn_dailymail(split='train', num_samples=TRAIN_SAMPLES)
val_texts, val_summaries = load_cnn_dailymail(split='validation', num_samples=VAL_SAMPLES)
test_texts, test_summaries = load_cnn_dailymail(split='test', num_samples=TEST_SAMPLES)

# Summarize how many examples each split ended up with.
print(f"\nDataset sizes:")
print(f"  Training: {len(train_texts)}")
print(f"  Validation: {len(val_texts)}")
print(f"  Test: {len(test_texts)}")


## Initialize Model


In [None]:
# Choose the pretrained checkpoint and the device everything will run on.
model_name = "facebook/bart-base"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Loading {model_name}...")
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
print(f"Model loaded on {device}")

# Report model size: all parameters versus those that require gradients.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


## Prepare Datasets


In [None]:
# Wrap the raw training and validation text pairs in tokenizing datasets.
print("Creating datasets...")
train_dataset = NewsSummarizationDataset(
    texts=train_texts,
    summaries=train_summaries,
    tokenizer=tokenizer,
)
val_dataset = NewsSummarizationDataset(
    texts=val_texts,
    summaries=val_summaries,
    tokenizer=tokenizer,
)
print("Datasets created!")


## Fine-tuning Configuration


In [None]:
# Directory that receives checkpoints and logs.
output_dir = "./bart_base_finetuned"

# Hyperparameters gathered in one mapping, then expanded into TrainingArguments.
training_config = dict(
    output_dir=output_dir,
    num_train_epochs=3,
    # Adjust the batch sizes based on GPU memory
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{output_dir}/logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    prediction_loss_only=True,
    # Use mixed precision if a GPU is available
    fp16=torch.cuda.is_available(),
    # Disable wandb/tensorboard reporting for Kaggle
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# Echo the key settings so they are visible in the cell output.
print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Mixed precision (FP16): {training_args.fp16}")
print(f"  Output directory: {output_dir}")


## Start Fine-tuning


In [None]:
# Assemble the Trainer from the model, its arguments and the two datasets.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
}
trainer = Trainer(**trainer_kwargs)

# Announce the run before the long training call starts.
for banner_line in (
    "Starting fine-tuning...",
    "This may take several hours depending on dataset size and GPU.",
    "-" * 80,
):
    print(banner_line)

# Run the fine-tuning loop.
trainer.train()

print("\nFine-tuning completed!")


## Save Model


In [None]:
def _save_model_and_tokenizer(target_dir):
    """Write the trainer's model and the tokenizer files into ``target_dir``."""
    trainer.save_model(target_dir)
    tokenizer.save_pretrained(target_dir)


# Persist the fine-tuned model next to its tokenizer.
print("Saving model...")
_save_model_and_tokenizer(output_dir)
print(f"Model saved to {output_dir}")

# When running on Kaggle, also write a copy to the Kaggle output area for download.
kaggle_root = '/kaggle/working'
if os.path.exists(kaggle_root):
    kaggle_output_dir = "/kaggle/working/bart_base_finetuned"
    _save_model_and_tokenizer(kaggle_output_dir)
    print(f"Model also saved to {kaggle_output_dir} (downloadable from Kaggle)")


## Evaluation on Test Set


In [None]:
# Load the weights saved in output_dir into a fresh model object, then move it
# to the selected device.
print("Loading best checkpoint...")
model = BartForConditionalGeneration.from_pretrained(output_dir)
model = model.to(device)
print("Best model loaded!")


In [None]:
# Generate summaries for test set
def generate_summary(text, max_length=128, min_length=30, max_input_length=512):
    """Generate a summary for a single text using beam search.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Article to summarize.
        max_length: Upper bound on the generated sequence length, in tokens.
        min_length: Lower bound on the generated sequence length, in tokens.
        max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # BART doesn't need prompts, use text directly.
    # No padding is requested: with a single sequence there is nothing to
    # align it with, and padding to the maximum length would only make the
    # encoder process pad tokens that the attention mask hides anyway.
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    # outputs[0] is the single returned sequence for the single input.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

print("Generating summaries for test set...")
# One generate_summary call per test article, with a tqdm progress bar.
generated_summaries = [generate_summary(article) for article in tqdm(test_texts)]

print(f"Generated {len(generated_summaries)} summaries")


## Calculate Metrics


In [None]:
# Collect per-example ROUGE F-measures; they are averaged further down.
print("Calculating ROUGE scores...")
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

for gen_sum, ref_sum in zip(generated_summaries, test_summaries):
    # The reference summary is passed first, the generated one second.
    scores = scorer.score(ref_sum, gen_sum)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

print("Calculating BERTScore...")
P, R, F1 = bert_score(generated_summaries, test_summaries, lang='en', verbose=True)

# Mean F1 per ROUGE variant first, then the BERTScore means, so the key order
# of the printed JSON is rouge1, rouge2, rougeL, bertscore.
results = {
    rouge_type: {'f1': float(np.mean(rouge_scores[rouge_type]))}
    for rouge_type in rouge_types
}
results['bertscore'] = {
    'precision': float(P.mean().item()),
    'recall': float(R.mean().item()),
    'f1': float(F1.mean().item())
}

print("\n=== FINE-TUNED MODEL RESULTS ===")
print(json.dumps(results, indent=2))


## Save Results


In [None]:
# Bundle the metrics with the dataset sizes they were computed on.
results_summary = {
    "model": "BART-Base (Fine-tuned)",
    "training_samples": len(train_texts),
    "validation_samples": len(val_texts),
    "test_samples": len(test_texts),
    "metrics": results
}

# Serialize once, then write the same JSON text to every destination.
results_json = json.dumps(results_summary, indent=2)

with open('bart_base_finetuned_results.json', 'w') as f:
    f.write(results_json)

# Write a second copy to the Kaggle output directory if it is available.
if os.path.exists('/kaggle/working'):
    with open('/kaggle/working/bart_base_finetuned_results.json', 'w') as f:
        f.write(results_json)
    print("Results saved to /kaggle/working/bart_base_finetuned_results.json")

print("\nResults saved!")


## Example Summaries


In [None]:
# Show up to the first five test articles alongside their reference and
# generated summaries.
print("\n=== EXAMPLE SUMMARIES ===")
for i, _ in enumerate(test_texts[:5]):
    print(f"\n--- Example {i+1} ---")
    # Only the first 300 characters of the article are shown.
    print(f"\nOriginal Article (first 300 chars):\n{test_texts[i][:300]}...")
    print(f"\nReference Summary:\n{test_summaries[i]}")
    print(f"\nGenerated Summary:\n{generated_summaries[i]}")
    print("-" * 80)
