# Fine-Tuning BART for Abstractive Summarization on Kaggle

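This notebook fine-tunes `facebook/bart-large-cnn` on a 25% subsample of the preprocessed (already tokenized) CNN/DailyMail training split and evaluates it with ROUGE on the validation split.
It is configured to run on Kaggle with a T4 GPU (16 GB VRAM).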

In [None]:
# 1. Install Missing Dependencies Only
# We avoid re-installing transformers/torch to prevent breaking Kaggle's pre-installed environment
!pip install -q datasets evaluate rouge_score accelerate

In [None]:
# 2. Imports and Setup
import inspect
import os
import shutil
from pathlib import Path

import evaluate
import nltk
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Download the NLTK sentence-tokenizer data needed to split summaries into sentences for ROUGE.
# Newer NLTK releases (3.9+) load the `punkt_tab` resource in `sent_tokenize`, while older
# releases load `punkt`. Fetching both keeps the end-of-epoch evaluation from failing with a
# LookupError on whichever NLTK version the Kaggle image ships.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

print("Libraries installed and imported.")

In [None]:
# 3. Configuration
# Identifier of the pretrained checkpoint handed to `from_pretrained`
MODEL_CHECKPOINT = "facebook/bart-large-cnn"

# Input: location of the processed dataset on Kaggle (user specified)
PROCESSED_DATA_PATH = Path("/kaggle/input/cnn-dailymail-processed/cnn_dailymail_processed")

# Outputs: intermediate trainer checkpoints and the final exported model
OUTPUT_DIR = Path("/kaggle/working/models/checkpoints")
FINAL_MODEL_DIR = Path("/kaggle/working/models/final_model_full")

# Echo the paths that matter so a wrong dataset mount is spotted before training starts
print(
    f"Data Path: {PROCESSED_DATA_PATH}",
    f"Model Output: {FINAL_MODEL_DIR}",
    sep="\n",
)

In [None]:
# 4. Define Metrics (ROUGE)
metric = evaluate.load("rouge")

def compute_metrics(eval_pred, tokenizer):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may itself be a tuple, in which case its first element is used.
        tokenizer: Tokenizer used to turn token ids back into text; its
            ``pad_token_id`` replaces every ``-100`` placeholder before decoding.

    Returns:
        dict: Each score returned by the ROUGE metric, scaled to 0-100 and rounded
        to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id. Labels use it as the ignore index, and depending on the
    # transformers version generated predictions can be padded with it as well, which makes
    # `batch_decode` raise OverflowError. Swap it for the pad token in both arrays.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

In [None]:
# 5. Training Function
def train(num_shards=4):
    """Fine-tune MODEL_CHECKPOINT on a subsample of the processed dataset and save it.

    Loads the dataset from PROCESSED_DATA_PATH, keeps shard 0 of ``num_shards`` shards of
    the "train" split, trains for one epoch with ROUGE evaluation on the "validation"
    split, and writes the model and tokenizer to FINAL_MODEL_DIR.

    Args:
        num_shards: Number of shards the training split is divided into. Only shard 0 is
            used, i.e. roughly ``1 / num_shards`` of the data. The default of 4 keeps 25%;
            pass 1 to train on the whole split.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
        FileNotFoundError: If PROCESSED_DATA_PATH does not exist.
    """
    if num_shards < 1:
        raise ValueError(f"num_shards must be >= 1, got {num_shards}")

    if not PROCESSED_DATA_PATH.exists():
        raise FileNotFoundError(f"Dataset not found at {PROCESSED_DATA_PATH}. Please check the input path.")

    print(f"Loading data from {PROCESSED_DATA_PATH}...")
    tokenized_datasets = load_from_disk(str(PROCESSED_DATA_PATH))

    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # --- CRITICAL OPTIMIZATION ---
    # Defaulting to 25% of data (num_shards=4) to ensure completion within Kaggle 12h limit
    # 25% of ~287k samples = ~71k samples (Approx 5-6 hours on T4)
    train_dataset = tokenized_datasets["train"]
    if num_shards > 1:
        print(f"Subsampling dataset to {100 / num_shards:.0f}% to fit in time limit...")
        train_dataset = train_dataset.shard(num_shards=num_shards, index=0)
    print(f"New Training Samples: {len(train_dataset)}")
    # -----------------------------

    # Mixed precision is only requested when a CUDA device is present; asking for fp16
    # without one makes the training arguments fail before training starts (e.g. when the
    # Kaggle accelerator was not switched on).
    use_fp16 = torch.cuda.is_available()
    if not use_fp16:
        print("WARNING: no CUDA device detected; fp16 disabled. Training on CPU will be very slow.")

    # TRAINING ARGUMENTS
    # Configured for T4 GPU (16GB VRAM)
    training_args = Seq2SeqTrainingArguments(
        output_dir=str(OUTPUT_DIR),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,  # Effective batch size = 32
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=1,  # Set to 1 safe run, increase if you have quota
        predict_with_generate=True,
        fp16=use_fp16,
        logging_steps=50,
        report_to="none",
    )

    # The notebook deliberately uses whatever transformers version Kaggle ships. Newer
    # releases take the tokenizer as `processing_class` and deprecate `tokenizer`, older
    # ones only know `tokenizer`, so pick the keyword the installed trainer accepts.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        # Bind the tokenizer so the trainer can call the metric with a single argument
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
        **{tokenizer_kwarg: tokenizer},
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving final model to {FINAL_MODEL_DIR}...")
    trainer.save_model(str(FINAL_MODEL_DIR))
    tokenizer.save_pretrained(str(FINAL_MODEL_DIR))
    print("Model saved successfully.")

# Run Training
# A notebook cell executes with __name__ == "__main__", so training starts when this cell
# is run; the guard only keeps it from starting if this code is imported as a module.
if __name__ == "__main__":
    train()

In [None]:
# 6. Zip Output for Download
# The archive is built from FINAL_MODEL_DIR (the single source of truth for the export
# location) with the standard library, so a missing model or a failed archive raises
# instead of being followed by a misleading "Done!" message.
if not FINAL_MODEL_DIR.is_dir():
    raise FileNotFoundError(
        f"No saved model found at {FINAL_MODEL_DIR}. Run the training cell before zipping."
    )

# root_dir/base_dir make the archive unpack to a single `final_model_full/` folder rather
# than reproducing the absolute kaggle/working/models/... directory tree inside the zip.
shutil.make_archive(
    base_name="/kaggle/working/final_model",
    format="zip",
    root_dir=str(FINAL_MODEL_DIR.parent),
    base_dir=FINAL_MODEL_DIR.name,
)
print("\nDone! You can now download 'final_model.zip' from the Output tab.")