In [None]:
pip install torch transformers datasets rouge-score nltk tqdm pandas numpy sentencepiece protobuf accelerate evaluate

## Verify installation

In [None]:
import torch
import transformers
import datasets
from rouge_score import rouge_scorer
import nltk

# Report the installed version of each core library.
for _label, _module in (
    ("PyTorch", torch),
    ("Transformers", transformers),
    ("Datasets", datasets),
):
    print(f"{_label} version: {_module.__version__}")

# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## View dataset

In [None]:
import pandas as pd
import numpy as np

# Read the three CNN/DailyMail CSV splits from the Kaggle input directory.
print("=" * 80, "LOADING CNN/DAILYMAIL DATASET", "=" * 80, sep="\n")

train_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
validation_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv')
test_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv')

# Per-split row counts followed by the grand total.
print("\n" + "=" * 80, "DATASET OVERVIEW", "=" * 80, sep="\n")
split_sizes = {
    "Train": len(train_df),
    "Validation": len(validation_df),
    "Test": len(test_df),
}
for split_name, n_rows in split_sizes.items():
    print(f"{split_name} set size: {n_rows:,} samples")
print(f"Total samples: {sum(split_sizes.values()):,}")

# Column names and dtypes of the training split.
print("\n" + "=" * 80)
print("DATASET COLUMNS")
print("=" * 80)
print(f"Column names: {list(train_df.columns)}")
print(f"Column types:\n{train_df.dtypes}")

print("\n" + "=" * 80)
print("BASIC STATISTICS")
print("=" * 80)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()

# Null counts per column for every split.
print("\n" + "=" * 80)
print("MISSING VALUES CHECK")
print("=" * 80)
print(f"Train set missing values:\n{train_df.isnull().sum()}")
print(f"\nValidation set missing values:\n{validation_df.isnull().sum()}")
print(f"\nTest set missing values:\n{test_df.isnull().sum()}")

print("\n" + "=" * 80)
print("FIRST 5 ENTRIES FROM TRAINING SET")
print("=" * 80)
print(train_df.head())

print("\n" + "=" * 80, "DETAILED VIEW OF FIRST 5 ENTRIES", "=" * 80, sep="\n")

sample_rule = '─' * 80

# Walk the first (up to) five training rows and show every column,
# clipping any value longer than 500 characters for readability.
for idx, (_, row) in enumerate(train_df.head(5).iterrows()):
    print("\n" + sample_rule, f"SAMPLE {idx + 1}", sample_rule, sep="\n")

    for col in train_df.columns:
        content = str(row[col])
        shown = (
            content
            if len(content) <= 500
            else content[:500] + f"... (truncated, total length: {len(content)} chars)"
        )
        print(f"\n[{col.upper()}]:")
        print(shown)

    print("\n" + sample_rule)

print("\n" + "=" * 80)
print("TEXT LENGTH STATISTICS")
print("=" * 80)

# Measure the text columns by name ('article' / 'highlights' are the columns the
# preprocessing Dataset reads) rather than by position, so that a leading
# non-text column cannot be measured by mistake.
# The presence check is done before any helper column is added to the frame.
has_summary = 'highlights' in train_df.columns

# Character counts; astype(str) keeps missing values from raising in len().
train_df['article_length'] = train_df['article'].astype(str).str.len()
train_df['summary_length'] = (
    train_df['highlights'].astype(str).str.len() if has_summary else 0
)

print("\nArticle Length Statistics:")
print(train_df['article_length'].describe())

if has_summary:
    print("\nSummary Length Statistics:")
    print(train_df['summary_length'].describe())

    # Ratio of mean article length to mean summary length (in characters).
    print(f"\nAverage Compression Ratio: {train_df['article_length'].mean() / train_df['summary_length'].mean():.2f}x")

print("\n" + "=" * 80)
print("EXPLORATION COMPLETE")
print("=" * 80)

In [None]:
!pip install --upgrade transformers huggingface_hub

In [None]:
"""
KAGGLE SETUP - RUN THIS CELL FIRST
This will fix the transformers library compatibility issues
"""

print("="*80)
print("SETTING UP ENVIRONMENT FOR T5 SUMMARIZATION")
print("="*80)

print("\nUpgrading transformers library (this may take 1-2 minutes)...")

# Upgrade transformers and dependencies
!pip install --upgrade --quiet transformers==4.35.0 huggingface_hub tokenizers

print("\n✓ Libraries updated successfully!")

print("\n" + "="*80)
print("⚠️ IMPORTANT: RESTART KERNEL NOW!")
print("="*80)
print("\nSteps:")
print("1. Click 'Kernel' → 'Restart Kernel' in the top menu")
print("2. After restart, run the preprocessing script")
print("="*80)

## Test processing

In [None]:
"""
CNN/DailyMail Preprocessing Script for T5 Summarization
Step 1: Data Preprocessing - FIXED FOR KAGGLE
"""

import pandas as pd
import numpy as np
from transformers import AutoTokenizer  # Using AutoTokenizer instead of T5Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import re
import warnings
warnings.filterwarnings('ignore')

# Section banner for the preprocessing step.
print(
    "=" * 80,
    "STEP 1: PREPROCESSING SCRIPT FOR T5 SUMMARIZATION",
    "=" * 80,
    sep="\n",
)

# ============================================================================
# 1. CONFIGURATION
# ============================================================================
class Config:
    """Preprocessing settings read by the tokenizer, datasets and data loaders."""

    # Checkpoint the tokenizer is loaded from ("t5-base" is a larger alternative).
    MODEL_NAME = "t5-small"

    # Token limits used when encoding articles (source) and summaries (target).
    MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH = 512, 128

    # Examples per DataLoader batch; adjust to the available GPU memory.
    BATCH_SIZE = 8

    # Training rows kept for quick experiments (validation/test keep a tenth).
    # Use a falsy value (None or 0) to train on the full splits; the attribute
    # itself must stay defined because it is read unconditionally.
    NUM_SAMPLES = 1000
    
config = Config()

# Echo the active preprocessing settings.
print(
    "\nConfiguration:",
    f"  Model: {config.MODEL_NAME}",
    f"  Max source length: {config.MAX_SOURCE_LENGTH}",
    f"  Max target length: {config.MAX_TARGET_LENGTH}",
    f"  Batch size: {config.BATCH_SIZE}",
    sep="\n",
)

# ============================================================================
# 2. LOAD TOKENIZER - FIXED VERSION
# ============================================================================
print("\n" + "=" * 80, "Loading T5 Tokenizer...", "=" * 80, sep="\n")

# AutoTokenizer resolves the tokenizer class from the checkpoint name; it is
# used here instead of T5Tokenizer to avoid the chat template issue.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
print("✓ Tokenizer loaded successfully")
print(f"  Vocabulary size: {tokenizer.vocab_size}")

# ============================================================================
# 3. TEXT CLEANING FUNCTIONS
# ============================================================================
def clean_text(text):
    """Normalize raw text before tokenization.

    Whitespace runs are collapsed to single spaces, non-printable control
    characters are removed, and leading/trailing whitespace is stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (what pandas yields for an empty CSV cell)
            are treated as missing.

    Returns:
        str: The cleaned text, or ``''`` for a missing value.
    """
    # A missing value must not turn into the literal text "None" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Collapse whitespace first so whitespace-like separators (\t, \n, \x0b,
    # \x0c, ...) still separate the words around them.
    text = re.sub(r'\s+', ' ', text)

    # Remove the remaining control characters.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

    # Dropping a control character can leave two spaces adjacent or expose
    # leading/trailing whitespace, so collapse and strip once more.
    return re.sub(r' {2,}', ' ', text).strip()

def preprocess_text(article, summary):
    """Clean an (article, summary) pair and add the T5 task prefix.

    Both texts go through ``clean_text``; the article is then prefixed with
    ``"summarize: "``, the task prefix T5 expects for summarization.

    Returns:
        tuple: ``(prefixed_article, cleaned_summary)``.
    """
    cleaned_article, cleaned_summary = (
        clean_text(part) for part in (article, summary)
    )
    return "summarize: " + cleaned_article, cleaned_summary

# ============================================================================
# 4. LOAD DATASETS
# ============================================================================
print(f"\n{'='*80}")
print("Loading CNN/DailyMail Datasets...")
print(f"{'='*80}")

# Read the three CSV splits from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
val_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv')
test_df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv')

print(f"✓ Original sizes:")
print(f"  Train: {len(train_df):,}")
print(f"  Validation: {len(val_df):,}")
print(f"  Test: {len(test_df):,}")

# Optional down-sampling for quick experiments; a falsy NUM_SAMPLES keeps the
# full splits.
if config.NUM_SAMPLES:
    # Validation/test keep a tenth of the training subset, but never fewer than
    # one row: with NUM_SAMPLES < 10 the integer division would otherwise leave
    # them empty, giving zero batches downstream.
    eval_samples = max(1, config.NUM_SAMPLES // 10)
    train_df = train_df.head(config.NUM_SAMPLES)
    val_df = val_df.head(eval_samples)
    test_df = test_df.head(eval_samples)
    print(f"\n⚠ Using subset for testing:")
    print(f"  Train: {len(train_df):,}")
    print(f"  Validation: {len(val_df):,}")
    print(f"  Test: {len(test_df):,}")

# ============================================================================
# 5. CUSTOM DATASET CLASS
# ============================================================================
class SummarizationDataset(Dataset):
    """Map-style dataset yielding tokenized (article, summary) pairs for T5.

    Each row is cleaned and prefixed through ``preprocess_text``; the article
    and the summary are then tokenized separately and padded/truncated to
    fixed lengths.

    Args:
        dataframe: pandas DataFrame with ``'article'`` and ``'highlights'``
            columns.
        tokenizer: Callable tokenizer that exposes ``pad_token_id``.
        max_source_length: Fixed token length of the encoded article.
        max_target_length: Fixed token length of the encoded summary.
    """

    def __init__(self, dataframe, tokenizer, max_source_length, max_target_length):
        # Drop the caller's index; rows are addressed by position below.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize one string to exactly ``max_length`` ids as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        row = self.data.iloc[idx]

        # Clean both texts and add the "summarize: " prefix to the article.
        article, summary = preprocess_text(row['article'], row['highlights'])

        source_encoding = self._encode(article, self.max_source_length)
        target_encoding = self._encode(summary, self.max_target_length)

        # Padding positions in the target are set to -100 so the loss skips them.
        labels = target_encoding['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # The tokenizer output carries a leading batch dimension of size 1 for a
        # single string. Remove only that dimension: a bare squeeze() would also
        # drop the sequence dimension when a max length is 1.
        return {
            'input_ids': source_encoding['input_ids'].squeeze(0),
            'attention_mask': source_encoding['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0)
        }

# ============================================================================
# 6. CREATE DATASETS
# ============================================================================
print("\n" + "=" * 80, "Creating PyTorch Datasets...", "=" * 80, sep="\n")

# Wrap every split with the same tokenizer and the same length limits.
train_dataset, val_dataset, test_dataset = [
    SummarizationDataset(
        split_df,
        tokenizer,
        config.MAX_SOURCE_LENGTH,
        config.MAX_TARGET_LENGTH,
    )
    for split_df in (train_df, val_df, test_df)
]

print("✓ Datasets created:")
print(f"  Train dataset size: {len(train_dataset):,}")
print(f"  Validation dataset size: {len(val_dataset):,}")
print(f"  Test dataset size: {len(test_dataset):,}")

# ============================================================================
# 7. CREATE DATALOADERS
# ============================================================================
print("\n" + "=" * 80, "Creating DataLoaders...", "=" * 80, sep="\n")

# Options shared by all three loaders; only the training loader shuffles.
loader_options = dict(
    batch_size=config.BATCH_SIZE,
    num_workers=2,
    pin_memory=True,
)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print("✓ DataLoaders created:")
print(f"  Train batches: {len(train_loader):,}")
print(f"  Validation batches: {len(val_loader):,}")
print(f"  Test batches: {len(test_loader):,}")

# ============================================================================
# 8. VERIFY PREPROCESSING
# ============================================================================
print("\n" + "=" * 80, "Verifying Preprocessing - Sample Batch", "=" * 80, sep="\n")

# Pull a single batch from the training loader for inspection.
sample_batch = next(iter(train_loader))

print("\nBatch shapes:")
for shape_label, batch_key in (
    ("Input IDs", 'input_ids'),
    ("Attention Mask", 'attention_mask'),
    ("Labels", 'labels'),
):
    print(f"  {shape_label}: {sample_batch[batch_key].shape}")

batch_rule = '─' * 80

print("\nDecoding first sample:")
print("\n" + batch_rule, "INPUT (Article):", batch_rule, sep="\n")
decoded_input = tokenizer.decode(sample_batch['input_ids'][0], skip_special_tokens=True)
# Show at most the first 500 characters of the decoded article.
if len(decoded_input) > 500:
    print(decoded_input[:500] + "...")
else:
    print(decoded_input)

print("\n" + batch_rule, "TARGET (Summary):", batch_rule, sep="\n")
# Labels use -100 at padded positions; map those back to the pad token id so
# the tokenizer can decode the sequence.
first_labels = sample_batch['labels'][0]
labels_for_decode = first_labels.masked_fill(first_labels == -100, tokenizer.pad_token_id)
decoded_target = tokenizer.decode(labels_for_decode, skip_special_tokens=True)
print(decoded_target)

# ============================================================================
# 9. PREPROCESSING STATISTICS
# ============================================================================
print("\n" + "=" * 80, "Preprocessing Statistics", "=" * 80, sep="\n")

# Compare raw character counts with untruncated token counts for the first
# training row.
first_row = train_df.iloc[0]
sample_article, sample_summary = first_row['article'], first_row['highlights']

preprocessed_article, preprocessed_summary = preprocess_text(sample_article, sample_summary)
tokenized_article = tokenizer(preprocessed_article, truncation=False)
tokenized_summary = tokenizer(preprocessed_summary, truncation=False)

article_tokens = len(tokenized_article['input_ids'])
summary_tokens = len(tokenized_summary['input_ids'])

print("\nSample text lengths:")
print(f"  Original article chars: {len(sample_article):,}")
print(f"  Preprocessed article tokens: {article_tokens}")
print(f"  Original summary chars: {len(sample_summary):,}")
print(f"  Preprocessed summary tokens: {summary_tokens}")
print(f"  Will truncate articles longer than: {config.MAX_SOURCE_LENGTH} tokens")
print(f"  Will truncate summaries longer than: {config.MAX_TARGET_LENGTH} tokens")

print("\n" + "=" * 80, "✓ PREPROCESSING COMPLETE!", "=" * 80, sep="\n")
print("\nData is ready for fine-tuning!")
print("Next step: Fine-tuning the T5 model")

## T5 Fine-tuning

In [None]:
"""
CNN/DailyMail Fine-tuning Script for T5 Summarization
Step 2: Model Fine-tuning
"""

import torch
from transformers import T5ForConditionalGeneration, get_linear_schedule_with_warmup
from torch.optim import AdamW  # Import AdamW from PyTorch instead
from tqdm.auto import tqdm
import numpy as np
import time
import os

# Section banner for the fine-tuning step.
print(
    "=" * 80,
    "STEP 2: FINE-TUNING T5 MODEL",
    "=" * 80,
    sep="\n",
)

# ============================================================================
# 1. TRAINING CONFIGURATION
# ============================================================================
class TrainingConfig:
    """Hyper-parameters, output locations and device used for fine-tuning."""

    # Pre-trained checkpoint to fine-tune.
    MODEL_NAME = "t5-small"

    # Where the best model, training curves and history are written, and the
    # sub-directory that receives one checkpoint per epoch.
    OUTPUT_DIR = "/kaggle/working/t5_summarization"
    CHECKPOINT_DIR = os.path.join(OUTPUT_DIR, "checkpoints")

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Optimisation schedule.
    EPOCHS = 3                 # passes over the training loader
    LEARNING_RATE = 3e-4       # peak learning rate handed to the optimizer
    WARMUP_STEPS = 500         # scheduler warmup steps
    GRADIENT_ACCUMULATION = 4  # batches accumulated per optimizer step (8 * 4 = 32 effective)
    MAX_GRAD_NORM = 1.0        # gradient-norm clipping threshold
    
training_config = TrainingConfig()

# Make sure both output locations exist before anything is written to them.
for output_location in (training_config.OUTPUT_DIR, training_config.CHECKPOINT_DIR):
    os.makedirs(output_location, exist_ok=True)

# Echo the active training settings.
print(
    "\nTraining Configuration:",
    f"  Model: {training_config.MODEL_NAME}",
    f"  Device: {training_config.DEVICE}",
    f"  Epochs: {training_config.EPOCHS}",
    f"  Learning Rate: {training_config.LEARNING_RATE}",
    f"  Gradient Accumulation Steps: {training_config.GRADIENT_ACCUMULATION}",
    f"  Output Directory: {training_config.OUTPUT_DIR}",
    sep="\n",
)

# Describe GPU 0 when CUDA is available; otherwise warn about CPU training.
if not torch.cuda.is_available():
    print("  ⚠ Running on CPU - Training will be slower!")
else:
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    gpu_memory_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"  GPU Memory: {gpu_memory_gib:.2f} GB")

# ============================================================================
# 2. LOAD PRE-TRAINED MODEL
# ============================================================================
print("\n" + "=" * 80, "Loading Pre-trained T5 Model...", "=" * 80, sep="\n")

# Load the pre-trained weights and move the module to the training device.
model = T5ForConditionalGeneration.from_pretrained(training_config.MODEL_NAME)
model.to(training_config.DEVICE)

# Tally all parameters and the subset that will receive gradients.
total_params = 0
trainable_params = 0
for parameter in model.parameters():
    n_elements = parameter.numel()
    total_params += n_elements
    if parameter.requires_grad:
        trainable_params += n_elements

print("✓ Model loaded successfully")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

# ============================================================================
# 3. SETUP OPTIMIZER AND SCHEDULER
# ============================================================================
print(f"\n{'='*80}")
print("Setting up Optimizer and Scheduler...")
print(f"{'='*80}")

# Optimizer over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=training_config.LEARNING_RATE,
    eps=1e-8
)

# Optimizer steps are taken once per GRADIENT_ACCUMULATION batches within each
# epoch. Round up per epoch so the schedule is never shorter than the number of
# optimizer steps actually taken.
accumulation = training_config.GRADIENT_ACCUMULATION
steps_per_epoch = (len(train_loader) + accumulation - 1) // accumulation
total_steps = steps_per_epoch * training_config.EPOCHS

# A warmup that is as long as (or longer than) the whole run means the learning
# rate never reaches its configured peak and never decays. This happens with
# small subsets, so fall back to 10% of the run in that case.
warmup_steps = training_config.WARMUP_STEPS
if warmup_steps >= total_steps:
    warmup_steps = total_steps // 10
    print(
        f"  ⚠ WARMUP_STEPS ({training_config.WARMUP_STEPS}) >= total training steps "
        f"({total_steps}); using {warmup_steps} warmup steps instead"
    )

# Linear warmup followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print(f"✓ Optimizer and scheduler configured")
print(f"  Total training steps: {total_steps:,}")
print(f"  Warmup steps: {warmup_steps}")

# ============================================================================
# 4. TRAINING FUNCTIONS
# ============================================================================

def train_epoch(model, dataloader, optimizer, scheduler, device, epoch, gradient_accumulation,
                max_grad_norm=None):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Args:
        model: Module whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        dataloader: Iterable of batches (dicts with the three keys above) that
            also supports ``len()``.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: Learning-rate scheduler stepped together with the optimizer.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only for the progress-bar label.
        gradient_accumulation: Number of batches whose gradients are summed
            before each optimizer step.
        max_grad_norm: Gradient-norm clipping threshold. Defaults to
            ``training_config.MAX_GRAD_NORM`` when omitted.

    Returns:
        float: Mean unscaled per-batch loss over the epoch, on the same scale
        as the value returned by ``validate``.
    """
    if max_grad_norm is None:
        max_grad_norm = training_config.MAX_GRAD_NORM

    model.train()
    num_batches = len(dataloader)
    total_loss = 0.0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch}")

    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        # Track the unscaled loss for reporting; only the value used for
        # backpropagation is divided by the accumulation factor.
        batch_loss = outputs.loss.item()
        total_loss += batch_loss

        # Backward pass (gradients add up across the accumulation group)
        (outputs.loss / gradient_accumulation).backward()

        # Step after every full group, and also after the final batch so the
        # gradients of a trailing partial group are applied rather than thrown
        # away. That last group stays scaled by 1/gradient_accumulation, so it
        # contributes a proportionally smaller update.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'lr': f'{scheduler.get_last_lr()[0]:.2e}'
        })

    avg_loss = total_loss / num_batches
    return avg_loss

def validate(model, dataloader, device):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is put in eval mode and every forward pass runs with gradient
    tracking disabled.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            # Move the three tensors the model consumes onto the target device.
            model_inputs = {
                field: batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'labels')
            }
            batch_losses.append(model(**model_inputs).loss.item())

    return sum(batch_losses) / len(dataloader)

# ============================================================================
# 5. TRAINING LOOP
# ============================================================================
print("\n" + "=" * 80, "Starting Training...", "=" * 80, sep="\n")

# Per-epoch metrics, stored as parallel lists.
history = dict(train_loss=[], val_loss=[], epochs=[])

best_val_loss = float('inf')
start_time = time.time()
epoch_rule = '─' * 80

for epoch in range(1, training_config.EPOCHS + 1):
    print("\n" + epoch_rule, f"Epoch {epoch}/{training_config.EPOCHS}", epoch_rule, sep="\n")

    # One pass over the training data, then one over the validation data.
    train_loss = train_epoch(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=training_config.DEVICE,
        epoch=epoch,
        gradient_accumulation=training_config.GRADIENT_ACCUMULATION,
    )
    val_loss = validate(model, val_loader, training_config.DEVICE)

    # Record this epoch's metrics.
    for metric_name, metric_value in (
        ('train_loss', train_loss),
        ('val_loss', val_loss),
        ('epochs', epoch),
    ):
        history[metric_name].append(metric_value)

    print(f"\nEpoch {epoch} Summary:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")

    # Keep a copy of the model and tokenizer whenever validation loss improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = os.path.join(training_config.OUTPUT_DIR, "best_model")
        for artifact in (model, tokenizer):
            artifact.save_pretrained(best_model_path)
        print(f"  ✓ Best model saved! (Val Loss: {val_loss:.4f})")

    # A checkpoint is written every epoch, whether or not it was the best one.
    checkpoint_path = os.path.join(training_config.CHECKPOINT_DIR, f"checkpoint_epoch_{epoch}")
    for artifact in (model, tokenizer):
        artifact.save_pretrained(checkpoint_path)
    print(f"  ✓ Checkpoint saved: {checkpoint_path}")

# ============================================================================
# 6. TRAINING COMPLETE
# ============================================================================
# Wall-clock duration of the whole training loop.
end_time = time.time()
training_time = end_time - start_time
elapsed_minutes = training_time / 60
elapsed_hours = training_time / 3600

print("\n" + "=" * 80, "✓ TRAINING COMPLETE!", "=" * 80, sep="\n")
print("\nTraining Summary:")
print(f"  Total time: {elapsed_minutes:.2f} minutes ({elapsed_hours:.2f} hours)")
print(f"  Best validation loss: {best_val_loss:.4f}")
# The last entries of the history lists belong to the final epoch.
final_train_loss = history['train_loss'][-1]
print(f"  Final train loss: {final_train_loss:.4f}")
final_val_loss = history['val_loss'][-1]
print(f"  Final val loss: {final_val_loss:.4f}")

# ============================================================================
# 7. PLOT TRAINING CURVES
# ============================================================================
print("\n" + "=" * 80, "Training Loss Curves", "=" * 80, sep="\n")

import matplotlib.pyplot as plt

# Train and validation loss per epoch on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(history['epochs'], history['train_loss'], label='Train Loss', marker='o')
ax.plot(history['epochs'], history['val_loss'], label='Validation Loss', marker='s')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('T5 Training Progress')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()

# Save the figure next to the model outputs, then display it.
curves_path = os.path.join(training_config.OUTPUT_DIR, 'training_curves.png')
fig.savefig(curves_path, dpi=300)
print("✓ Training curves saved!")
plt.show()

# ============================================================================
# 8. SAVE TRAINING HISTORY
# ============================================================================
import json

# Write the per-epoch metrics as indented JSON next to the model outputs.
history_file = os.path.join(training_config.OUTPUT_DIR, 'training_history.json')
history_json = json.dumps(history, indent=4)
with open(history_file, 'w') as history_fp:
    history_fp.write(history_json)
print(f"✓ Training history saved: {history_file}")

# Final pointers to everything produced by this step.
best_model_dir = os.path.join(training_config.OUTPUT_DIR, 'best_model')
print(
    "\n" + "=" * 80,
    "Model saved at:",
    f"  Best Model: {best_model_dir}",
    f"  Checkpoints: {training_config.CHECKPOINT_DIR}",
    "=" * 80,
    sep="\n",
)
print("\nNext step: Evaluation using ROUGE metrics")