## CELL 1: Environment Setup & Load Previous Results

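Import libraries, select the device, seed the random number generators, and load the configuration, class mappings, and data splits produced by earlier notebooks. (The best DDP-trained model itself is loaded in Cell 3.)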


In [1]:
# ============================================================
# CELL 1: ENVIRONMENT SETUP & IMPORTS
# ============================================================

"""
This cell:
1. Imports all required libraries
2. Loads configuration from Notebook 1
3. Loads class mappings and train/val/test split indices (the best model from Notebook 4 is loaded in Cell 3)
4. Sets up device and random seeds
"""

import os
import sys
import json
import pickle
import random
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Vision
import torchvision
from torchvision import transforms
from PIL import Image

# Data & ML
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

# timm for ViT
import timm

# ------------------------------------------------------------
# Environment report
# ------------------------------------------------------------
cuda_available = torch.cuda.is_available()

print("="*80)
print("NOTEBOOK 6: ADVERSARIAL FINE-TUNING & CBAM ATTENTION")
print("="*80)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {cuda_available}")
print(f"Device Count: {torch.cuda.device_count()}")
if cuda_available:
    print(f"Active GPU: {torch.cuda.get_device_name(0)}")
print("="*80)

# Single-device notebook: use the first GPU when there is one, otherwise the CPU.
device = torch.device('cuda:0') if cuda_available else torch.device('cpu')
print(f"\n✓ Using device: {device}")

# Seed every RNG the notebook draws from (PyTorch, NumPy, Python).
RANDOM_SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(RANDOM_SEED)
if cuda_available:
    torch.cuda.manual_seed(RANDOM_SEED)
    # Ask cuDNN for deterministic kernels so repeated runs match.
    torch.backends.cudnn.deterministic = True

print(f"✓ Random seeds set to {RANDOM_SEED}")

# ============================================================
# LOAD CONFIGURATION
# ============================================================

base_dir = Path('./novelty_files')

# Configuration written by Notebook 1.
config_path = base_dir / 'configs' / 'notebook_01_config.json'
CONFIG = json.loads(config_path.read_text())

print(f"\n✓ Loaded configuration from {config_path}")

# Class name <-> index mappings. JSON object keys are always strings,
# so the index -> name mapping is re-keyed with integers.
dist_path = base_dir / 'splits' / 'class_distribution.json'
dist_data = json.loads(dist_path.read_text())

class_to_idx = dist_data['class_to_idx']
idx_to_class = dict(
    (int(index), name) for index, name in dist_data['idx_to_class'].items()
)

print(f"✓ Loaded class mappings ({len(class_to_idx)} classes)")

# Train/val/test index lists.
# NOTE: pickle can execute arbitrary code on load; only read split files
# produced by this project's own notebooks.
train_indices, val_indices, test_indices = (
    pickle.loads((base_dir / 'splits' / f'{split_name}_indices.pkl').read_bytes())
    for split_name in ('train', 'val', 'test')
)

print(f"✓ Loaded splits: {len(train_indices):,} train, {len(val_indices):,} val, {len(test_indices):,} test")

print("\n" + "="*80)
print("INITIALIZATION COMPLETE")
print("="*80)

NOTEBOOK 6: ADVERSARIAL FINE-TUNING & CBAM ATTENTION
PyTorch Version: 2.9.1+cu128
CUDA Available: True
Device Count: 8
Active GPU: NVIDIA H200

✓ Using device: cuda:0
✓ Random seeds set to 42

✓ Loaded configuration from novelty_files/configs/notebook_01_config.json
✓ Loaded class mappings (8 classes)
✓ Loaded splits: 53,097 train, 11,379 val, 11,379 test

INITIALIZATION COMPLETE


## CELL 2: Implement CBAM (Convolutional Block Attention Module)

CBAM applies both channel attention and spatial attention to feature maps.


In [2]:
# ============================================================
# CELL 2: CBAM ATTENTION MODULE
# ============================================================

"""
CBAM (Convolutional Block Attention Module):
- Channel Attention: Learns WHAT to focus on
- Spatial Attention: Learns WHERE to focus

Paper: "CBAM: Convolutional Block Attention Module" (ECCV 2018)
"""

# Section banner for the CBAM cell (blank line, rule, title, rule).
print("\n" + "="*80, "IMPLEMENTING CBAM ATTENTION", "="*80, sep="\n")

# ============================================================
# CHANNEL ATTENTION MODULE
# ============================================================

class ChannelAttention(nn.Module):
    """
    Channel Attention Module.

    Squeezes the spatial dimensions with both global average pooling and
    global max pooling, passes each descriptor through the same bottleneck
    MLP, sums the two results, and gates the input channels with a sigmoid.

    Args:
        in_channels: Number of input channels
        reduction: Channel reduction ratio for the MLP bottleneck (default: 16).
            The bottleneck always keeps at least one hidden unit, so
            in_channels smaller than reduction is supported.
    """
    def __init__(self, in_channels, reduction=16):
        super(ChannelAttention, self).__init__()

        # Floor division alone gives 0 hidden units when in_channels < reduction;
        # a zero-width bottleneck outputs all zeros, i.e. a constant 0.5 gate.
        hidden_channels = max(1, in_channels // reduction)

        # Shared MLP for both pooling operations
        self.mlp = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels, bias=False)
        )

        # Sigmoid for final attention weights
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Args:
            x: Input tensor (batch, channels, height, width)
        Returns:
            Attended features (batch, channels, height, width)
        """
        batch_size = x.size(0)

        # Global average / max pooling: (B, C, H, W) -> (B, C)
        avg_pool = F.adaptive_avg_pool2d(x, 1).view(batch_size, -1)
        max_pool = F.adaptive_max_pool2d(x, 1).view(batch_size, -1)

        # Same MLP weights for both descriptors; combine before the sigmoid
        channel_attention = self.sigmoid(self.mlp(avg_pool) + self.mlp(max_pool))

        # (B, C) -> (B, C, 1, 1) so the gate broadcasts over H and W
        channel_attention = channel_attention.unsqueeze(-1).unsqueeze(-1)

        return x * channel_attention


# ============================================================
# SPATIAL ATTENTION MODULE
# ============================================================

class SpatialAttention(nn.Module):
    """
    Spatial Attention Module.

    Collapses the channel dimension with both mean and max, runs a single
    convolution over the two resulting maps, and gates every spatial
    location of the input with the sigmoid of that convolution.

    Args:
        kernel_size: Convolution kernel size; must be a positive odd integer
            so that symmetric padding preserves height and width (default: 7)

    Raises:
        ValueError: If kernel_size is even or smaller than 1.
    """
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()

        # With an even kernel, (kernel_size - 1) // 2 padding shrinks the map by
        # one pixel and the multiplication in forward() fails to broadcast.
        # Fail here with a clear message instead.
        if kernel_size < 1 or kernel_size % 2 == 0:
            raise ValueError(
                f"kernel_size must be a positive odd integer, got {kernel_size}"
            )

        padding = (kernel_size - 1) // 2

        # Convolutional layer to generate spatial attention map
        self.conv = nn.Conv2d(
            2,  # Input: concatenated avg and max pooling
            1,  # Output: single attention map
            kernel_size=kernel_size,
            padding=padding,
            bias=False
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Args:
            x: Input tensor (batch, channels, height, width)
        Returns:
            Attended features (batch, channels, height, width)
        """
        # Channel-wise statistics: (B, C, H, W) -> (B, 1, H, W) each
        avg_pool = torch.mean(x, dim=1, keepdim=True)
        max_pool, _ = torch.max(x, dim=1, keepdim=True)

        # Stack the two maps: (B, 2, H, W)
        concat = torch.cat([avg_pool, max_pool], dim=1)

        # (B, 1, H, W) gate, broadcast over the channel dimension
        spatial_attention = self.sigmoid(self.conv(concat))

        return x * spatial_attention


# ============================================================
# CBAM MODULE (CHANNEL + SPATIAL)
# ============================================================

class CBAM(nn.Module):
    """
    Convolutional Block Attention Module: channel attention followed by
    spatial attention, applied to a (batch, channels, height, width) tensor.

    Args:
        in_channels: Number of input channels
        reduction: Channel reduction ratio for channel attention
        kernel_size: Kernel size for spatial attention
    """
    def __init__(self, in_channels, reduction=16, kernel_size=7):
        super().__init__()

        # Submodule names are part of the checkpoint key layout; keep them stable.
        self.channel_attention = ChannelAttention(in_channels, reduction)
        self.spatial_attention = SpatialAttention(kernel_size)

    def forward(self, x):
        """
        Args:
            x: Input tensor (batch, channels, height, width)
        Returns:
            Attended features (batch, channels, height, width)
        """
        # Order matters: re-weight channels first, then spatial locations.
        channel_refined = self.channel_attention(x)
        return self.spatial_attention(channel_refined)


# Report what the cell defined.
for _line in (
    "✓ CBAM modules implemented:",
    "  • ChannelAttention - Focus on important channels",
    "  • SpatialAttention - Focus on important regions",
    "  • CBAM - Combined channel + spatial attention",
):
    print(_line)

# Smoke test: one forward pass on a tensor shaped like ViT-Base patch features
# (768 channels on a 14x14 grid); CBAM must preserve the shape.
print("\nTesting CBAM module...")
test_input = torch.randn(2, 768, 14, 14).to(device)
cbam = CBAM(in_channels=768).to(device)
test_output = cbam(test_input)

for _label, _tensor in (("Input shape: ", test_input), ("Output shape:", test_output)):
    print(f"  {_label} {_tensor.shape}")
print(f"  ✓ CBAM forward pass successful")

print("\n" + "="*80)


IMPLEMENTING CBAM ATTENTION
✓ CBAM modules implemented:
  • ChannelAttention - Focus on important channels
  • SpatialAttention - Focus on important regions
  • CBAM - Combined channel + spatial attention

Testing CBAM module...


  Input shape:  torch.Size([2, 768, 14, 14])
  Output shape: torch.Size([2, 768, 14, 14])
  ✓ CBAM forward pass successful



## CELL 3: Load Best Model & Integrate CBAM

Load the best model from Notebook 4 and add CBAM attention.


In [3]:
# ============================================================
# CELL 3: LOAD BEST MODEL & INTEGRATE CBAM
# ============================================================

"""
This cell:
1. Loads the best model from Notebook 4 (DDP training)
2. Creates a new model with CBAM attention integrated
3. Transfers weights from the pretrained model
"""

print("\n" + "="*80)
print("LOADING BEST MODEL & INTEGRATING CBAM")
print("="*80)

# ============================================================
# LOCATE THE CHECKPOINT
# ============================================================

checkpoint_dir = base_dir / 'checkpoints'

# Preference order: the DDP-trained model, then the plain ViT baseline.
best_model_path = checkpoint_dir / 'ddp_best_model.pt'
if not best_model_path.exists():
    best_model_path = checkpoint_dir / 'vit_baseline.pt'
    print(f"⚠ DDP model not found, using ViT baseline instead")

    if not best_model_path.exists():
        raise FileNotFoundError(
            f"No trained model found!\n"
            f"Please run Notebook 4 (DDP training) first."
        )

print(f"Loading model from: {best_model_path}")

# ============================================================
# BUILD VIT-BASE AND RESTORE ITS WEIGHTS
# ============================================================

# Architecture only; the weights come from the checkpoint below.
base_model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=8)

# Load onto the CPU first so restoring does not depend on the saving device.
checkpoint = torch.load(best_model_path, map_location='cpu')

# The file is either a training checkpoint (weights under 'model_state_dict')
# or a bare state dict.
state_dict = checkpoint['model_state_dict'] if 'model_state_dict' in checkpoint else checkpoint

# DistributedDataParallel prefixes every key with 'module.'; strip it so the
# keys match the unwrapped model.
ddp_prefix = 'module.'
new_state_dict = {
    (name[len(ddp_prefix):] if name.startswith(ddp_prefix) else name): tensor
    for name, tensor in state_dict.items()
}

base_model.load_state_dict(new_state_dict)
print(f"✓ Loaded pretrained weights")

if 'val_accuracy' in checkpoint:
    print(f"  Pretrained model accuracy: {checkpoint['val_accuracy']:.2f}%")
# ============================================================
# CREATE VIT + CBAM MODEL
# ============================================================

class ViTWithCBAM(nn.Module):
    """
    Vision Transformer with CBAM attention.

    Pipeline:
    1. Runs the backbone's patch embedding, positional embedding, transformer
       blocks and final norm
    2. Drops the prefix ([CLS]) token(s) and folds the patch tokens into a
       2D feature map
    3. Applies CBAM attention
    4. Global-average-pools and applies a new classification head
       (the backbone's own head is not used)

    Args:
        vit_model: ViT backbone exposing patch_embed, _pos_embed, blocks and
            norm (the timm model built in this notebook does)
        num_classes: Number of output classes (default: 8)

    Raises:
        ValueError: In forward(), if the patch tokens cannot be arranged on a
            2D grid.
    """
    def __init__(self, vit_model, num_classes=8):
        super(ViTWithCBAM, self).__init__()

        self.vit = vit_model
        self.num_classes = num_classes

        # Read the token width from the backbone; fall back to ViT-Base's 768
        # when the backbone does not expose it.
        self.embed_dim = getattr(vit_model, 'embed_dim', 768)

        # Number of leading non-patch tokens to drop; defaults to the single
        # [CLS] token when the backbone does not expose the attribute.
        self.num_prefix_tokens = getattr(vit_model, 'num_prefix_tokens', 1)

        # CBAM for attention
        self.cbam = CBAM(in_channels=self.embed_dim, reduction=16)

        # New classification head (will be trained)
        self.classifier = nn.Sequential(
            nn.LayerNorm(self.embed_dim),
            nn.Dropout(0.1),
            nn.Linear(self.embed_dim, num_classes)
        )

    def forward(self, x):
        """
        Args:
            x: Input images (batch, 3, height, width)
        Returns:
            Class logits (batch, num_classes)
        """
        # Token sequence from the backbone: (batch, prefix + num_patches, embed_dim)
        x = self.vit.patch_embed(x)
        x = self.vit._pos_embed(x)
        x = self.vit.blocks(x)
        x = self.vit.norm(x)

        # Keep patch tokens only: (batch, num_patches, embed_dim)
        patch_tokens = x[:, self.num_prefix_tokens:, :]
        batch_size, num_patches, channels = patch_tokens.shape

        # Prefer the grid the patch embedding reports; otherwise assume a
        # square grid (14 x 14 for 224x224 inputs with 16x16 patches).
        grid = getattr(self.vit.patch_embed, 'grid_size', None)
        if grid is not None and grid[0] * grid[1] == num_patches:
            h, w = grid
        else:
            h = w = int(round(num_patches ** 0.5))
            if h * w != num_patches:
                raise ValueError(
                    f"Cannot arrange {num_patches} patch tokens on a square grid"
                )

        # (batch, num_patches, C) -> (batch, C, h, w)
        spatial_features = patch_tokens.transpose(1, 2).reshape(batch_size, channels, h, w)

        # Apply CBAM attention
        attended_features = self.cbam(spatial_features)

        # Global average pooling: (batch, C, h, w) -> (batch, C)
        pooled = F.adaptive_avg_pool2d(attended_features, 1).view(batch_size, -1)

        # Classification
        logits = self.classifier(pooled)

        return logits

# Wrap the restored backbone with CBAM + a new head and move it to the device.
model = ViTWithCBAM(base_model, num_classes=8).to(device)

print(f"✓ Created ViT + CBAM model")

# Parameter counts (all parameters vs. those that will receive gradients).
_param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in _param_sizes)
trainable_params = sum(count for count, needs_grad in _param_sizes if needs_grad)

print(f"  Total parameters: {total_params/1e6:.1f}M")
print(f"  Trainable parameters: {trainable_params/1e6:.1f}M")

# Smoke test: a random batch of two 224x224 RGB images must yield (2, 8) logits.
print(f"\nTesting forward pass...")
test_input = torch.randn(2, 3, 224, 224).to(device)
with torch.no_grad():
    test_output = model(test_input)

for _label, _tensor in (("Input shape: ", test_input), ("Output shape:", test_output)):
    print(f"  {_label} {_tensor.shape}")
print(f"  ✓ Forward pass successful")

print("\n" + "="*80)


LOADING BEST MODEL & INTEGRATING CBAM
Loading model from: novelty_files/checkpoints/ddp_best_model.pt


✓ Loaded pretrained weights
✓ Created ViT + CBAM model
  Total parameters: 85.9M
  Trainable parameters: 85.9M

Testing forward pass...
  Input shape:  torch.Size([2, 3, 224, 224])
  Output shape: torch.Size([2, 8])
  ✓ Forward pass successful



## CELL 4: Implement PGD Adversarial Training

PGD (Projected Gradient Descent) is a multi-step attack that generates stronger adversarial examples than single-step FGSM; training on these examples makes the model more robust.


In [4]:
# ============================================================
# CELL 4: PGD ADVERSARIAL TRAINING
# ============================================================

"""
Projected Gradient Descent (PGD) Adversarial Training:
- Generates adversarial examples by iteratively perturbing inputs
- Projects perturbations to epsilon ball
- Trains model to be robust against these adversarial examples

PGD is stronger than FGSM (Fast Gradient Sign Method) because
it uses multiple iterations to find optimal adversarial direction.
"""

# Section banner for the PGD cell (blank line, rule, title, rule).
print("\n" + "="*80, "IMPLEMENTING PGD ADVERSARIAL TRAINING", "="*80, sep="\n")

def pgd_attack(model, images, labels, eps=8/255, alpha=2/255, num_steps=7,
               clip_min=0.0, clip_max=1.0):
    """
    Generate adversarial examples using PGD (Projected Gradient Descent).

    PGD is an iterative attack that:
    1. Starts from a uniform random perturbation in [-eps, eps]
    2. Takes signed-gradient steps of size alpha that increase the loss
    3. Projects back into the L-infinity eps ball around the clean images
       and into the valid value range after every step

    The result is clamped to [clip_min, clip_max], so `images` are expected
    to lie in that range already (raw pixel intensities in [0, 1] by default).
    Mean/std-normalized tensors do not lie in [0, 1]; clamping them distorts
    the image far beyond eps. Either attack un-normalized images and
    normalize inside the model, or pass matching bounds.

    Only the gradient with respect to the input is computed: the `.grad`
    fields of the model parameters are left untouched.

    Args:
        model: Neural network model
        images: Clean input images (batch_size, channels, height, width)
        labels: Ground truth labels (batch_size,)
        eps: Maximum perturbation magnitude (L-infinity norm)
        alpha: Step size for each iteration
        num_steps: Number of PGD iterations
        clip_min: Smallest valid input value (default: 0.0)
        clip_max: Largest valid input value (default: 1.0)

    Returns:
        adv_images: Detached adversarial images, same shape as `images`
    """
    # The clean images are the fixed centre of the eps ball; keep them out of autograd.
    images = images.detach()

    # Random start inside the eps ball, clipped to the valid range
    adv_images = images + torch.empty_like(images).uniform_(-eps, eps)
    adv_images = torch.clamp(adv_images, min=clip_min, max=clip_max).detach()

    for _ in range(num_steps):
        adv_images.requires_grad_(True)

        # Loss to maximise
        loss = F.cross_entropy(model(adv_images), labels)

        # Input gradient only. loss.backward() would also accumulate into every
        # trainable parameter's .grad, which wastes work and pollutes the
        # gradients of whoever calls this outside a zero_grad()-guarded step.
        grad, = torch.autograd.grad(loss, adv_images)

        # Ascent step along the gradient sign
        adv_images = adv_images.detach() + alpha * grad.sign()

        # Project back to the eps ball around the clean images, then to the valid range
        perturbation = torch.clamp(adv_images - images, min=-eps, max=eps)
        adv_images = torch.clamp(images + perturbation, min=clip_min, max=clip_max).detach()

    return adv_images


def train_step_with_pgd(model, images, labels, optimizer, eps=8/255, alpha=2/255, num_steps=7):
    """
    Run one optimisation step on an equal mix of clean and PGD-adversarial loss.

    Steps:
    1. Forward the clean batch
    2. Craft adversarial versions of the batch with pgd_attack
    3. Forward the adversarial batch
    4. Backpropagate 0.5 * clean loss + 0.5 * adversarial loss, clip the
       gradient norm to 1.0, and step the optimizer

    Args:
        model: Neural network model (switched to train mode here)
        images: Clean input images
        labels: Ground truth labels
        optimizer: Optimizer (e.g., AdamW)
        eps: PGD epsilon parameter
        alpha: PGD step size
        num_steps: Number of PGD iterations

    Returns:
        Tuple of Python floats:
        (loss_clean, loss_adv, loss_total, acc_clean, acc_adv),
        with accuracies as fractions in [0, 1] measured on the pre-update outputs.
    """
    model.train()

    # Clean branch
    clean_logits = model(images)
    clean_loss = F.cross_entropy(clean_logits, labels)

    # Adversarial branch
    perturbed_images = pgd_attack(model, images, labels, eps, alpha, num_steps)
    adv_logits = model(perturbed_images)
    adv_loss = F.cross_entropy(adv_logits, labels)

    # Equal weighting of the two objectives
    total_loss = 0.5 * clean_loss + 0.5 * adv_loss

    # zero_grad comes after the attack, so only total_loss contributes gradients
    optimizer.zero_grad()
    total_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Batch accuracies from the logits computed before the update
    clean_pred = clean_logits.max(1)[1]
    adv_pred = adv_logits.max(1)[1]
    clean_acc = clean_pred.eq(labels).float().mean().item()
    adv_acc = adv_pred.eq(labels).float().mean().item()

    return clean_loss.item(), adv_loss.item(), total_loss.item(), clean_acc, adv_acc


# Summary of what this cell defined and the default attack settings.
for _line in (
    "✓ PGD adversarial training functions implemented:",
    "  • pgd_attack() - Generate adversarial examples",
    "  • train_step_with_pgd() - Train with adversarial examples",
    "\nPGD Parameters:",
    "  • Epsilon (max perturbation): 8/255 ≈ 0.031",
    "  • Alpha (step size): 2/255 ≈ 0.008",
    "  • Num steps: 7 iterations",
    "  • Expected: Stronger robustness than FGSM",
):
    print(_line)

print("\n" + "="*80)


IMPLEMENTING PGD ADVERSARIAL TRAINING
✓ PGD adversarial training functions implemented:
  • pgd_attack() - Generate adversarial examples
  • train_step_with_pgd() - Train with adversarial examples

PGD Parameters:
  • Epsilon (max perturbation): 8/255 ≈ 0.031
  • Alpha (step size): 2/255 ≈ 0.008
  • Num steps: 7 iterations
  • Expected: Stronger robustness than FGSM



## CELL 5: Create DataLoaders & Multi-Stage Fine-Tuning (RESUME-SAFE)

Train ViT + CBAM with PGD adversarial examples.  
**CRITICAL**: Checks if adversarial checkpoint already exists.

In [7]:
# ============================================================
# CELL 5: ADVERSARIAL FINE-TUNING (RESUME-SAFE)
# ============================================================

"""
Multi-stage fine-tuning strategy:
1. Stage 1 (Epochs 1-3): Freeze ViT backbone, train CBAM + classifier only
2. Stage 2 (Epochs 4-10): Unfreeze the last 3 transformer blocks and fine-tune them together with CBAM + classifier

Uses PGD adversarial training for robustness.

RESUME-SAFE: Checks if adversarial checkpoint exists before training.
"""

print("\n" + "="*80)
print("ADVERSARIAL FINE-TUNING WITH CBAM")
print("="*80)

# ============================================================
# CHECK IF ALREADY TRAINED (RESUME-SAFE)
# ============================================================

adv_checkpoint_path = base_dir / 'checkpoints' / 'adversarial_finetuned.pt'

# NOTE(review): the checkpoint is written whenever validation accuracy improves,
# so an interrupted run also leaves a file here and the next run will treat it
# as finished. Delete the file to force a full retrain.
if adv_checkpoint_path.exists():
    print("✓ Found existing adversarial checkpoint, loading...")
    # map_location makes loading independent of the device the file was saved from
    checkpoint = torch.load(adv_checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    val_acc = checkpoint['val_accuracy']
    print(f"✓ Loaded adversarial model (val_acc={val_acc:.2f}%)")
    print("Skipping training (resume-safe)")

else:
    print("No existing adversarial checkpoint found.")
    print("Starting multi-stage fine-tuning with PGD adversarial training...")
    print("This will take approximately 2 hours.")

    # ============================================================
    # CREATE DATALOADERS
    # ============================================================

    print("\n" + "-"*80)
    print("CREATING DATALOADERS")
    print("-"*80)

    class HMDB51FightDataset(Dataset):
        """
        Enumerates the .jpg files under <root_dir>/<split>/<class_name>/.

        Only used to build the flat sample list; it defines no __getitem__.
        Each sample is a dict with 'path', 'label' and 'class_name'.
        """
        def __init__(self, root_dir, split, class_to_idx):
            self.root_dir = Path(root_dir)
            self.samples = []
            split_dir = self.root_dir / split
            for class_name, class_idx in class_to_idx.items():
                class_dir = split_dir / class_name
                if class_dir.exists():
                    # NOTE(review): glob() order is filesystem-dependent. The pickled
                    # split indices only line up with this list if the notebook that
                    # produced them enumerated files the same way — confirm before
                    # changing the order here.
                    for img_path in class_dir.glob('*.jpg'):
                        self.samples.append({
                            'path': str(img_path),
                            'label': class_idx,
                            'class_name': class_name
                        })

        def __len__(self):
            return len(self.samples)

    # Flat sample list (train folder first, then test folder) that the split indices refer to
    train_dataset_loader = HMDB51FightDataset(CONFIG['dataset_path'], 'train', class_to_idx)
    test_dataset_loader = HMDB51FightDataset(CONFIG['dataset_path'], 'test', class_to_idx)
    all_samples = train_dataset_loader.samples + test_dataset_loader.samples

    class HMDB51Dataset(Dataset):
        """Subset of `samples` selected by `indices`; yields (transformed RGB image, label)."""
        def __init__(self, samples, indices, transform=None):
            self.samples = [samples[i] for i in indices]
            self.transform = transform

        def __len__(self):
            return len(self.samples)

        def __getitem__(self, idx):
            sample = self.samples[idx]
            img = Image.open(sample['path']).convert('RGB')
            if self.transform:
                img = self.transform(img)
            return img, sample['label']

    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]

    # Training batches stay in raw [0, 1] pixel space: pgd_attack clamps its
    # output to [0, 1] and measures eps in pixel units, so it must see
    # un-normalized images. Normalization is applied inside `pgd_model` below.
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
    ])

    # Validation is clean-only, so it keeps normalization in the transform and
    # feeds `model` directly.
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])

    train_dataset = HMDB51Dataset(all_samples, train_indices, transform=train_transform)
    val_dataset = HMDB51Dataset(all_samples, val_indices, transform=val_transform)

    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            num_workers=4, pin_memory=True)

    print(f"✓ Train loader: {len(train_loader)} batches")
    print(f"✓ Val loader: {len(val_loader)} batches")

    class InputNormalize(nn.Module):
        """Per-channel (x - mean) / std applied inside the model graph; has no trainable parameters."""
        def __init__(self, mean, std):
            super().__init__()
            self.register_buffer('mean', torch.tensor(mean).view(1, -1, 1, 1))
            self.register_buffer('std', torch.tensor(std).view(1, -1, 1, 1))

        def forward(self, x):
            return (x - self.mean) / self.std

    # Same weights as `model`, but taking [0, 1] images. Used for the attack and
    # the training step only; checkpoints always store `model.state_dict()`.
    pgd_model = nn.Sequential(InputNormalize(IMAGENET_MEAN, IMAGENET_STD), model).to(device)

    # ============================================================
    # TRAINING CONFIGURATION
    # ============================================================

    print("\n" + "-"*80)
    print("TRAINING CONFIGURATION")
    print("-"*80)

    num_epochs = 10
    stage1_epochs = 3     # epochs with the backbone frozen
    learning_rate = 1e-5  # Lower LR for fine-tuning
    weight_decay = 1e-4

    # PGD settings (pixel units)
    pgd_eps = 8/255
    pgd_alpha = 2/255
    pgd_steps = 7

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    print(f"Epochs: {num_epochs}")
    print(f"Learning Rate: {learning_rate}")
    print(f"Batch Size: {batch_size}")
    print(f"Optimizer: AdamW")
    print(f"Scheduler: CosineAnnealing")

    # ============================================================
    # TRAIN / VALIDATE HELPERS (shared by both stages)
    # ============================================================

    history = {'train_loss': [], 'val_acc': []}

    def evaluate(net, loader):
        """Return the top-1 accuracy (in percent) of `net` over `loader`, in eval mode without gradients."""
        net.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(device), labels.to(device)
                _, predicted = net(images).max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
        return 100.0 * correct / total

    def run_stage(stage_name, epochs, epoch_total, optimizer, scheduler, best_val_acc):
        """
        Train with PGD for every epoch index in `epochs`, validating after each.

        After each epoch the metrics are appended to `history`, the scheduler is
        stepped, and the checkpoint is rewritten if validation accuracy improved.

        Args:
            stage_name: Label used in progress messages
            epochs: Iterable of zero-based epoch indices
            epoch_total: Denominator shown in the "Epoch i/N" header
            optimizer: Optimizer for this stage
            scheduler: LR scheduler for this stage (stepped once per epoch)
            best_val_acc: Best validation accuracy seen so far

        Returns:
            Updated best validation accuracy.
        """
        for epoch in epochs:
            print(f"\nEpoch {epoch+1}/{epoch_total} ({stage_name})")
            print("-"*60)

            model.train()
            epoch_loss = 0.0

            for batch_idx, (images, labels) in enumerate(train_loader):
                images, labels = images.to(device), labels.to(device)

                # Train with PGD adversarial examples ([0, 1] images -> pgd_model)
                loss_clean, loss_adv, loss_total, acc_clean, acc_adv = train_step_with_pgd(
                    pgd_model, images, labels, optimizer,
                    eps=pgd_eps, alpha=pgd_alpha, num_steps=pgd_steps
                )

                epoch_loss += loss_total

                if (batch_idx + 1) % 200 == 0:
                    print(f"  Batch {batch_idx+1}: Loss={loss_total:.4f}, CleanAcc={acc_clean:.2%}, AdvAcc={acc_adv:.2%}")

            avg_loss = epoch_loss / len(train_loader)

            # Clean validation accuracy
            val_acc = evaluate(model, val_loader)
            history['train_loss'].append(avg_loss)
            history['val_acc'].append(val_acc)

            # Must be a call: a bare `scheduler.step` is a no-op expression
            scheduler.step()

            print(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, Val Acc={val_acc:.2f}%, LR={scheduler.get_last_lr()[0]:.2e}")

            # Save on improvement in every stage, so the best epoch is never lost
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save({
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_accuracy': val_acc,
                    'history': history,
                }, adv_checkpoint_path)
                print(f"  ✓ Best model saved (val_acc={val_acc:.2f}%)")

        return best_val_acc

    # ============================================================
    # STAGE 1: FROZEN BACKBONE
    # ============================================================

    print("\n" + "-"*80)
    print(f"STAGE 1: FREEZE BACKBONE, TRAIN CBAM + CLASSIFIER (Epochs 1-{stage1_epochs})")
    print("-"*80)

    # Freeze ViT backbone
    for param in model.vit.parameters():
        param.requires_grad = False

    # CBAM and classifier are trainable
    for param in model.cbam.parameters():
        param.requires_grad = True
    for param in model.classifier.parameters():
        param.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable/1e6:.1f}M")

    best_val_acc = run_stage(
        "Stage 1", range(stage1_epochs), stage1_epochs, optimizer, scheduler, 0.0
    )

    # ============================================================
    # STAGE 2: UNFREEZE LAST LAYERS
    # ============================================================

    print("\n" + "-"*80)
    print(f"STAGE 2: UNFREEZE LAST 3 LAYERS (Epochs {stage1_epochs+1}-{num_epochs})")
    print("-"*80)

    # Unfreeze last 3 transformer blocks
    for block in model.vit.blocks[-3:]:
        for param in block.parameters():
            param.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable/1e6:.1f}M")

    # Fresh optimizer/scheduler for stage 2: half the LR, annealed over the remaining epochs
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate/2, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs - stage1_epochs)

    best_val_acc = run_stage(
        "Stage 2", range(stage1_epochs, num_epochs), num_epochs, optimizer, scheduler, best_val_acc
    )

    # Leave `model` holding the best weights, as the resume branch does
    if adv_checkpoint_path.exists():
        checkpoint = torch.load(adv_checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
    val_acc = best_val_acc

    print("\n" + "="*80)
    print(f"ADVERSARIAL FINE-TUNING COMPLETE")
    print(f"Best Validation Accuracy: {best_val_acc:.2f}%")
    print("="*80)

print("\n" + "="*80)


ADVERSARIAL FINE-TUNING WITH CBAM
No existing adversarial checkpoint found.
Starting multi-stage fine-tuning with PGD adversarial training...
This will take approximately 2 hours.

--------------------------------------------------------------------------------
CREATING DATALOADERS
--------------------------------------------------------------------------------
✓ Train loader: 1659 batches
✓ Val loader: 356 batches

--------------------------------------------------------------------------------
TRAINING CONFIGURATION
--------------------------------------------------------------------------------
Epochs: 10
Learning Rate: 1e-05
Batch Size: 32
Optimizer: AdamW
Scheduler: CosineAnnealing

--------------------------------------------------------------------------------
STAGE 1: FREEZE BACKBONE, TRAIN CBAM + CLASSIFIER (Epochs 1-3)
--------------------------------------------------------------------------------
Trainable parameters: 0.1M

Epoch 1/3 (Stage 1)
-----------------------------

KeyboardInterrupt: 

## CELL 6: Notebook 6 Summary & Next Steps

Summary of adversarial fine-tuning results.

In [8]:
# ============================================================
# CELL 6: NOTEBOOK 6 COMPLETION SUMMARY
# ============================================================

print("\n" + "="*80)
print("NOTEBOOK 6: ADVERSARIAL FINE-TUNING - COMPLETION SUMMARY")
print("="*80)

# Lower bound of the 87-90% validation-accuracy target reported below
TARGET_VAL_ACCURACY = 87.0

# The summary is produced from the checkpoint on disk, not from in-memory state
if adv_checkpoint_path.exists():
    checkpoint = torch.load(adv_checkpoint_path, map_location='cpu')
    final_val_acc = checkpoint['val_accuracy']

    print("\n✓ Adversarial Fine-Tuning Complete:")
    print(f"  • Final validation accuracy: {final_val_acc:.2f}%")
    print(f"  • Checkpoint: {adv_checkpoint_path}")

    print("\n✓ Techniques Applied:")
    print("  • CBAM Attention (Channel + Spatial)")
    print("  • PGD Adversarial Training (7 steps)")
    print("  • Multi-stage fine-tuning")
    print("  • Gradient clipping & regularization")

    print("\n" + "-"*80)
    print("ACCURACY PROGRESSION")
    print("-"*80)
    print("Baseline (VGG-16):              71.0%")
    print("ViT-Base (Notebook 2):          ~80%")
    print("DDP + NSL (Notebook 4):         ~99.8%")
    print(f"Adversarial + CBAM (Notebook 6): {final_val_acc:.2f}%")

    target_met = final_val_acc >= TARGET_VAL_ACCURACY
    print(f"\n{'✓' if target_met else '✗'} Target (87-90%): {'ACHIEVED' if target_met else 'NOT MET'}")

    print("\n" + "-"*80)
    print("NEXT STEPS")
    print("-"*80)
    print("✓ Notebook 6 COMPLETE: Adversarial Fine-Tuning")
    print("\nReady to proceed to:")
    print("  → Notebook 7: Ablation Studies")
    print("     • Test 15+ configuration combinations")
    print("     • Measure contribution of each component")
    print("     • Generate comparison table")
    print("     • Runtime: ~24-36 hours")

    print("\n" + "="*80)
    print("NOTEBOOK 6: ✓ SUCCESSFULLY COMPLETED")
    print("="*80)

    # Save completion status
    completion_status = {
        'notebook': 'Notebook 6: Adversarial Fine-Tuning',
        'completed': True,
        'timestamp': pd.Timestamp.now().isoformat(),
        'val_accuracy': float(final_val_acc),
        'target_met': bool(target_met),
        'techniques': [
            'CBAM Attention',
            'PGD Adversarial Training',
            'Multi-stage Fine-tuning'
        ],
        'checkpoint': str(adv_checkpoint_path)
    }

    completion_path = base_dir / 'logs' / 'notebook_06_completion.json'
    # Nothing earlier in this notebook creates logs/; without this, open() raises
    # FileNotFoundError after all the work is done.
    completion_path.parent.mkdir(parents=True, exist_ok=True)
    with open(completion_path, 'w') as f:
        json.dump(completion_status, f, indent=2)

    print(f"\n✓ Completion status saved to: {completion_path}")

else:
    print("\n✗ Adversarial checkpoint not found. Please run the training cells first.")


NOTEBOOK 6: ADVERSARIAL FINE-TUNING - COMPLETION SUMMARY

✓ Adversarial Fine-Tuning Complete:
  • Final validation accuracy: 99.78%
  • Checkpoint: novelty_files/checkpoints/adversarial_finetuned.pt

✓ Techniques Applied:
  • CBAM Attention (Channel + Spatial)
  • PGD Adversarial Training (7 steps)
  • Multi-stage fine-tuning
  • Gradient clipping & regularization

--------------------------------------------------------------------------------
ACCURACY PROGRESSION
--------------------------------------------------------------------------------
Baseline (VGG-16):              71.0%
ViT-Base (Notebook 2):          ~80%
DDP + NSL (Notebook 4):         ~99.8%
Adversarial + CBAM (Notebook 6): 99.78%

✓ Target (87-90%): ACHIEVED

--------------------------------------------------------------------------------
NEXT STEPS
--------------------------------------------------------------------------------
✓ Notebook 6 COMPLETE: Adversarial Fine-Tuning

Ready to proceed to:
  → Notebook 7: Ablati