## CELL 1: Setup


In [None]:
# ============================================================
# CELL 1: SETUP
# ============================================================

import os
from pathlib import Path
from datetime import datetime

# Every generated script and helper file is written beneath this directory;
# it is created up front (including parents) so later cells can write into it.
output_dir = Path('./novelty_implementation/ablation_scripts')
os.makedirs(output_dir, exist_ok=True)

# Startup banner: one positional argument per output line, joined by newlines.
print(
    "="*80,
    "NOTEBOOK 9B: DDP SCRIPT EXPORT",
    "="*80,
    f"✓ Scripts will be exported to: {output_dir}",
    f"✓ Generation timestamp: {datetime.now().isoformat()}",
    "="*80,
    sep="\n",
)

## CELL 2: Generate DDP Script Template Function

This function generates complete DDP training scripts based on component flags.


In [None]:
# ============================================================
# CELL 2: DDP SCRIPT TEMPLATE GENERATOR
# ============================================================

def generate_ddp_script(config_name, config_dict):
    """
    Generate a complete, standalone DDP training script.

    The script source is assembled from string templates: a fixed preamble
    (imports, dataset class, transforms), optional component definitions
    selected by the flags in ``config_dict``, and a ``main()`` whose training
    step is specialised for the enabled components.

    Args:
        config_name: Name of configuration (e.g., '09_ViT_Aug_NSL_DDP').
            Used in the script header, the log banner and the checkpoint
            file name.
        config_dict: Configuration dictionary with component flags.
            Keys read here (all optional): 'use_cbam', 'use_nsl', 'use_pgd',
            'use_mixup', 'use_cutmix' (default False), 'epochs' (default 15),
            'batch_size' (default 64) and 'learning_rate' (default 1e-4).
            The whole dictionary is embedded in the script via ``repr``, so
            its values must have a repr that is a valid Python literal.

    Returns:
        Complete Python script as string

    Notes:
        * When 'use_pgd' is set, the MixUp/CutMix helpers are still emitted
          but the training step uses PGD adversarial training instead.
        * Values are interpolated at generation time; the emitted script has
          no dependency on this notebook.
    """
    use_cbam = config_dict.get('use_cbam', False)
    use_nsl = config_dict.get('use_nsl', False)
    use_pgd = config_dict.get('use_pgd', False)
    use_mixup = config_dict.get('use_mixup', False)
    use_cutmix = config_dict.get('use_cutmix', False)

    epochs = config_dict.get('epochs', 15)
    batch_size = config_dict.get('batch_size', 64)
    lr = config_dict.get('learning_rate', 1e-4)

    # ---- Preamble: header docstring, imports, dataset and transforms ----
    script = f'''#!/usr/bin/env python3
\"""
DDP Training Script: {config_name}
Generated: {datetime.now().isoformat()}

Usage:
    torchrun --nproc_per_node=8 \\\\
             --nnodes=1 \\\\
             --node_rank=0 \\\\
             --master_addr="localhost" \\\\
             --master_port=29500 \\\\
             {config_name.lower().replace(" ", "_")}.py
\"""

import os
import sys
import json
import pickle
import random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

import torchvision
from torchvision import transforms
from PIL import Image
import numpy as np
import timm

RANDOM_SEED = 42

# ============================================================
# COMPONENT IMPLEMENTATIONS
# ============================================================

# Dataset class
class HMDB51Dataset(Dataset):
    def __init__(self, samples, indices, transform=None, return_neighbors=False, neighbor_indices=None):
        self.samples = [samples[i] for i in indices]
        self.transform = transform
        self.return_neighbors = return_neighbors
        self.neighbor_indices = neighbor_indices

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        img_path = sample['path']
        label = sample['label']

        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.return_neighbors and self.neighbor_indices is not None:
            neighbors = self.neighbor_indices[idx]
            return image, label, neighbors

        return image, label

# Transforms
def get_basic_aug_transform():
    return transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

def get_val_transform():
    return transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

# MixUp & CutMix'''

    # ---- Optional component definitions (plain strings: no interpolation) ----
    if use_mixup or use_cutmix:
        script += '''
def mixup_data(x, y, alpha=0.4):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size(0)
    index = torch.randperm(batch_size, device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def rand_bbox(size, lam):
    W, H = size[2], size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w, cut_h = int(W * cut_rat), int(H * cut_rat)
    cx, cy = np.random.randint(W), np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2

def cutmix_data(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size(0)
    index = torch.randperm(batch_size, device=x.device)
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    mixed_x = x.clone()
    mixed_x[:, :, bbx1:bbx2, bby1:bby2] = x[index, :, bbx1:bbx2, bby1:bby2]
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size(-1) * x.size(-2)))
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
'''

    if use_nsl:
        script += '''
# NSL Loss
def virtual_adversarial_loss(model, x, logits, xi=1e-6, eps=2.0, num_iters=1):
    d = torch.randn_like(x, requires_grad=False)
    d = d / (torch.norm(d.view(d.size(0), -1), dim=1, keepdim=True).unsqueeze(-1).unsqueeze(-1) + 1e-8)

    for _ in range(num_iters):
        d = d.clone().detach().requires_grad_(True)
        pred_hat = model(x + xi * d)
        logp = F.log_softmax(pred_hat, dim=1)
        p = F.softmax(logits.detach(), dim=1)
        kl = F.kl_div(logp, p, reduction='batchmean')
        kl.backward()
        d = d.grad.data.clone()
        d = d / (torch.norm(d.view(d.size(0), -1), dim=1, keepdim=True).unsqueeze(-1).unsqueeze(-1) + 1e-8)
        model.zero_grad()

    r_adv = eps * d.detach()
    pred_hat = model(x + r_adv)
    logp_hat = F.log_softmax(pred_hat, dim=1)
    p = F.softmax(logits.detach(), dim=1)
    vat_loss = F.kl_div(logp_hat, p, reduction='batchmean')
    return vat_loss
'''

    if use_cbam:
        script += '''
# CBAM Attention
class ChannelAttention(nn.Module):
    def __init__(self, in_channels, reduction=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.mlp = nn.Sequential(
            nn.Linear(in_channels, in_channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(in_channels // reduction, in_channels, bias=False)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        b, c, _, _ = x.size()
        avg_out = self.mlp(self.avg_pool(x).view(b, c))
        max_out = self.mlp(self.max_pool(x).view(b, c))
        attention = self.sigmoid(avg_out + max_out).view(b, c, 1, 1)
        return x * attention

class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = x.mean(dim=1, keepdim=True)
        max_out = x.max(dim=1, keepdim=True)[0]
        combined = torch.cat([avg_out, max_out], dim=1)
        attention = self.sigmoid(self.conv(combined))
        return x * attention

class CBAM(nn.Module):
    def __init__(self, in_channels, reduction=16, kernel_size=7):
        super().__init__()
        self.channel_attention = ChannelAttention(in_channels, reduction)
        self.spatial_attention = SpatialAttention(kernel_size)

    def forward(self, x):
        x = self.channel_attention(x)
        x = self.spatial_attention(x)
        return x

class ViTWithCBAM(nn.Module):
    def __init__(self, num_classes=8, pretrained=True):
        super().__init__()
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=pretrained)
        hidden_dim = self.vit.head.in_features
        self.vit.head = nn.Identity()
        self.cbam = CBAM(hidden_dim, reduction=16)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        features = self.vit.forward_features(x)
        cls_token = features[:, 0]
        cls_reshaped = cls_token.unsqueeze(-1).unsqueeze(-1)
        attended = self.cbam(cls_reshaped)
        attended = attended.squeeze(-1).squeeze(-1)
        out = self.classifier(attended)
        return out
'''

    if use_pgd:
        # The emitted attack works in normalized space: both transforms end
        # with Normalize(mean, std), so clamping to [0, 1] directly (as the
        # previous template did) would corrupt the inputs.
        script += '''
# PGD Adversarial Attack
# The loaders feed ImageNet-normalized tensors, so the valid pixel range
# [0, 1] and the eps/alpha budgets (given in pixel units) are mapped into
# normalized space before stepping and clamping.
PGD_MEAN = (0.485, 0.456, 0.406)
PGD_STD = (0.229, 0.224, 0.225)

def pgd_attack(model, images, labels, eps=8/255, alpha=2/255, num_steps=7, random_start=True):
    images = images.clone().detach()
    mean = images.new_tensor(PGD_MEAN).view(1, -1, 1, 1)
    std = images.new_tensor(PGD_STD).view(1, -1, 1, 1)
    lower, upper = (0.0 - mean) / std, (1.0 - mean) / std
    eps_n, alpha_n = eps / std, alpha / std
    adv_images = images.clone().detach()

    if random_start:
        adv_images = adv_images + (torch.rand_like(adv_images) * 2.0 - 1.0) * eps_n
        adv_images = torch.max(torch.min(adv_images, upper), lower)

    for _ in range(num_steps):
        adv_images.requires_grad = True
        outputs = model(adv_images)
        loss = F.cross_entropy(outputs, labels)
        grad = torch.autograd.grad(loss, adv_images, retain_graph=False, create_graph=False)[0]
        adv_images = adv_images.detach() + alpha_n * grad.sign()
        delta = torch.max(torch.min(adv_images - images, eps_n), -eps_n)
        adv_images = torch.max(torch.min(images + delta, upper), lower)

    return adv_images.detach()
'''

    # Model constructor expression spliced into main().
    if use_cbam:
        model_ctor = 'ViTWithCBAM(num_classes=8)'
    else:
        model_ctor = 'timm.create_model("vit_base_patch16_224", pretrained=True, num_classes=8)'

    # ---- main(): setup, data, model, start of the training loop ----
    # config_name / config_dict are embedded with !r as ordinary literals;
    # placing a dict repr inside a generated f-string would be parsed as a
    # replacement field and fail at runtime.
    script += f'''
# ============================================================
# MAIN DDP TRAINING
# ============================================================

def main():
    # DDP Setup
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = dist.get_world_size()
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{{local_rank}}")

    is_main = local_rank == 0

    if is_main:
        print("="*80)
        print("ABLATION:", {config_name!r})
        print("="*80)
        print(f"World Size: {{world_size}} GPUs")
        print("Configuration:", {config_dict!r})

    # Set seeds
    torch.manual_seed(RANDOM_SEED + local_rank)
    np.random.seed(RANDOM_SEED + local_rank)
    random.seed(RANDOM_SEED + local_rank)

    # Load data
    BASE_DIR = Path('./novelty_files')
    with open(BASE_DIR / 'splits' / 'train_indices.pkl', 'rb') as f:
        train_indices = pickle.load(f)
    with open(BASE_DIR / 'splits' / 'val_indices.pkl', 'rb') as f:
        val_indices = pickle.load(f)

    # Load samples
    class HMDB51FightDataset(Dataset):
        def __init__(self, root_dir, split, class_to_idx):
            self.root_dir = Path(root_dir)
            self.samples = []
            split_dir = self.root_dir / split
            for class_name, class_idx in class_to_idx.items():
                class_dir = split_dir / class_name
                if class_dir.exists():
                    for img_path in list(class_dir.glob('*.jpg')) + list(class_dir.glob('*.png')):
                        self.samples.append({{'path': str(img_path), 'label': class_idx, 'class_name': class_name}})
        def __len__(self):
            return len(self.samples)

    with open(BASE_DIR / 'splits' / 'class_distribution.json') as f:
        dist_data = json.load(f)
    class_to_idx = dist_data['class_to_idx']

    dataset_path = './fight_dataset/actions (2)/actions'
    train_loader_temp = HMDB51FightDataset(dataset_path, 'train', class_to_idx)
    test_loader_temp = HMDB51FightDataset(dataset_path, 'test', class_to_idx)
    all_samples = train_loader_temp.samples + test_loader_temp.samples

    # Create datasets
    train_dataset = HMDB51Dataset(all_samples, train_indices, transform=get_basic_aug_transform())
    val_dataset = HMDB51Dataset(all_samples, val_indices, transform=get_val_transform())

    # Create distributed samplers
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=local_rank, shuffle=True)
    val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=local_rank, shuffle=False)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size={batch_size}, sampler=train_sampler, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size={batch_size}, sampler=val_sampler, num_workers=4, pin_memory=True)

    if is_main:
        print(f"✓ Train: {{len(train_loader)}} batches/GPU × {{world_size}} GPUs")
        print(f"✓ Val: {{len(val_loader)}} batches/GPU × {{world_size}} GPUs")

    # Create model
    model = {model_ctor}
    model = model.to(device)
    model = DDP(model, device_ids=[local_rank])

    # Optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr={lr}, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max={epochs})

    # Training loop
    best_val_acc = 0.0
    checkpoint_path = BASE_DIR / 'checkpoints' / 'ablation' / {(config_name + ".pt")!r}
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    for epoch in range({epochs}):
        train_sampler.set_epoch(epoch)
        model.train()

        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
'''

    # ---- Training step, specialised for the enabled components ----
    if use_pgd:
        script += '''
            # PGD adversarial training
            model.eval()
            adv_images = pgd_attack(model, images, labels)
            model.train()

            clean_outputs = model(images)
            adv_outputs = model(adv_images)

            clean_loss = criterion(clean_outputs, labels)
            adv_loss = criterion(adv_outputs, labels)
            loss = 0.5 * clean_loss + 0.5 * adv_loss
            outputs = clean_outputs
'''
    elif use_mixup or use_cutmix:
        # Honour both flags: previously cutmix_data was emitted but never
        # called, so 'use_cutmix' had no effect on training.
        if use_mixup and use_cutmix:
            mix_call = '(mixup_data if np.random.random() < 0.5 else cutmix_data)(images, labels)'
        elif use_cutmix:
            mix_call = 'cutmix_data(images, labels)'
        else:
            mix_call = 'mixup_data(images, labels)'
        script += f'''
            # MixUp/CutMix augmentation (applied to roughly half of the batches)
            if np.random.random() < 0.5:
                images, labels_a, labels_b, lam = {mix_call}
                outputs = model(images)
                loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
            else:
                outputs = model(images)
                loss = criterion(outputs, labels)
'''
    else:
        script += '''
            outputs = model(images)
            loss = criterion(outputs, labels)
'''

    if use_nsl:
        script += '''
            # Add NSL loss
            vat_loss = virtual_adversarial_loss(model, images, outputs)
            loss = loss + 1.0 * vat_loss
'''

    # ---- Optimiser step, validation, metric reduction, checkpointing ----
    # This chunk MUST be an f-string: it uses {{ }} escapes and interpolates
    # the epoch count and the configuration.
    script += f'''
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()

        # Gather metrics
        train_loss_tensor = torch.tensor([train_loss / len(train_loader)], device=device)
        val_correct_tensor = torch.tensor([val_correct], device=device)
        val_total_tensor = torch.tensor([val_total], device=device)

        dist.all_reduce(train_loss_tensor, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_correct_tensor, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_total_tensor, op=dist.ReduceOp.SUM)

        avg_train_loss = (train_loss_tensor / world_size).item()
        val_acc = 100.0 * val_correct_tensor.item() / val_total_tensor.item()
        train_acc = 100.0 * train_correct / train_total

        scheduler.step()

        if is_main:
            print(f"Epoch {{epoch+1}}/{epochs}: Train Loss={{avg_train_loss:.4f}}, Train Acc={{train_acc:.2f}}%, Val Acc={{val_acc:.2f}}%")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                checkpoint = {{
                    'epoch': epoch + 1,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_accuracy': val_acc,
                    'config': {config_dict!r}
                }}
                torch.save(checkpoint, checkpoint_path)
                print(f"✓ Best model saved (val_acc={{val_acc:.2f}}%)")

    if is_main:
        print(f"\\nTraining complete! Best val acc: {{best_val_acc:.2f}}%")

    dist.destroy_process_group()

if __name__ == '__main__':
    main()
'''

    return script

# Confirmation for the notebook user that the cell defining generate_ddp_script has run.
print("✓ DDP script generator function defined")

## CELL 3: Export Config 09 - ViT + Aug + NSL (DDP)


In [None]:
# ============================================================
# CELL 3: EXPORT CONFIG 09 (ViT + Aug + NSL + DDP)
# ============================================================

# Config 09: ViT backbone with augmentation flags and NSL enabled; CBAM and
# PGD disabled. Only the use_* flags, 'epochs', 'batch_size' and
# 'learning_rate' are read by generate_ddp_script; the remaining keys are
# carried along and stored in the checkpoint's 'config' entry.
config_09 = {
    'name': '09_ViT_Aug_NSL_DDP',
    'model_type': 'vit',
    'epochs': 3,
    'batch_size': 64,
    'learning_rate': 1e-4,
    'use_basicaug': True,
    'use_mixup': True,
    'use_cutmix': True,
    'use_nsl': True,
    'use_cbam': False,
    'use_pgd': False
}

script_09 = generate_ddp_script(
    config_09['name'],
    config_09
)

# The generated source contains non-ASCII characters (e.g. "✓"), so the file
# is written explicitly as UTF-8 rather than with the platform default.
script_path_09 = output_dir / 'ablation_ddp_09.py'
with open(script_path_09, 'w', encoding='utf-8') as f:
    f.write(script_09)

print("="*80)
print("CONFIG 09: ViT + Aug + NSL (DDP)")
print("="*80)
print(f"✓ Exported: {script_path_09}")
print(f"✓ Size: {len(script_09)} characters")
print("\nRun with:")
print(f"  cd {output_dir.parent}")
print("  torchrun --nproc_per_node=8 ablation_scripts/ablation_ddp_09.py")
print("\nExpected runtime: ~6-8 hours on 8× H200 GPUs")
print("Expected accuracy: ~84-86%")
print("="*80)

## CELL 4: Export Config 14 - Full Pipeline (Single GPU)


In [None]:
# ============================================================
# CELL 4: EXPORT CONFIG 14 (Full Pipeline - Single GPU Baseline)
# ============================================================

# Config 14: every component enabled, exported as a single-GPU script that
# serves as the baseline for the DDP run of the same pipeline.
config_14 = {
    'name': '14_ViT_CBAM_Aug_NSL_PGD',
    'model_type': 'vit',
    'epochs': 3,
    'batch_size': 32,  # Smaller batch for single GPU + all components
    'learning_rate': 1e-4,
    'use_basicaug': True,
    'use_mixup': True,
    'use_cutmix': True,
    'use_nsl': True,
    'use_cbam': True,
    'use_pgd': True
}

# Generate the DDP version first; it is converted to single GPU below.
script_14 = generate_ddp_script(config_14['name'], config_14)

# Strip the DDP-specific code with textual substitutions. Each pattern must
# match the generated source exactly; note the real newline ("\n") between
# the two import lines - an escaped "\\n" would never match.
script_14_single = script_14.replace(
    'import torch.distributed as dist\nfrom torch.nn.parallel import DistributedDataParallel as DDP',
    '# Single GPU mode - no DDP needed'
).replace(
    '    dist.init_process_group(backend="nccl")',
    '    # Single GPU - skip DDP init'
).replace(
    '    local_rank = int(os.environ["LOCAL_RANK"])',
    '    local_rank = 0'
).replace(
    '    world_size = dist.get_world_size()',
    '    world_size = 1'
).replace(
    '    model = DDP(model, device_ids=[local_rank])',
    '    # Single GPU - no DDP wrapping'
).replace(
    # Comments out the whole all_reduce line, including its dist.ReduceOp use.
    '    dist.all_reduce',
    '    # dist.all_reduce'
).replace(
    '    dist.destroy_process_group()',
    '    # No DDP to destroy'
).replace(
    # Without the DDP wrapper there is no .module indirection.
    'model.module.state_dict()',
    'model.state_dict()'
)

# The generated source contains non-ASCII characters, so write it as UTF-8.
script_path_14 = output_dir / 'ablation_single_14.py'
with open(script_path_14, 'w', encoding='utf-8') as f:
    f.write(script_14_single)

print("="*80)
print("CONFIG 14: Full Pipeline (Single GPU Baseline)")
print("="*80)
print(f"✓ Exported: {script_path_14}")
print(f"✓ Size: {len(script_14_single)} characters")
print("\nRun with:")
print(f"  cd {output_dir.parent}")
print("  python ablation_scripts/ablation_single_14.py")
print("\nExpected runtime: ~10-12 hours on single GPU")
print("Expected accuracy: ~86-88%")
print("Note: This provides single-GPU baseline for comparison with DDP Config 15")
print("="*80)

## CELL 5: Export Config 15 - Full Pipeline (DDP)


In [None]:
# ============================================================
# CELL 5: EXPORT CONFIG 15 (Full Pipeline + DDP)
# ============================================================

# Config 15: every component enabled, exported as a DDP script.
config_15 = {
    'name': '15_FULL_PIPELINE',
    'model_type': 'vit',
    'epochs': 3,
    'batch_size': 64,
    'learning_rate': 1e-4,
    'use_basicaug': True,
    'use_mixup': True,
    'use_cutmix': True,
    'use_nsl': True,
    'use_cbam': True,
    'use_pgd': True  # Note: PGD with DDP - most comprehensive
}

script_15 = generate_ddp_script(
    config_15['name'],
    config_15
)

# The generated source contains non-ASCII characters, so write it as UTF-8.
script_path_15 = output_dir / 'ablation_ddp_15.py'
with open(script_path_15, 'w', encoding='utf-8') as f:
    f.write(script_15)

print("="*80)
print("CONFIG 15: Full Pipeline (DDP)")
print("="*80)
print(f"✓ Exported: {script_path_15}")
print(f"✓ Size: {len(script_15)} characters")
print("\nRun with:")
print(f"  cd {output_dir.parent}")
print("  torchrun --nproc_per_node=8 --master_port=29501 ablation_scripts/ablation_ddp_15.py")
print("\nExpected runtime: ~8-10 hours on 8× H200 GPUs")
print("Expected accuracy: ~87-90% (TARGET GOAL)")
print("Note: Uses different port (29501) to avoid conflicts with Config 09")
print("="*80)

## CELL 6: Create Execution README


In [None]:
# ============================================================
# CELL 6: CREATE EXECUTION README
# ============================================================

readme_content = '''# DDP Ablation Execution Guide

## Scripts Generated

- `ablation_ddp_09.py` - Config 09: ViT + Aug + NSL + DDP
- `ablation_single_14.py` - Config 14: Full Pipeline (Single GPU)
- `ablation_ddp_15.py` - Config 15: Full Pipeline + DDP

## Prerequisites

```bash
# Ensure 8 GPUs are available
nvidia-smi

# Check DDP environment
python -c \"import torch; print(f'GPUs: {torch.cuda.device_count()}, DDP: {torch.distributed.is_available()}')\"

# Verify NCCL backend
python -c \"import torch.distributed as dist; print(f'NCCL: {dist.is_nccl_available()}')\"
```

## Execution

### Config 09: ViT + Aug + NSL (DDP - 8 GPUs)

**Expected:** ~84-86% validation accuracy  
**Runtime:** ~6-8 hours on 8× H200 GPUs

```bash
cd novelty_implementation

torchrun --nproc_per_node=8 \\
         --nnodes=1 \\
         --node_rank=0 \\
         --master_addr=\"localhost\" \\
         --master_port=29500 \\
         ablation_scripts/ablation_ddp_09.py
```

### Config 14: Full Pipeline (Single GPU Baseline)

**Expected:** ~86-88% validation accuracy  
**Runtime:** ~10-12 hours on single GPU  
**Purpose:** Single-GPU baseline for comparison with DDP Config 15

```bash
cd novelty_implementation

python ablation_scripts/ablation_single_14.py
```

### Config 15: Full Pipeline (DDP - 8 GPUs) - TARGET GOAL

**Expected:** ~87-90% validation accuracy (PROJECT GOAL)  
**Runtime:** ~8-10 hours on 8× H200 GPUs

```bash
cd novelty_implementation

torchrun --nproc_per_node=8 \\
         --nnodes=1 \\
         --node_rank=0 \\
         --master_addr=\"localhost\" \\
         --master_port=29501 \\
         ablation_scripts/ablation_ddp_15.py
```

**Note:** Uses port 29501 to avoid conflicts with Config 09

## Monitoring

### GPU Usage
```bash
# Watch GPU usage (update every 1 second)
watch -n 1 nvidia-smi

# More detailed monitoring
nvidia-smi dmon -s pucvmet
```

### Training Progress
```bash
# If running in background with nohup
tail -f nohup.out

# Or use screen/tmux for persistent sessions
screen -S ablation_09
# Then run torchrun command
# Detach: Ctrl+A, D
# Reattach: screen -r ablation_09
```

## Checkpoints

All checkpoints are saved to:
```
novelty_files/checkpoints/ablation/
├── 09_ViT_Aug_NSL_DDP.pt
├── 14_ViT_CBAM_Aug_NSL_PGD.pt
└── 15_FULL_PIPELINE.pt
```

Each checkpoint includes:
- `model_state_dict`: Model weights
- `optimizer_state_dict`: Optimizer state
- `val_accuracy`: Best validation accuracy
- `epoch`: Training epoch
- `config`: Configuration dictionary

## Troubleshooting

### Issue: NCCL timeout or communication error

**Solution:**
```bash
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=eth0  # Adjust to your network interface
```

### Issue: Out of memory (OOM)

**Solution:** Reduce batch size in the script:
- Edit `batch_size=64` → `batch_size=32` in the script
- Regenerate or manually edit the .py file

### Issue: Port already in use

**Solution:** Change the port number:
```bash
--master_port=29502  # Use a different port
```

### Issue: Different number of GPUs

**Solution:** Adjust `--nproc_per_node`:
```bash
torchrun --nproc_per_node=4 ...  # For 4 GPUs
```

## Expected Timeline

| Config | Type | Runtime | Expected Acc |
|--------|------|---------|--------------|
| 09 | DDP (8 GPU) | 6-8 hours | ~84-86% |
| 14 | Single GPU | 10-12 hours | ~86-88% |
| 15 | DDP (8 GPU) | 8-10 hours | ~87-90% ✓ |

**Total estimated time:** ~24-30 hours (if run sequentially)  
**Recommended:** Run in parallel on different nodes/terminals if hardware allows

## Validation

After training completes, verify results:

```bash
# Check checkpoint exists
ls -lh novelty_files/checkpoints/ablation/

# Load and inspect checkpoint
python -c \"
import torch
ckpt = torch.load('novelty_files/checkpoints/ablation/15_FULL_PIPELINE.pt')
print(f'Val Accuracy: {ckpt[\\"val_accuracy\\"]:.2f}%')
print(f'Epoch: {ckpt[\\"epoch\\"]}')
\"
```

## Post-Training Analysis

After all configs complete, run Notebook 8 for comprehensive evaluation and visualization.

---

Generated: {datetime.now().isoformat()}
'''

readme_path = output_dir / 'README.md'

# readme_content is a plain (non-f) string because the README itself contains
# literal braces, so its "Generated:" placeholder is never interpolated.
# Fill it in here so the file carries a real timestamp.
stamped_readme = readme_content.replace(
    '{datetime.now().isoformat()}',
    datetime.now().isoformat()
)

# The README contains non-ASCII characters (box-drawing, "×", "→"), so it is
# written explicitly as UTF-8.
with open(readme_path, 'w', encoding='utf-8') as f:
    f.write(stamped_readme)

print("="*80)
print("EXECUTION README CREATED")
print("="*80)
print(f"✓ Created: {readme_path}")
print("\nKey sections:")
print("  • Prerequisites & environment setup")
print("  • Execution commands for each config")
print("  • Monitoring and troubleshooting")
print("  • Expected timelines and accuracies")
print("="*80)

## CELL 7: Create DDP Validation Script


In [None]:
# ============================================================
# CELL 7: CREATE DDP VALIDATION SCRIPT
# ============================================================

# Source of a standalone DDP smoke test, written to validate_ddp.py below.
# This is a plain (non-f) string: the braces inside are f-string fields of the
# GENERATED script, and "\\n" yields a literal backslash-n in that script.
validation_script = '''#!/usr/bin/env python3
"""
DDP Validation Script

Tests that DDP is properly configured before running long training jobs.

Usage:
    torchrun --nproc_per_node=8 validate_ddp.py
"""

import os
from datetime import datetime

import torch
import torch.distributed as dist

def main():
    # Initialize DDP
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = dist.get_world_size()

    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")

    if local_rank == 0:
        print("="*80)
        print("DDP VALIDATION TEST")
        print("="*80)
        print(f"Timestamp: {datetime.now().isoformat()}")
        print(f"World Size: {world_size} GPUs")
        print(f"Backend: NCCL")

    # Test 1: Basic communication
    if local_rank == 0:
        print("\\nTest 1: Basic Communication")

    test_tensor = torch.ones(1, device=device) * (local_rank + 1)
    dist.all_reduce(test_tensor, op=dist.ReduceOp.SUM)

    expected_sum = sum(range(1, world_size + 1))
    if test_tensor.item() != expected_sum:
        print(f"  ✗ FAILED on rank {local_rank}: got {test_tensor.item()}, expected {expected_sum}")
        # Abort with a non-zero status so a broken setup is never followed by
        # the "all tests passed" summary.
        dist.destroy_process_group()
        raise SystemExit(1)
    if local_rank == 0:
        print(f"  ✓ All-reduce test passed (sum={test_tensor.item()})")

    # Test 2: GPU info
    if local_rank == 0:
        print(f"\\nTest 2: GPU Information")
        for i in range(world_size):
            props = torch.cuda.get_device_properties(i)
            print(f"  GPU {i}: {props.name} - {props.total_memory / 1e9:.1f}GB")

    # Test 3: Barrier synchronization
    if local_rank == 0:
        print(f"\\nTest 3: Barrier Synchronization")

    dist.barrier()

    if local_rank == 0:
        print("  ✓ All ranks synchronized successfully")

    # Test 4: Simple model forward pass
    if local_rank == 0:
        print(f"\\nTest 4: Simple Model Forward Pass")

    model = torch.nn.Linear(10, 10).to(device)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    dummy_input = torch.randn(4, 10, device=device)
    output = model(dummy_input)

    if local_rank == 0:
        print(f"  ✓ DDP model forward pass successful: {output.shape}")

    # Summary
    dist.barrier()
    if local_rank == 0:
        print("\\n" + "="*80)
        print("✓ ALL DDP VALIDATION TESTS PASSED")
        print("="*80)
        print("Ready to run DDP training scripts!")
        print("="*80)

    dist.destroy_process_group()

if __name__ == '__main__':
    main()
'''

# The validation script contains non-ASCII characters ("✓", "✗"), so it is
# written explicitly as UTF-8.
validation_path = output_dir / 'validate_ddp.py'
with open(validation_path, 'w', encoding='utf-8') as f:
    f.write(validation_script)

# Make executable
os.chmod(validation_path, 0o755)

print("="*80)
print("DDP VALIDATION SCRIPT CREATED")
print("="*80)
print(f"✓ Created: {validation_path}")
print("✓ Made executable")
print("\nRun before training to verify DDP setup:")
print(f"  torchrun --nproc_per_node=8 {validation_path}")
print("="*80)

## CELL 8: Notebook 9B Completion Summary


In [None]:
# ============================================================
# CELL 8: COMPLETION SUMMARY
# ============================================================

import json
import pandas as pd
from pathlib import Path

print("\n" + "="*80)
print("NOTEBOOK 9B: DDP EXPORT - COMPLETION SUMMARY")
print("="*80)

# List generated files (scripts and markdown) found in the output directory
generated_files = list(output_dir.glob('*.py')) + list(output_dir.glob('*.md'))

print(f"\n✓ Generated {len(generated_files)} files in {output_dir}:")
for file in sorted(generated_files):
    size_kb = file.stat().st_size / 1024
    print(f"  • {file.name:30s} ({size_kb:.1f} KB)")

print("\nConfiguration Scripts:")
print("  • ablation_ddp_09.py      - Config 09: ViT + Aug + NSL (DDP)")
print("  • ablation_single_14.py   - Config 14: Full Pipeline (Single GPU)")
print("  • ablation_ddp_15.py      - Config 15: Full Pipeline (DDP)")

print("\nUtility Files:")
print("  • README.md               - Execution guide")
print("  • validate_ddp.py         - DDP validation test")

print("\n" + "-"*80)
print("NEXT STEPS")
print("-"*80)
print("✓ Notebook 9B COMPLETE: DDP Scripts Generated")
print("\n1. Validate DDP setup:")
print(f"   cd {output_dir.parent}")
print("   torchrun --nproc_per_node=8 ablation_scripts/validate_ddp.py")

print("\n2. Run configurations (in order of complexity):")
print("\n   Config 09 (simpler, faster):")
print("   torchrun --nproc_per_node=8 ablation_scripts/ablation_ddp_09.py")

print("\n   Config 14 (single GPU baseline):")
print("   python ablation_scripts/ablation_single_14.py")

print("\n   Config 15 (final goal - all components):")
print("   torchrun --nproc_per_node=8 --master_port=29501 ablation_scripts/ablation_ddp_15.py")

print("\n3. Monitor execution:")
print("   watch -n 1 nvidia-smi")

print("\n4. After completion, verify results:")
print("   ls -lh novelty_files/checkpoints/ablation/")

print("\n" + "-"*80)
print("EXPECTED OUTCOMES")
print("-"*80)
print("Config 09:  ~84-86% (ViT + Aug + NSL, DDP)")
print("Config 14:  ~86-88% (Full pipeline, single GPU)")
print("Config 15:  ~87-90% (Full pipeline, DDP) ← TARGET GOAL")

print("\n" + "="*80)

# Save completion status as JSON; file paths are stored relative to the
# parent of the output directory.
completion_status = {
    'notebook': 'Notebook 9B: DDP Export',
    'completed': True,
    'timestamp': datetime.now().isoformat(),
    'output_directory': str(output_dir),
    'generated_files': [str(f.relative_to(output_dir.parent)) for f in generated_files],
    'configurations': {
        '09': 'ViT + Aug + NSL + DDP',
        '14': 'Full Pipeline (Single GPU)',
        '15': 'Full Pipeline + DDP'
    }
}

completion_path = Path('./novelty_files/logs/notebook_09b_completion.json')
completion_path.parent.mkdir(parents=True, exist_ok=True)
with open(completion_path, 'w') as f:
    json.dump(completion_status, f, indent=2)

print(f"✓ Completion status saved to: {completion_path}")
print("="*80)