### VGG-19 Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import FakeData # Using FakeData for demonstration
from torch.utils.data import DataLoader
import time
import os
import copy
import numpy as np

# --- 1. Define VGG-19 Architecture (Configuration E) ---

# Layer layout of VGG configuration "E" (VGG-19), one row per stage.
# An integer is the output channel count of a convolution layer; 'M' marks a
# max-pooling layer. 16 conv layers here + 3 FC layers = 19 weight layers.
vgg19_config = [
    64, 64, 'M',
    128, 128, 'M',
    256, 256, 256, 256, 'M',
    512, 512, 512, 512, 'M',
    512, 512, 512, 512, 'M',
]

class VGG(nn.Module):
    """VGG-style classifier: conv feature extractor + three-layer FC head.

    The feature maps are adaptively average-pooled to 7x7 before being
    flattened, so the head always receives ``512 * 7 * 7`` values. For a
    224x224 input passed through five 2x2 max-pools the maps are already
    7x7 (224 / 2**5), so the pooling only matters for other input sizes.
    """

    def __init__(self, features, num_classes=1000, init_weights=True):
        """
        Args:
            features (nn.Module): Convolutional feature extractor. The head
                is sized for 512 output channels.
            num_classes (int): Size of the final logit vector. Default is
                1000 for ImageNet.
            init_weights (bool): If True, re-initialise all Conv2d,
                BatchNorm2d and Linear sub-modules via ``_initialize_weights``.
        """
        super().__init__()
        self.features = features
        # Pin the spatial size to 7x7 regardless of the input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(7, 7))

        flat_dim = 512 * 7 * 7
        hidden_dim = 4096
        head = [
            nn.Linear(flat_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),  # paper applies dropout to the first two FC layers
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        pooled = self.avgpool(self.features(x))
        flat = torch.flatten(pooled, start_dim=1)  # keep the batch dimension
        return self.classifier(flat)

    def _initialize_weights(self):
        """Initialise every Conv2d / BatchNorm2d / Linear sub-module in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                # He (Kaiming) normal init, suited to ReLU activations.
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.BatchNorm2d):
                # Only relevant when the feature extractor was built with BN.
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.Linear):
                # Small zero-mean Gaussian weights, zero bias.
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

def make_layers(cfg, batch_norm=False):
    """Assemble the convolutional feature extractor described by ``cfg``.

    Args:
        cfg (list): Sequence whose entries are either an int (output channels
            of a 3x3, padding-1 convolution followed by ReLU) or the string
            'M' (a 2x2, stride-2 max-pool).
        batch_norm (bool): If True, insert BatchNorm2d between each
            convolution and its ReLU.

    Returns:
        nn.Sequential: The layers in the order given by ``cfg``.
    """
    modules = []
    prev_channels = 3  # RGB input
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        modules.append(nn.Conv2d(prev_channels, entry, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(entry))
        modules.append(nn.ReLU(inplace=True))
        # The next convolution consumes what this one produced.
        prev_channels = entry
    return nn.Sequential(*modules)

def vgg19(num_classes=1000, init_weights=True):
    """Build a VGG-19 (configuration E) model.

    Args:
        num_classes (int): Number of output classes of the classifier head.
        init_weights (bool): Forwarded to ``VGG``; if True the weights are
            explicitly initialised.

    Returns:
        VGG: The assembled model.
    """
    # The original VGG paper did not use Batch Normalization, so the feature
    # extractor is built without it to stay faithful to the paper.
    return VGG(
        make_layers(vgg19_config, batch_norm=False),
        num_classes=num_classes,
        init_weights=init_weights,
    )


In [None]:
# --- 2. Data Loading and Augmentation ---

# Input resolution and the standard ImageNet normalization constants.
IMAGE_SIZE = 224
# Paper only mentions mean subtraction, but std normalization is standard practice
IMG_MEAN = [0.485, 0.456, 0.406]
IMG_STD = [0.229, 0.224, 0.225]
NUM_CLASSES = 1000  # For ImageNet (adjust if using a different dataset)

# Training pipeline, following the paper's description + standard practices:
# - RandomResizedCrop implies scale jittering (S sampled implicitly) + random crop
# - RandomHorizontalFlip
# - ColorJitter (approximation of the paper's PCA color shift)
# - ToTensor
# - Normalize
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.08, 1.0)),  # Includes scale jittering
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  # Simpler color augmentation
    transforms.ToTensor(),
    transforms.Normalize(mean=IMG_MEAN, std=IMG_STD),
])

# Deterministic pipeline for validation/testing: resize to 256, then take the
# central 224 crop (standard practice; the paper's test scale Q is not
# reproduced in this simple evaluation).
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMG_MEAN, std=IMG_STD),
])

# --- Using FakeData ---
# Replace with actual dataset (e.g., ImageFolder) for real training
# Example using ImageFolder:
# data_dir = '/path/to/imagenet'
# train_dataset = torchvision.datasets.ImageFolder(os.path.join(data_dir, 'train'), train_transform)
# val_dataset = torchvision.datasets.ImageFolder(os.path.join(data_dir, 'val'), test_transform)
# test_dataset = torchvision.datasets.ImageFolder(os.path.join(data_dir, 'val'), test_transform) # Often use val set for testing if test labels unavailable

print("Using FakeData for demonstration purposes...")
# FakeData derives sample i from the seed `i + random_offset`. With the default
# offset of 0 for every split, val and test would be the very same dataset and
# both would duplicate the first training samples, so each split gets its own
# non-overlapping seed range.
TRAIN_SIZE = 50000
EVAL_SIZE = 10000
FAKE_IMAGE_SHAPE = (3, IMAGE_SIZE, IMAGE_SIZE)

train_dataset = FakeData(size=TRAIN_SIZE, image_size=FAKE_IMAGE_SHAPE, num_classes=NUM_CLASSES,
                         transform=train_transform, random_offset=0)
val_dataset = FakeData(size=EVAL_SIZE, image_size=FAKE_IMAGE_SHAPE, num_classes=NUM_CLASSES,
                       transform=test_transform, random_offset=TRAIN_SIZE)
test_dataset = FakeData(size=EVAL_SIZE, image_size=FAKE_IMAGE_SHAPE, num_classes=NUM_CLASSES,
                        transform=test_transform, random_offset=TRAIN_SIZE + EVAL_SIZE)

# --- DataLoaders ---
BATCH_SIZE = 64  # Paper uses 256, adjust based on GPU memory
NUM_WORKERS = 4  # Adjust based on system capabilities
# Pinned host memory only speeds up host-to-GPU copies; requesting it without a
# CUDA device merely triggers a warning, so enable it only when CUDA is present.
PIN_MEMORY = torch.cuda.is_available()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

# Lookup tables keyed by phase name.
dataloaders = {'train': train_loader, 'val': val_loader}
dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}

# Check device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 3. Training Loop ---

def _run_phase(model, criterion, optimizer, phase):
    """Run one full pass over ``dataloaders[phase]`` and return its metrics.

    Reads the module-level ``dataloaders`` mapping and ``device``. Parameters
    are updated only when ``phase == 'train'``.

    Returns:
        tuple[float, float, float]: Mean loss, top-1 accuracy and top-5
        accuracy, each averaged over the samples actually seen.

    Raises:
        ValueError: If the loader yields no samples.
    """
    is_train = phase == 'train'
    model.train(is_train)  # train(False) is equivalent to eval()

    loader = dataloaders[phase]
    num_batches = len(loader)
    loss_sum = 0.0
    top1_hits = 0
    top5_hits = 0
    num_samples = 0

    for i, (inputs, labels) in enumerate(loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        if is_train:
            optimizer.zero_grad()

        # Build the autograd graph only while training.
        with torch.set_grad_enabled(is_train):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if is_train:
                loss.backward()
                optimizer.step()

        batch_loss = loss.item()
        batch_size = inputs.size(0)
        logits = outputs.detach()
        _, preds = torch.max(logits, 1)
        # topk requires k <= number of classes, so clamp for small problems.
        k = min(5, logits.size(1))
        _, topk_preds = torch.topk(logits, k, dim=1)

        # criterion returns the batch mean, so weight it by the batch size.
        loss_sum += batch_loss * batch_size
        top1_hits += int((preds == labels).sum().item())
        # topk indices are distinct, so each row contributes at most one hit.
        top5_hits += int((topk_preds == labels.view(-1, 1)).sum().item())
        num_samples += batch_size

        # Print progress
        if (i + 1) % 100 == 0 or (i + 1) == num_batches:
            print(f'\r{phase} Batch {i+1}/{num_batches} Loss: {batch_loss:.4f}', end='')

    if num_samples == 0:
        raise ValueError(f"dataloader for phase '{phase}' produced no samples")

    return loss_sum / num_samples, top1_hits / num_samples, top5_hits / num_samples


def train_model(model, criterion, optimizer, scheduler, num_epochs=74,
                checkpoint_path='vgg19_best_checkpoint.pth'):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs a 'train' and a 'val' pass over the module-level
    ``dataloaders`` on the module-level ``device``. After the validation pass
    the scheduler is stepped and, if top-1 validation accuracy improved, the
    weights are snapshotted and a checkpoint is written.

    Args:
        model (nn.Module): Model to optimise; expected to live on ``device``.
        criterion: Loss callable ``criterion(outputs, labels)`` returning the
            batch-mean loss.
        optimizer (torch.optim.Optimizer): Optimiser over ``model`` parameters.
        scheduler: LR scheduler or None. A ``ReduceLROnPlateau`` is stepped
            with the validation accuracy; any other scheduler is stepped
            without arguments once per epoch.
        num_epochs (int): Number of epochs to run.
        checkpoint_path (str | None): Where to save the best checkpoint; pass
            None to skip writing a file.

    Returns:
        tuple: ``(model, history)`` where ``history`` maps 'train_loss',
        'train_acc', 'val_loss' and 'val_acc' to per-epoch lists of floats.

    Raises:
        ValueError: If a phase's dataloader yields no samples.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    print(f"Starting training for {num_epochs} epochs...")
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            epoch_loss, epoch_acc, epoch_top5_acc = _run_phase(model, criterion, optimizer, phase)

            print(f'\n{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} Top-5 Acc: {epoch_top5_acc:.4f}')

            history[f'{phase}_loss'].append(epoch_loss)
            history[f'{phase}_acc'].append(epoch_acc)

            if phase != 'val':
                continue

            # ReduceLROnPlateau needs the monitored metric; other schedulers
            # advance unconditionally once per epoch.
            if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step(epoch_acc)
            elif scheduler is not None:
                scheduler.step()

            current_lr = optimizer.param_groups[0]['lr']
            print(f"Current Learning Rate: {current_lr:.6f}")

            if epoch_acc > best_acc:
                best_acc = epoch_acc
                # Deep copy: state_dict() returns references to the live tensors.
                best_model_wts = copy.deepcopy(model.state_dict())
                print(f"*** Best val Acc: {best_acc:.4f} achieved, saving model... ***")
                if checkpoint_path:
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': best_model_wts,
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
                        'best_val_acc': best_acc,  # plain float, not a device tensor
                        'history': history,
                    }, checkpoint_path)

    time_elapsed = time.time() - since
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model, history

# --- Setup Model, Loss, Optimizer, Scheduler ---

# Create VGG-19 model instance and move it to the selected device
model = vgg19(num_classes=NUM_CLASSES)
model = model.to(device)

# If multiple GPUs are available, wrap the model with DataParallel
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)


# Loss Function
criterion = nn.CrossEntropyLoss()

# Optimizer (as described in the paper)
# Learning rate = 0.01, momentum = 0.9, weight decay = 5e-4
INITIAL_LR = 0.01
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4

optimizer = optim.SGD(model.parameters(), lr=INITIAL_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# Learning Rate Scheduler (as described in the paper)
# Decrease LR by a factor of 10 when validation accuracy stops improving.
# ReduceLROnPlateau is suitable for this.
# 'mode': 'max' because the monitored metric (accuracy) should increase.
# 'patience': number of epochs with no improvement after which LR is reduced.
# 'factor': factor by which the learning rate will be reduced (0.1 for 10x).
# The `verbose` flag is intentionally not passed: it is deprecated in recent
# PyTorch releases, and train_model already prints the learning rate each epoch.
lr_scheduler_plateau = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)

# Alternative: StepLR if you want to decrease at fixed epochs
# The paper mentions decreasing LR 3 times over 74 epochs.
# e.g., steps could be around epoch 30, 50, 65.
# lr_scheduler_step = lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)


# Select the scheduler to use
# Using ReduceLROnPlateau as it directly matches the paper's description
scheduler = lr_scheduler_plateau
# scheduler = lr_scheduler_step # Uncomment to use StepLR instead

# --- Start Training ---
# Warning: This will take a very long time on FakeData or real data without GPUs.
# Setting num_epochs to a small number for demonstration.
NUM_EPOCHS_DEMO = 5  # Set to 74 for full training attempt
print("Starting dummy training run for demonstration...")
trained_model, training_history = train_model(model, criterion, optimizer, scheduler, num_epochs=NUM_EPOCHS_DEMO)
print("Dummy training finished.")

# You can plot training history (loss/accuracy curves) using matplotlib here
# e.g., plt.plot(training_history['train_acc'], label='train_acc') ...

# --- 4. Testing Procedure ---

def test_model(model, test_loader):
    """Evaluates the model on the test set."""
    model.eval()  # Set model to evaluate mode
    running_corrects = 0
    running_top5_corrects = 0
    total_samples = 0

    print("\nStarting testing...")
    since = time.time()

    # Iterate over data.
    with torch.no_grad(): # No gradients needed for testing
        for i, (inputs, labels) in enumerate(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            # Statistics
            running_corrects += torch.sum(preds == labels.data)
            total_samples += labels.size(0)

            # Top-5 Accuracy
            _, top5_preds = torch.topk(outputs, 5, dim=1)
            top5_correct = torch.sum(top5_preds == labels.data.view(-1, 1).expand_as(top5_preds))
            running_top5_corrects += top5_correct

            if (i + 1) % 50 == 0 :
                print(f'\rTest Batch {i+1}/{len(test_loader)}', end='')


    test_acc = running_corrects.double() / total_samples
    test_top5_acc = running_top5_corrects.double() / total_samples

    time_elapsed = time.time() - since
    print(f'\nTesting complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Test Accuracy (Top-1): {test_acc:.4f}')
    print(f'Test Accuracy (Top-5): {test_top5_acc:.4f}')
    return test_acc.item(), test_top5_acc.item()

# --- Run Testing ---
# train_model reloads the best validation weights before returning, so
# `trained_model` can be evaluated directly. If the session was stopped and
# restarted, restore the weights from the checkpoint first:
# checkpoint = torch.load('vgg19_best_checkpoint.pth')
# model.load_state_dict(checkpoint['model_state_dict'])

print("\nEvaluating the best model on the test set...")
# Returns the top-1 and top-5 accuracy as plain floats.
test_acc_top1, test_acc_top5 = test_model(model=trained_model, test_loader=test_loader)