1.   ### The original implementation used:

Multi-GPU training     
Dense Prediction   
Data parallelism (each training batch split across GPUs)

2.   ### Full ImageNet training used:

1.2M training images

50K validation images

Multi-scale training (256-512px random resizing)

3.   ### Modern implementations often:

Use Adam instead of SGD    
Apply batch normalization    
Use different LR schedules     

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms, datasets

class VGG16(nn.Module):
    """VGG-style classifier with 13 convolutional and 3 fully connected layers.

    The convolutional stack halves the spatial resolution five times (a total
    stride of 32), so a 224x224 input yields a 512x7x7 feature map. An
    adaptive average pool then forces the feature map to 7x7 before the
    classifier, which lets the network accept input sizes other than 224x224
    (for example during multi-scale training or evaluation). For 224x224
    inputs the pool is an exact identity, and because it has no parameters
    the ``state_dict`` layout is unchanged.

    Args:
        num_classes: Size of the final classification layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Feature extraction (convolutional blocks)
        self.features = nn.Sequential(
            # Block 1 (64 channels)
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 224x224 -> 112x112

            # Block 2 (128 channels)
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 112x112 -> 56x56

            # Block 3 (256 channels)
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 56x56 -> 28x28

            # Block 4 (512 channels)
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 28x28 -> 14x14

            # Block 5 (512 channels)
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 14x14 -> 7x7
        )

        # Parameter-free pooling to a fixed 7x7 grid so the first Linear layer
        # always receives 512*7*7 features, whatever the input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        # Classifier (fully connected layers)
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),  # First FC layer
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, 4096),  # Second FC layer
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, num_classes),  # Final classification layer
        )

    def forward(self, x):
        """Return class logits of shape [batch_size, num_classes].

        Args:
            x: Image batch of shape [batch_size, 3, H, W]. H and W must be at
                least 32 so the five pooling stages leave a non-empty map.
        """
        x = self.features(x)     # Conv blocks
        x = self.avgpool(x)      # Force a 7x7 spatial grid
        x = torch.flatten(x, 1)  # Flatten to [batch_size, 512*7*7]
        x = self.classifier(x)   # FC layers
        return x

# Instantiate the full model with the default 1000-class output layer
vgg16 = VGG16()

# Print architecture summary (the nested module listing from nn.Module's repr)
print(vgg16)

In [None]:


# 1. Data loading & preprocessing for the training split
# Per-channel RGB statistics used to normalize the input tensors.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    # Random scale/aspect-ratio crop, resized to the 224x224 network input.
    transforms.RandomResizedCrop(224),
    # Mirror the image left-right half of the time.
    transforms.RandomHorizontalFlip(),
    # PIL image -> float tensor, then channel-wise standardization.
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# (Replace with actual ImageNet path)
# ImageFolder derives the labels from the class sub-directories under root.
train_dataset = datasets.ImageFolder(
    root='/path/to/imagenet/train',
    transform=train_transform,
)

# Shuffled mini-batches of 256 images, decoded by 4 worker processes.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=256,
    shuffle=True,
    num_workers=4,
)

# 2. Model Initialization (Original Config)
# Fresh network with a 1000-way output layer; this is the instance that the
# rest of this cell initializes, trains and saves.
model = VGG16(num_classes=1000)  # Using previous VGG16 class

# 3. Weight Initialization (Glorot/Bengio 2010)
def init_weights(m):
    """Give conv and linear layers Xavier-normal weights and zero biases.

    Written to be passed to ``nn.Module.apply``, which calls it once per
    submodule. Modules of any other type are left untouched, and layers
    built without a bias are handled.

    Args:
        m: The module to (possibly) re-initialize in place.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return

    nn.init.xavier_normal_(m.weight)
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Re-initialize every submodule of the model with init_weights.
model.apply(init_weights)

# 4. Loss & optimizer: cross-entropy trained with SGD + momentum
criterion = nn.CrossEntropyLoss()

sgd_settings = dict(
    lr=0.01,            # initial learning rate
    momentum=0.9,
    weight_decay=5e-4,  # L2 penalty
)
optimizer = optim.SGD(model.parameters(), **sgd_settings)

# 5. Learning Rate Scheduler
# Divide the LR by 10 when validation accuracy stops improving. The scheduler
# must be fed a real metric: with a constant value it would cut the LR every
# other epoch until training stalls. The deprecated `verbose` flag is omitted
# because the per-epoch log below already reports the current LR.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1,
                              patience=1)

# Validation data that drives the scheduler: deterministic resize + centre
# crop, with the same normalization as the training pipeline.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# (Replace with actual ImageNet path; ImageFolder expects one sub-directory
# per class, laid out like the training split.)
val_dataset = datasets.ImageFolder(root='/path/to/imagenet/val',
                                   transform=val_transform)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=256, shuffle=False, num_workers=4)


def _top1_accuracy(net, loader, device):
    """Return the fraction of samples in ``loader`` that ``net`` classifies correctly.

    Switches ``net`` to eval mode (the caller is responsible for switching
    back to train mode) and runs without gradient tracking. Returns 0.0 for
    an empty loader.
    """
    net.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            predictions = net(images).argmax(dim=1)
            correct += (predictions == labels).sum().item()
            seen += labels.size(0)
    return correct / seen if seen else 0.0


# 6. Training Loop (Simplified version of paper's 74-epoch setup)
NUM_EPOCHS = 74          # Total epochs from paper
MAX_ITERATIONS = 370000  # Iteration budget (about 74 epochs at batch size 256)

# Run batches on whatever device the model's parameters already live on, so
# the loop works whether or not the model was moved to a GPU beforehand.
device = next(model.parameters()).device

total_iterations = 0
for epoch in range(NUM_EPOCHS):
    model.train()  # re-enable dropout after the previous validation pass
    epoch_loss = 0.0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        total_iterations += 1

    # Validation: top-1 accuracy over the held-out split
    val_acc = _top1_accuracy(model, val_loader, device)

    # Learning rate schedule: reduce when validation accuracy plateaus
    scheduler.step(val_acc)

    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {mean_loss:.4f} | "
          f"LR: {optimizer.param_groups[0]['lr']} | "
          f"Val Acc: {val_acc:.4f}")

    # Stop once the iteration budget is used up (checked at epoch boundaries)
    if total_iterations >= MAX_ITERATIONS:
        break

# 7. Final Model Saving
# Only the state_dict (the model's tensors keyed by name) is written, not the
# module object itself, so loading means constructing VGG16 first and then
# calling load_state_dict. The file goes to the current working directory.
torch.save(model.state_dict(), 'vgg16_imagenet.pth')