# VGG-16 Pytorch Implementation
### Code written following this tutorial blog: https://blog.paperspace.com/vgg-from-scratch-pytorch/
### VGG paper https://arxiv.org/pdf/1409.1556.pdf?ref=blog.paperspace.com

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

# DirectML for AMD compatibility
import torch_directml

In [2]:
# Device configuration
device = torch_directml.device(torch_directml.default_device())
print(device)

privateuseone:0


## Data Loaders
### Some notes on this data:
  - These data loaders create 228x228 images whereas the original VGG paper states that the images inputted to the model were 224x224
  - The images from the CIFAR-10 dataset have only a 32x32 resolution. The original VGG model was trained with the ILSVRC dataset which had an average resolution of 469x387

In [3]:
# Create a data loader for handling the CIFAR-10 dataset
  # this version of CIFAR  60,000 images with 10 different animal classes
def data_loader(data_dir,
                batch_size,
                random_seed=42,
                valid_size=0.1,
                shuffle=True,
                test=False):

    # mean and std of the rgb values in the images
    normalize = transforms.Normalize( # tutorial claimed these values available online
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    transform = transforms.Compose([
            transforms.Resize((227, 227)), # in vgg paper, images are 224x224, not sure why they chose 227 (228x228)
            transforms.ToTensor(),
            normalize,
    ])

    ### Test Data ###
    if test:
        dataset = datasets.CIFAR10(
          root=data_dir, train=False,
          download=True, transform=transform,
        )

        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle
        )

        return data_loader

    ### Training and Validation Data ###
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)

In [4]:
# Data loaders
train_loader, valid_loader = data_loader(data_dir='./data',
                                         batch_size=32)

test_loader = data_loader(data_dir='./data',
                              batch_size=32,
                              test=True)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


## VGG-16

Some notes on this implementation:
  - After each convolution, nn.BatchNorm2d() is used to normalize all the convolution channel outputs. However, the original paper explicitly states that local normalization does not improve performance.
  - The forward function uses out.reshape(out.size(0), -1) to change the data outputted by the convolution layers before they are sent to the linear/fully connected layers. However, there is no mention of any data transformation between these layers in the original VGG paper.

In [5]:
# Design the model's neural network
class VGG16(nn.Module): # all torch nn models must subclass nn.Module

    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()

        # Convolution layer: 3 input channels (rgb), 64 output channels, 3x3 kernel
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64), # normalize the convolution output
            nn.ReLU())  # activation function essentially throws out values < 0

        # Convolution layer: 64 input, 64 output, 3x3 kernel
        # Max pooling into a 2x2 kernel
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))

        # Convolution layer: 64 input, 128 output, 3x3 kernel
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU())

        # Convolution layer: 128 input, 128 output, 3x3 kernel
        # Max pooling into a 2x2 kernel
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))

        # Convolution layer: 128 input, 256 output, 3x3 kernel
        self.layer5 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())

        # Convolution layer: 256 input, 256 output, 3x3 kernel
        self.layer6 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())

        # Convolution layer: 256 input, 256 output, 3x3 kernel
        # Max pooling into a 2x2 kernel
        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))

        # Convolution layer: 256 input, 512 output, 3x3 kernel
        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())

        # Convolution layer: 512 input, 512 output, 3x3 kernel
        self.layer9 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())

        # Convolution layer: 512 input, 512 output, 3x3 kernel
        # Max pooling into a 2x2 kernel
        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))

        # Convolution layer: 512 input, 512 output, 3x3 kernel
        self.layer11 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())

        # Convolution layer: 512 input, 512 output, 3x3 kernel
        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())

        # Convolution layer: 512 input, 512 output, 3x3 kernel
        # Max pooling into a 2x2 kernel
        self.layer13 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))

        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7 * 7 * 512, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())

        # This is the layer that performs the classification
          # it takes the 4096 input channels from fc1 and outputs probabilities of each class in CIFAR
          # the outputs (num_classes) depend on if we classify super or fine classes in CIFAR-100 (10 or 100 classes)
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))

    # Define how the layers are connected
    def forward(self, x):
      out = self.layer1(x)
      out = self.layer2(out)
      out = self.layer3(out)
      out = self.layer4(out)
      out = self.layer5(out)
      out = self.layer6(out)
      out = self.layer7(out)
      out = self.layer8(out)
      out = self.layer9(out)
      out = self.layer10(out)
      out = self.layer11(out)
      out = self.layer12(out)
      out = self.layer13(out)
      out = out.reshape(out.size(0), -1) # Not fully sure what this is doing or if it is true to the original VGG
      out = self.fc(out)
      out = self.fc1(out)
      out = self.fc2(out)
      return out

## Training the VGG model

In [6]:
### Hyper Parameters ###

num_classes = 10
num_epochs = 10
learning_rate = 0.005

model = VGG16(num_classes).to(device) # use GPU


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)

total_step = len(train_loader)

In [7]:
total_params = sum(p.numel() for p in model.parameters())
print(f'Total number of parameters in this implementation of VGG-16: {total_params}')

Total number of parameters in this implementation of VGG-16: 134309962


In [8]:
# Train the model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
        
    # Validation
      # a validation dataset allows us to see model progress along the way while saving our true test data for the end
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/10], Step [1/1407], Loss: 2.3568
Epoch [1/10], Step [2/1407], Loss: 2.2859
Epoch [1/10], Step [3/1407], Loss: 2.3691
Epoch [1/10], Step [4/1407], Loss: 2.3866
Epoch [1/10], Step [5/1407], Loss: 2.4249
Epoch [1/10], Step [6/1407], Loss: 2.3476
Epoch [1/10], Step [7/1407], Loss: 2.5554
Epoch [1/10], Step [8/1407], Loss: 2.2738
Epoch [1/10], Step [9/1407], Loss: 2.2556
Epoch [1/10], Step [10/1407], Loss: 2.3521
Epoch [1/10], Step [11/1407], Loss: 2.4023
Epoch [1/10], Step [12/1407], Loss: 2.5231
Epoch [1/10], Step [13/1407], Loss: 2.2868
Epoch [1/10], Step [14/1407], Loss: 2.2056
Epoch [1/10], Step [15/1407], Loss: 2.6654
Epoch [1/10], Step [16/1407], Loss: 2.7132
Epoch [1/10], Step [17/1407], Loss: 2.5159
Epoch [1/10], Step [18/1407], Loss: 2.4275
Epoch [1/10], Step [19/1407], Loss: 2.5637
Epoch [1/10], Step [20/1407], Loss: 2.4916
Epoch [1/10], Step [21/1407], Loss: 2.2815
Epoch [1/10], Step [22/1407], Loss: 2.3329
Epoch [1/10], Step [23/1407], Loss: 2.4047
Epoch [1/10], Step [

## Testing

In [9]:
# Evaluation on the test dataset
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))

Accuracy of the network on the 10000 test images: 81.16 %


In [10]:
torch.save(model.state_dict(), './VGG-16_CIFAR-10_228x228.pt')

### VGG-16 with 128x128 images 10 epochs on CIFAR-10
    ~82.61%
### VGG-16 with 228x228 images 10 epochs on CIFAR-10
    ~81.16%