# VGG-16 PyTorch
Mayhem flows...<br>
Inspired by AlexNet's success and by the understanding of how important depth is for neural networks, the VGG networks (yep, there are multiple configurations) were presented to the world in 2014.

## Part I. Quick Overview & Goal Setting.
This network doesn't address the vanishing-gradient problem, so it's a bit hard to train (oh yay, it's deep).<br>
I don't expect anything crazy here (oh, and yeah, I'll use an ImageNet subset with 100 classes, so it's hard to expect great results).
The original VGG was trained for weeks, but my Kaggle quota is not limitless ;)<br>
The diagram below shows an example of the VGG network architecture.<br>

<img src="https://habrastorage.org/webt/9l/mi/6-/9lmi6-8zya2_tcw8sg3gctnmtv0.png" width=100%><br>
VGG architecture is straightforward (no skip-connections or inception blocks), but it's still important to clarify how such depth could be achieved.<br>
The key is stacking multiple convolutional layers with a small kernel size (3x3 is the smallest kernel that still captures spatial context: center, left, right, top, bottom).<br>
It has its benefits:<br>
- More non-linearities (ReLUs). Every convolutional layer is followed by a ReLU, so instead of applying ReLU once after a single layer with a big kernel, we apply it two or more times across the stacked convolutional layers.
- Same receptive field. We compensate for the smaller kernel size with a larger number of layers: two stacked 3x3 convolutions see a 5x5 region, and three see a 7x7 region.
- Fewer parameters. With the channel count kept at 3, three stacked 3x3x3 convolutions use 3*27=81 weights per filter, while one 7x7x3 convolution uses 147.

Practically, that's it. Once again, nothing crazy or mindblowing, just smarter usage of convolutional layers.<br>

## Data.
As mentioned earlier, I'll use ImageNet100 subset of RGB images.

## Part II. Data Loading

In [None]:
# Install the two extra packages imported below:
# torchmetrics (Accuracy) and torchinfo (summary).
!pip install torchmetrics
!pip install torchinfo

In [None]:
import torch
from torch import nn
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, ConcatDataset
from torch.utils.tensorboard import SummaryWriter

from torchmetrics import Accuracy
from torchinfo import summary
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import os

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Root folder of the dataset; the train.X1..train.X4 and val.X folders are read from here.
root_dir = Path("/kaggle/input/imagenet100")

In [None]:
# Preprocessing applied, in order, to every image loaded with this transform:
# resize to 256x256, cut a random 224x224 crop, mirror horizontally at random,
# then convert the image to a tensor.
# NOTE(review): ImageNet mean/std normalization
# (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) is currently not
# applied - confirm whether that is intentional.
_augmentation_steps = [
    transforms.Resize((256, 256)),
    transforms.RandomCrop((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
]
data_transform = transforms.Compose(_augmentation_steps)

In [None]:
# Evaluation has to be repeatable, so the validation split gets its own
# pipeline without random augmentation: same resize as training, but a fixed
# center crop and no flipping. Keep it in sync with `data_transform` if
# normalization is ever enabled there.
eval_transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
])

# The training images are spread over four folders.
train_dirs = [root_dir / f"train.X{i}" for i in range(1, 5)]
train_parts = [ImageFolder(folder, transform=data_transform) for folder in train_dirs]
test_dataset = ImageFolder(root_dir / "val.X", transform=eval_transform)

# ImageFolder numbers classes independently per root folder (sorted sub-folder
# names -> 0..k-1). If the train.X* folders do not all contain the same class
# sub-folders, identical label indices would denote different classes across
# parts and versus val.X. Remap every dataset onto one shared index so labels
# agree everywhere. When the indices already line up, nothing is changed.
# NOTE(review): the folder layout is not visible from this notebook - verify.
all_datasets = (*train_parts, test_dataset)
class_names = sorted({name for ds in all_datasets for name in ds.classes})
class_to_idx = {name: idx for idx, name in enumerate(class_names)}
for ds in all_datasets:
    local_to_global = [class_to_idx[name] for name in ds.classes]
    if local_to_global != list(range(len(local_to_global))):
        # Bound method of a list: picklable and maps local index -> shared index.
        ds.target_transform = local_to_global.__getitem__

# Keep the original per-part names available for any later cell that uses them.
part_0, part_1, part_2, part_3 = train_parts

train_dataset = ConcatDataset(train_parts)

In [None]:
# Notebook display: show the repr of both dataset objects as the cell output.
train_dataset, test_dataset

In [None]:
# os.cpu_count() returns None when the CPU count cannot be determined, and
# DataLoader needs an integer; fall back to 0 (load in the main process).
num_workers = os.cpu_count() or 0

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              num_workers=num_workers)

# Evaluation batches keep the dataset order.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False,
                             num_workers=num_workers)

train_dataloader, test_dataloader

In [None]:
# Preview the first image of one training batch, titled with its label index.
image_batch, label_batch = next(iter(train_dataloader))
image, label = image_batch[0], label_batch[0]

# Show the plain integer label instead of the tensor repr ("tensor(5)").
plt.title(str(int(label)))
plt.axis("off")
# The image tensor is channels-first; imshow wants channels last.
plt.imshow(image.permute(1, 2, 0));

## Part III. Model

In [None]:
class VGG16(nn.Module):
    """VGG-16 image classifier: 13 convolutional and 3 fully connected layers.

    The convolutional part is five blocks of 3x3 convolutions (padding 1, so
    the spatial size is preserved), each convolution followed by ReLU and each
    block closed by a 2x2 max-pool with stride 2. The result is flattened and
    fed to a three-layer classifier with dropout.

    The first linear layer is sized for a 7x7x512 feature map, i.e. the model
    expects 3-channel 224x224 inputs (224 halved by five pooling layers).

    Args:
        num_classes: Number of output logits. Defaults to 100, the value that
            was previously hard-coded.
    """

    # (number of 3x3 convolutions, output channels) for each of the five blocks.
    _BLOCKS = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))

    def __init__(self, num_classes: int = 100):
        super().__init__()

        # Layers go into one flat Sequential in the same order as before,
        # so parameter names (and therefore saved state_dicts) stay the same.
        layers = []
        channels = 3
        for conv_count, out_channels in self._BLOCKS:
            for _ in range(conv_count):
                layers.append(nn.Conv2d(in_channels=channels, out_channels=out_channels,
                                        kernel_size=(3, 3), padding=1))
                layers.append(nn.ReLU())
                channels = out_channels
            layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=2))
        layers.append(nn.Flatten())
        self.convolutional = nn.Sequential(*layers)

        self.dense = nn.Sequential(
            nn.Linear(in_features=7 * 7 * channels, out_features=4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=num_classes)
        )

    def forward(self, x):
        """Return raw class logits for a batch of images."""
        return self.dense(self.convolutional(x))

In [None]:
# Build the network and move it to the selected device.
vgg = VGG16().to(device)
# Print a per-layer summary for an input of shape (64, 3, 224, 224).
summary(vgg, input_size=(64, 3, 224, 224))

In [None]:
def train_step(model: nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device,
               num_classes: int = 100):
    """Train ``model`` for one full pass over ``dataloader``.

    Args:
        model: Network to optimize; it is switched to training mode.
        dataloader: Yields ``(image_batch, label_batch)`` pairs.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches and the accuracy metric are moved to.
        num_classes: Number of classes for the accuracy metric (default 100,
            the previously hard-coded value).

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and accuracy
        averaged over the number of batches (every batch weighs the same,
        whatever its size). Both values are detached from the autograd graph.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    accuracy_fn = Accuracy("multiclass", num_classes=num_classes).to(device)
    running_loss, running_acc = 0, 0

    model.train()

    for image_batch, label_batch in dataloader:
        image_batch = image_batch.to(device)
        label_batch = label_batch.to(device)

        logits = model(image_batch)
        batch_loss = loss_fn(logits, label_batch)

        optimizer.zero_grad()
        batch_loss.backward()
        # Clip the gradients of the model passed in, not of a global variable.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()

        # Accumulate detached values so the running sum does not keep every
        # batch's autograd graph alive for the whole epoch.
        running_loss += batch_loss.detach()
        # Softmax is monotonic, so the arg-max of the logits is already the
        # predicted class.
        predicted = logits.detach().argmax(dim=1)
        running_acc += accuracy_fn(predicted, label_batch)

    running_loss /= len(dataloader)
    running_acc /= len(dataloader)

    return running_loss, running_acc


def test_step(model: nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: nn.Module,
              device: torch.device,
              num_classes: int = 100):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Yields ``(image_batch, label_batch)`` pairs.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        device: Device the batches and the accuracy metric are moved to.
        num_classes: Number of classes for the accuracy metric (default 100,
            the previously hard-coded value).

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and accuracy
        averaged over the number of batches (every batch weighs the same,
        whatever its size).

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    accuracy_fn = Accuracy("multiclass", num_classes=num_classes).to(device)
    running_loss, running_acc = 0, 0

    model.eval()

    with torch.inference_mode():
        for image_batch, label_batch in dataloader:
            image_batch = image_batch.to(device)
            label_batch = label_batch.to(device)

            logits = model(image_batch)
            # Softmax is monotonic, so the arg-max of the logits is already
            # the predicted class.
            predicted = logits.argmax(dim=1)

            running_loss += loss_fn(logits, label_batch)
            running_acc += accuracy_fn(predicted, label_batch)

        # The accumulators were created under inference mode, so the in-place
        # division is kept inside the same context.
        running_loss /= len(dataloader)
        running_acc /= len(dataloader)
    return running_loss, running_acc

In [None]:
# Classification loss computed from the model's output and the integer labels.
loss_fn = nn.CrossEntropyLoss()

# SGD with momentum and weight decay over all parameters of the network.
optimizer = torch.optim.SGD(
    vgg.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=0.0005,
)

# Multiply the learning rate by 0.1 after every 4 calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=4,
    gamma=0.1,
)

In [None]:
EPOCHS = 20
# NOTE(review): the seeds are set after the model and dataloaders were
# created, so they do not affect weight initialization - move the seeding
# before the model cell if full reproducibility is wanted.
torch.manual_seed(42)
torch.cuda.manual_seed(42)


# Per-epoch history of the values returned by train_step / test_step.
results = {
    "train_loss": [],
    "train_acc": [],
    "test_loss": [],
    "test_acc": []
}

writer = SummaryWriter()

try:
    for epoch in range(EPOCHS):
        train_loss, train_acc = train_step(vgg, train_dataloader, loss_fn, optimizer, device)
        test_loss, test_acc = test_step(vgg, test_dataloader, loss_fn, device)

        # float() turns 0-dim tensors into plain numbers, so the log line is
        # readable; the " | " between the train and test halves was missing.
        print(f"EPOCH: {epoch} | "
              f"tr_loss: {float(train_loss):.4f} | tr_acc: {float(train_acc):.4f} | "
              f"ts_loss: {float(test_loss):.4f} | ts_acc: {float(test_acc):.4f}"
             )

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/test', test_loss, epoch)
        writer.add_scalar('Accuracy/train', train_acc, epoch)
        writer.add_scalar('Accuracy/test', test_acc, epoch)

        # One scheduler step per epoch.
        scheduler.step()
finally:
    # Flush pending events to disk even if training is interrupted.
    writer.close()