# Train AlexNet on CIFAR-10

In this notebook is an implementation and the training of the AlexNet Convolutional Neural Network on CIFAR-10 dataset.

In [1]:
import torch
import torch.nn as nn
from torchinfo import summary
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

from pathlib import Path
import numpy as np

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build AlexNet model

The original AlexNet proposed in the paper was designed to run on 2 GPUs due to the complexity of task at that time (2012). A re-implementation is proposed later to run ona single GPU.
Bellow are the two implementations: ``AlexNetOriginal`` and ``AlexNetNew``

In [3]:
class AlexNetOriginal(nn.Module):

    def __init__(self, num_classes: int = 1000) -> None:
        super().__init__()

        self.features = nn.Sequential(
            # layer 1
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0), # output: 96x55x55
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2), # output: 96x27x27
            # layer 2
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2), # output: 256x27x27
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2), # output: 256x13x13
            # layer 3
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1), # output: 384x13x13
            nn.ReLU(inplace=True),
            # layer 4
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1), # output: 384x13x13 
            nn.ReLU(inplace=True),
            # layer 5
            nn.Conv2d(384, 256, kernel_size=3, padding=1), # output: 265x13x13 
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2) # output: 265x6x6
        )

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 6 * 6, 4096), # output: 4096
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096), # output: 4096
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes) # output: num_classes
        )

    def forward(self, x: torch.tensor) -> torch.tensor:
        x = self.features(x)
        x = torch.flatten(x, 1)
        # x = x.reshape(x.size(0), -1)
        x = self.classifier(x)

        return x


In [4]:
class AlexNetNew(nn.Module):
    def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None:
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

In [5]:
model_original = AlexNetOriginal()

In [6]:
model_new = AlexNetNew()

In [7]:
if torch.cuda.is_available():
    model_original.cuda()
    model_new.cuda()


Original model summary

In [8]:
summary(model_original, input_size=(1, 3, 227, 227))

Layer (type:depth-idx)                   Output Shape              Param #
AlexNetOriginal                          [1, 1000]                 --
├─Sequential: 1-1                        [1, 256, 6, 6]            --
│    └─Conv2d: 2-1                       [1, 96, 55, 55]           34,944
│    └─ReLU: 2-2                         [1, 96, 55, 55]           --
│    └─MaxPool2d: 2-3                    [1, 96, 27, 27]           --
│    └─Conv2d: 2-4                       [1, 256, 27, 27]          614,656
│    └─ReLU: 2-5                         [1, 256, 27, 27]          --
│    └─MaxPool2d: 2-6                    [1, 256, 13, 13]          --
│    └─Conv2d: 2-7                       [1, 384, 13, 13]          885,120
│    └─ReLU: 2-8                         [1, 384, 13, 13]          --
│    └─Conv2d: 2-9                       [1, 384, 13, 13]          1,327,488
│    └─ReLU: 2-10                        [1, 384, 13, 13]          --
│    └─Conv2d: 2-11                      [1, 256, 13, 13]       

New Implementation model

In [9]:
summary(model_new, input_size=(1, 3, 227, 227))

Layer (type:depth-idx)                   Output Shape              Param #
AlexNetNew                               [1, 1000]                 --
├─Sequential: 1-1                        [1, 256, 6, 6]            --
│    └─Conv2d: 2-1                       [1, 64, 56, 56]           23,296
│    └─ReLU: 2-2                         [1, 64, 56, 56]           --
│    └─MaxPool2d: 2-3                    [1, 64, 27, 27]           --
│    └─Conv2d: 2-4                       [1, 192, 27, 27]          307,392
│    └─ReLU: 2-5                         [1, 192, 27, 27]          --
│    └─MaxPool2d: 2-6                    [1, 192, 13, 13]          --
│    └─Conv2d: 2-7                       [1, 384, 13, 13]          663,936
│    └─ReLU: 2-8                         [1, 384, 13, 13]          --
│    └─Conv2d: 2-9                       [1, 256, 13, 13]          884,992
│    └─ReLU: 2-10                        [1, 256, 13, 13]          --
│    └─Conv2d: 2-11                      [1, 256, 13, 13]         

## Prepare training and validation data

Next, we will prepare our `DataLoader` with the CIFAT-10 dataset.

The input images will undergo these transformations:
 * Resize from 3x32x32 to 3x227x227
 * Normalization using mean=[0.4914, 0.4822, 0.4465] and std=[0.2023, 0.1994, 0.2010]

Training images are also augmented to improve the generalization of the model:

 * Random crops of size 32x32 with padding 4
 * Random horizontal flips

In [19]:
def get_loaders(
    data_dir: Path,
    batch_size: int,
    augment: bool,
    shuffle: bool,
    random_seed: int,
    val_size: float = 0.1,
    ):

    # Transformations

    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    ## validation transform
    valid_transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    ## test transform

    test_transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    ## Train transform
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(0.5),
            transforms.Resize((227, 227)),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    # Load dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=train_transform,
    )

    val_dataset = datasets.CIFAR10(
        root=data_dir ,train=True, download=True, transform=valid_transform,
    )

    test_dataset = datasets.CIFAR10(
        root=data_dir ,train=False, download=True, transform=test_transform,
    )


    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(val_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)
 
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, sampler=valid_sampler)

    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size
    )

    return train_loader, val_loader, test_loader


In [6]:
batch_size = 64
random_seed = 1
augment = True

In [20]:
train_loader, val_loader, test_loader = get_loaders(Path("./data"), batch_size=batch_size, random_seed=random_seed, augment=augment, shuffle=True)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


## Training our models

In [8]:
num_classes = 10
num_epochs = 20
batch_size = 64
learning_rate = 0.005
weight_decay_rate = 0.005
momentum = 0.9

In [9]:
model = AlexNetOriginal(num_classes).to(device)

In [10]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay_rate, momentum=momentum)

### Start the training

In [11]:
total_steps = len(train_loader)

In [12]:
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):

        # move tensors to device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs} Loss: {loss.item():.4f}]')

    # validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, preds = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()
            del images, labels, outputs

        print(f'Validation accuracy: {100 * correct / total:.2f} % ')


Epoch [1/20 Loss: 1.8184]
Validation accuracy: 34.14 % 
Epoch [2/20 Loss: 1.6470]
Validation accuracy: 45.12 % 
Epoch [3/20 Loss: 1.1431]
Validation accuracy: 50.20 % 
Epoch [4/20 Loss: 1.1385]
Validation accuracy: 56.74 % 
Epoch [5/20 Loss: 0.7557]
Validation accuracy: 61.12 % 
Epoch [6/20 Loss: 1.9326]
Validation accuracy: 63.26 % 
Epoch [7/20 Loss: 0.3600]
Validation accuracy: 63.94 % 
Epoch [8/20 Loss: 1.6786]
Validation accuracy: 67.22 % 
Epoch [9/20 Loss: 1.3729]
Validation accuracy: 67.04 % 
Epoch [10/20 Loss: 1.3579]
Validation accuracy: 70.72 % 
Epoch [11/20 Loss: 0.9584]
Validation accuracy: 74.32 % 
Epoch [12/20 Loss: 0.3411]
Validation accuracy: 75.58 % 
Epoch [13/20 Loss: 0.4991]
Validation accuracy: 73.04 % 
Epoch [14/20 Loss: 0.6647]
Validation accuracy: 75.78 % 
Epoch [15/20 Loss: 1.7162]
Validation accuracy: 72.38 % 
Epoch [16/20 Loss: 0.7227]
Validation accuracy: 76.38 % 
Epoch [17/20 Loss: 0.3848]
Validation accuracy: 76.56 % 
Epoch [18/20 Loss: 0.5870]
Validation ac

In [21]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print(f'Test accuracy: {100 * correct / total:.2f} % ')

Test accuracy: 78.89 % 
