In [1]:
# Importing necessary libraries for neural networks and PyTorch 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Import CIFAR-10 dataset and transform from torchvision
from torchvision import datasets, transforms

In [2]:
# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
# Data pipeline: augmented transforms for training, plain transforms for testing.
# Both pipelines finish with the same tensor conversion and per-channel normalisation.
channel_mean = (0.5, 0.5, 0.5)   # Per-channel mean used by Normalize (R, G, B)
channel_std = (0.5, 0.5, 0.5)    # Per-channel standard deviation used by Normalize (R, G, B)

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),                      # Random left-right mirror
    transforms.RandomCrop(32, padding=4),                   # Pad by 4 pixels, then take a random 32x32 crop
    transforms.ColorJitter(brightness=0.2, contrast=0.2,
                           saturation=0.2, hue=0.2),        # Random brightness / contrast / saturation / hue changes
    transforms.RandomRotation(15),                          # Random rotation between -15 and +15 degrees
    transforms.ToTensor(),                                  # Converts image to Torch Tensor
    transforms.Normalize(channel_mean, channel_std),        # (x - mean) / std for each RGB channel
])

# No augmentation at test time: only the deterministic steps are kept
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# CIFAR-10 splits, downloaded into ./data when they are not already there
data_root = './data'
training_set = datasets.CIFAR10(root=data_root, train=True, transform=train_transform, download=True)
test_set = datasets.CIFAR10(root=data_root, train=False, transform=test_transform, download=True)

# Settings common to both loaders: 128 images per batch, 4 worker subprocesses,
# and pinned host memory to speed up transfers to the GPU
loader_options = dict(batch_size=128, num_workers=4, pin_memory=True)
training_loader = torch.utils.data.DataLoader(training_set, shuffle=True, **loader_options)   # Reshuffled every epoch
test_loader = torch.utils.data.DataLoader(test_set, shuffle=False, **loader_options)          # Fixed order


# Sanity check: fetch the first training sample to verify loading
first_image, first_label = training_set[0]
print("First image shape:", first_image.shape)
print("First image label:", first_label)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29272743.49it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
First image shape: torch.Size([3, 32, 32])
First image label: 6


In [4]:
# Stem: initial conv layer to extract base features
class Stem(nn.Module):
    """First stage of the network: one 3x3 convolution that lifts the image to C_out feature maps.

    Args:
        C_in: number of input channels (3 for RGB images).
        C_out: number of feature maps produced.
    """

    def __init__(self, C_in=3, C_out=128):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged
        self.conv = nn.Conv2d(in_channels=C_in, out_channels=C_out, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(num_features=C_out)
        self.g = nn.ReLU()                 # 'g' => non-linearity
        self.dropout = nn.Dropout(p=0.2)   # Regularisation; only active in training mode

    def forward(self, X):
        """Apply convolution -> batch norm -> ReLU -> dropout to X and return the result."""
        features = self.conv(X)
        features = self.bn(features)
        features = self.g(features)
        return self.dropout(features)


# Expert Branch: predicts the Soft Attention Vector = a = [a1, ..., aK] from input X
class ExpertBranch(nn.Module):
    """Predicts the soft attention vector a = [a1, ..., aK] from a feature map X.

    X is averaged over its spatial dimensions to one value per channel, passed
    through a reduced hidden layer and projected to K scores, which are
    softmax-normalised so that every row sums to 1.

    Args:
        C: number of input channels.
        K: number of attention weights to produce (one per convolution path).
        r: reduction factor; the hidden layer has C // r units, but never fewer than one.
    """

    def __init__(self, C, K=3, r=4):
        super().__init__()
        # C // r is 0 whenever C < r, which would create a zero-width bottleneck whose
        # output no longer depends on the input => keep at least one hidden unit
        hidden_units = max(1, C // r)
        self.avgpool = nn.AdaptiveAvgPool2d(1)    # Spatial average pool => Reduces Spatial Dimensions to (1,1)
        self.fc1 = nn.Linear(C, hidden_units)     # Reduce Channels by 'r'
        self.fc2 = nn.Linear(hidden_units, K)     # Takes in output of 'fc1' and projects it to 'K' attention weights

    def forward(self, X):
        """Return the attention weights for X (shape (B, C, H, W)) as a (B, K) tensor."""
        B, C, H, W = X.shape                      # B = Batch, C = Channels, H = Height, W = Width
        X_prime = self.avgpool(X).view(B, C)      # (B, C, 1, 1) -> (B, C)
        hidden = F.relu(self.fc1(X_prime))        # FC1 + ReLU: squeeze to the reduced width
        a = F.softmax(self.fc2(hidden), dim=1)    # FC2 + Softmax => a = [a1, a2..., aK] (Attention Weights)
        return a                                  # 'K' values per sample, all adding up to 1


# ConvKBranch: applies K-many convolutional branches and combines them using 'a' (Attention Weights)
class ConvKBranch(nn.Module):
    """Runs K parallel 3x3 convolutions and blends their outputs with the attention weights 'a'.

    Args:
        C_in: channels going in.
        C_out: channels coming out.
        K: number of parallel convolution paths.
    """

    def __init__(self, C_in, C_out, K=3):
        super().__init__()
        self.K = K                                # Number of paths
        paths = []
        for _ in range(K):
            # padding=1 keeps the output the same spatial size as the input
            paths.append(nn.Conv2d(C_in, C_out, kernel_size=3, padding=1))
        self.ConvK = nn.ModuleList(paths)
        self.bn = nn.BatchNorm2d(C_out)           # Normalises the blended output
        self.g = nn.ReLU()                        # Keeps only positive values

    def forward(self, X, a):
        """Return g(bn(sum_k a[:, k] * Conv_k(X))).

        'a' is indexed as a[:, k], i.e. one weight per sample for each of the K paths.
        """
        def weighted_path(k):
            # Weight of path k for every sample, shaped to broadcast over (C, H, W)
            weight = a[:, k].reshape(-1, 1, 1, 1)
            return weight * self.ConvK[k](X)

        blended = sum(weighted_path(k) for k in range(self.K))
        return self.g(self.bn(blended))


# Block: Combines classes 'ExpertBranch' and 'ConvKBranch'
class Block(nn.Module):
    """One backbone block: an ExpertBranch produces attention weights that a ConvKBranch uses to blend its K convolutions.

    Args:
        C_in: channels going in.
        C_out: channels coming out.
        K: number of convolution paths (and attention weights).
        r: reduction ratio of the expert branch.
    """

    def __init__(self, C_in, C_out, K=3, r=4):
        super().__init__()
        self.expert = ExpertBranch(C=C_in, K=K, r=r)                       # Generates the soft attention vector
        self.conv_branch = ConvKBranch(C_in=C_in, C_out=C_out, K=K)        # Weighted sum of the K convolutions

    def forward(self, X):
        """Compute the attention weights from X, then return the attention-weighted convolution output."""
        return self.conv_branch(X, self.expert(X))


# Classifier: Classifier for CIFAR-10 (10 classes)
class Classifier(nn.Module):
    """Classification head: global average pool -> flatten -> dropout -> linear layer.

    Args:
        C_in: number of channels of the incoming feature map.
        num_classes: number of output logits (10 for CIFAR-10).
    """

    def __init__(self, C_in, num_classes=10):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)         # Average pooling to reduce spatial dimensions to (1,1)
        self.flatten = nn.Flatten()                 # Collapses everything after the batch dimension
        self.dropout = nn.Dropout(0.1)              # Adds a minor dropout
        self.fc = nn.Linear(C_in, num_classes)      # Final layer

    def forward(self, X):
        """Return one row of num_classes logits per sample of X."""
        # A single flatten is enough. The former extra `.view(X.size(0), -1)` did the
        # same reshape a second time and raised on a zero-sized batch, where -1 cannot be inferred.
        X = self.pool(X)       # (B, C, H, W) -> (B, C, 1, 1)
        X = self.flatten(X)    # (B, C, 1, 1) -> (B, C)
        X = self.dropout(X)
        return self.fc(X)      # Outputs logits, one per class


# Full CNN Model: Stem → Blocks(Expert Branch + K-many Convolutional Layers) → Classifier
class Cifar10Model(nn.Module):
    """Full network: Stem -> N Blocks (expert branch + K convolutions each) -> Classifier.

    Args:
        N: number of backbone blocks.
        num_classes: number of output logits.
        K: number of convolution paths inside every block.
        C_in: number of channels of the input images (3 for RGB).
        C: number of feature maps used by the stem, every block and the classifier.
    """

    def __init__(self, N=6, num_classes=10, K=3, C_in=3, C=128):
        super().__init__()
        # The width 'C' is shared by all stages so the blocks can be stacked freely
        self.stem = Stem(C_in, C)
        self.backbone = nn.ModuleList([
            Block(C, C, K=K) for _ in range(N)
        ])
        self.classifier = Classifier(C, num_classes)

    def forward(self, X):
        """Return the class logits for a batch of images X."""
        X = self.stem(X)
        for block in self.backbone:
            X = block(X)              # Pass through the expert blocks one after another
        return self.classifier(X)


In [None]:
# Model Setup
model = Cifar10Model(N=6, num_classes=10, K=3).to(device) # 6 Blocks, 10 classes, 3 Convolutional Layers
log_loss = nn.CrossEntropyLoss() # Loss function for classification; returns the MEAN loss of a batch by default
optimiser = optim.Adam(model.parameters(), lr=0.01) # Adam optimiser with Learning Rate = 0.01

# Scheduler: multiplies the LR by 0.1 every 60 epochs to aid with accuracy and tuning
scheduler = torch.optim.lr_scheduler.StepLR(optimiser, step_size=60, gamma=0.1)

epochs = 120 # Total no. epochs
# Per-epoch histories, used when plotting graphs
train_losses = []
test_losses = []
test_accuracies = []
train_accuracies = []

best_acc = 0  # Highest test accuracy seen so far => decides when a checkpoint is saved


def _move_batch(inputs, labels, device):
    """Move one mini-batch to `device`.

    non_blocking=True lets the copy overlap with other work when the batch sits in
    pinned memory (the loaders use pin_memory=True); otherwise it behaves like a normal copy.
    """
    return inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)


def _evaluate(model, loader, criterion, device):
    """Run `model` over `loader` in evaluation mode and return (mean loss per sample, accuracy in %).

    The batch losses are weighted by batch size, so a smaller final batch does not
    count as much as a full one.
    """
    model.eval() # Dropout is switched off and BatchNorm uses its running statistics
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad(): # Disable gradient calculation => Saves memory
        for inputs, labels in loader:
            inputs, labels = _move_batch(inputs, labels, device)
            outputs = model(inputs) # Forward pass -> logits for the batch
            batch_size = labels.size(0)

            loss_sum += criterion(outputs, labels).item() * batch_size # Batch mean -> batch sum
            _, predicted = torch.max(outputs, dim=1) # Class with the highest score
            correct += (predicted == labels).sum().item()
            total += batch_size

    return loss_sum / total, 100 * correct / total


# Training Loop
for epoch in range(epochs):
    model.train() # Training mode => Dropout active, BatchNorm uses batch statistics
    running_loss = 0.0 # Sum of per-sample losses over the epoch
    samples_seen = 0

    for inputs, labels in training_loader: # Loops through the minibatches from training data
        inputs, labels = _move_batch(inputs, labels, device)

        optimiser.zero_grad() # Reset gradients for each iteration
        outputs = model(inputs) # Forward pass through the model
        loss = log_loss(outputs, labels) # Mean loss of this batch
        loss.backward() # Backpropagation: Backward Pass -> Compute gradients
        optimiser.step() # Updating weights and biases

        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size # Weight by batch size: the last batch can be smaller
        samples_seen += batch_size

    average_loss = running_loss / samples_seen # Mean loss per sample over the epoch
    train_losses.append(average_loss) # Store loss to be added to graphs

    # Evaluate on TRAINING data.
    # NOTE: training_loader still applies the random augmentations of train_transform,
    # so this accuracy is measured on augmented images and can be lower than the test accuracy.
    train_accuracy = _evaluate(model, training_loader, log_loss, device)[1]
    train_accuracies.append(train_accuracy) # Append to array to be used to plot graphs

    # Evaluate on TEST data
    overall_test_loss, test_accuracy = _evaluate(model, test_loader, log_loss, device)
    test_losses.append(overall_test_loss) # Append to array for graph plotting
    test_accuracies.append(test_accuracy) # Append to array for graph plotting

    # Save the best model
    # NOTE(review): the checkpoint is selected on the test set, so best_acc is an
    # optimistic estimate; a separate validation split would avoid that.
    if test_accuracy > best_acc: # Check if current epoch has best test accuracy
        best_acc = test_accuracy # Update best accuracy
        torch.save(model.state_dict(), 'best_model.pth') # Save model's parameter to file
        print(f" New model with {best_acc:.2f}% accuracy")


    print(f"Epoch [{epoch + 1}/{epochs}] - "
          f"Train Loss: {average_loss:.4f} | "
          f"Test Loss: {overall_test_loss:.4f} | "
          f"Train Accuracy: {train_accuracy:.2f}% | "
          f"Test Accuracy: {test_accuracy:.2f}% ")

    # Step the scheduler once per epoch
    scheduler.step()


  return F.conv2d(input, weight, bias, self.stride,


 New model with 40.26% accuracy
Epoch [1/120] - Train Loss: 1.8075 | Test Loss: 1.7106 | Train Accuracy: 37.71% | Test Accuracy: 40.26% 
 New model with 47.86% accuracy
Epoch [2/120] - Train Loss: 1.4958 | Test Loss: 1.4312 | Train Accuracy: 42.89% | Test Accuracy: 47.86% 
 New model with 57.46% accuracy
Epoch [3/120] - Train Loss: 1.2939 | Test Loss: 1.1755 | Train Accuracy: 53.21% | Test Accuracy: 57.46% 
 New model with 60.54% accuracy
Epoch [4/120] - Train Loss: 1.1554 | Test Loss: 1.1772 | Train Accuracy: 58.30% | Test Accuracy: 60.54% 
Epoch [5/120] - Train Loss: 1.0595 | Test Loss: 1.1783 | Train Accuracy: 57.59% | Test Accuracy: 59.18% 
 New model with 68.99% accuracy
Epoch [6/120] - Train Loss: 0.9750 | Test Loss: 0.8902 | Train Accuracy: 67.25% | Test Accuracy: 68.99% 
Epoch [7/120] - Train Loss: 0.9194 | Test Loss: 0.9699 | Train Accuracy: 65.46% | Test Accuracy: 66.04% 
 New model with 71.61% accuracy
Epoch [8/120] - Train Loss: 0.8615 | Test Loss: 0.8102 | Train Accuracy: 

In [None]:
import matplotlib.pyplot as plt

# Loss and accuracy curves, one figure each
def _plot_history(curves, ylabel, title):
    """Draw one figure containing a line for every (values, label, colour) entry of `curves`.

    Values are plotted against 1-based epoch numbers so the x-axis matches the
    "Epoch [i/N]" numbering printed by the training loop.
    """
    fig, ax = plt.subplots(figsize=(10, 4)) # Width is 10, Height is 4
    for values, label, colour in curves:
        ax.plot(range(1, len(values) + 1), values, label=label, color=colour)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True) # Grid to assist with reading data from graph
    ax.legend()
    plt.show()


# Training and test loss
_plot_history(
    [(train_losses, 'Training Loss', 'red'),
     (test_losses, 'Test Loss', 'purple')],
    ylabel='Loss',
    title='Evolution of Loss over Epochs',
)

# Training and test accuracy (the figure shows both curves, so the title names both)
_plot_history(
    [(test_accuracies, 'Test Accuracy (%)', 'blue'),
     (train_accuracies, 'Train Accuracy (%)', 'green')],
    ylabel='Accuracy (%)',
    title='Train and Test Accuracy over Epochs',
)