# Effects of Model Complexity and Structure with Image Classification

By: Alexander Jiang, Christopher Setiabudi, Tanvir Rasul

## Model Card

Task Input: 3x32x32 Image

Task Output: Class (integer representing 1 of 10 classes)

Training Dataset: CIFAR-10

Intended Use: Classify images into one of ten classes

Risks: None we can think of

## Introduction

Using an image classification task, we analyze the performance and efficiency of two computer vision architectures under limited computational resources and time. We implement a traditional Convolutional Neural Network (CNN) and a pre-trained Vision Transformer. The dataset consists of 3x32x32 images, each with an integer label from 0 to 9 (10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck).

In [None]:
# Master switch for training: the AlexNet training loop only runs when this is True
trainingFlag = True

## Install and import libraries

In [None]:
# Import necessary packages
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Hyperparameters
num_epochs = 100  # number of full passes over the training loader
batch_size = 25   # samples per mini-batch handed to the DataLoaders
lr = 1e-3         # learning rate handed to the optimizer

## Download and prepare dataset

In [None]:
# Load the data.
# CIFAR10 yields PIL images unless a transform is given, and the DataLoader's
# default collate function cannot batch PIL images (iterating the loader would
# raise a TypeError). ToTensor() is therefore attached from the start; it only
# converts each image to a (C x H x W) float tensor in [0, 1], so the raw
# images can still be displayed unmodified.
to_tensor = transforms.ToTensor()

train = datasets.CIFAR10(root='./data', train=True, download=True, transform=to_tensor)
trainDL = DataLoader(train, batch_size=batch_size, shuffle=True)

test = datasets.CIFAR10(root='./data', train=False, download=True, transform=to_tensor)
testDL = DataLoader(test, batch_size=batch_size, shuffle=False)

In [None]:
# Define label mappings for ease of use.
# Built without a top-level loop so that no loop variable (previously `id`,
# which shadowed the builtin) is left behind in the global namespace.
class_names = trainDL.dataset.classes
id2label = dict(enumerate(class_names))  # class index -> class name
label2id = {name: idx for idx, name in id2label.items()}  # class name -> class index

In [None]:
# Preview the first image of each of the first five training batches
fig, axes = plt.subplots(nrows=1, ncols=5)

for i, (image, label) in enumerate(trainDL):
    # Only five subplot axes were created, so stop once they are all filled
    if i >= 5:
        break

    ax = axes[i]

    # Reorder the first image of the batch from channels-first to channels-last
    # (what imshow expects) and hand it to matplotlib as a NumPy array
    first_image = image[0]
    image = first_image.permute(1, 2, 0).numpy()

    # Title each panel with the class name of the image it shows
    class_id = label[0].item()
    ax.imshow(image)
    ax.set_title(id2label[class_id])
    ax.axis('off')
    print(image.shape)

plt.show()

In [None]:
# Install the preprocessing pipelines on the datasets. This happens only now,
# after the preview above, so that the displayed images are not altered.

# Training pipeline: upscale to the 227x227 network input, apply light random
# augmentation (horizontal flip, rotation of up to 10 degrees), convert to a
# tensor, then normalize every channel with mean 0.5 and std 0.5.
train_steps = [
    transforms.Resize(size=(227, 227)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5)),
]

# Evaluation pipeline: the same resize/convert/normalize steps, but without
# any random augmentation.
test_steps = [
    transforms.Resize(size=(227, 227)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5)),
]

trainDL.dataset.transform = transforms.Compose(train_steps)
testDL.dataset.transform = transforms.Compose(test_steps)

In [None]:
# Here we are implementing the AlexNet architecture according to this paper: https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf

class CIFARCNN(nn.Module):
    """AlexNet-style convolutional network with batch normalization.

    Five convolutional stages (`features`) are followed by a three-layer
    fully connected head (`classifier`). The shape comments below assume an
    input of size (batch_size x 3 x 227 x 227); the first linear layer is
    hard-wired to the resulting 256 x 6 x 6 feature map.

    Args:
        num_classes: Number of output classes (width of the final linear layer).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Input size: (batch_size x 3 x 227 x 227)

        self.features = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4),  # (batch_size x 96 x 55 x 55)
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),  # (batch_size x 96 x 27 x 27)

            nn.Conv2d(96, 256, kernel_size=5, padding=2),  # (batch_size x 256 x 27 x 27)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),  # (batch_size x 256 x 13 x 13)

            nn.Conv2d(256, 384, kernel_size=3, padding=1),  # (batch_size x 384 x 13 x 13)
            nn.BatchNorm2d(384),
            nn.ReLU(),

            nn.Conv2d(384, 384, kernel_size=3, padding=1),  # (batch_size x 384 x 13 x 13)
            nn.BatchNorm2d(384),
            nn.ReLU(),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),  # (batch_size x 256 x 13 x 13)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),  # (batch_size x 256 x 6 x 6)
        )

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(),

            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),

            nn.Linear(4096, num_classes),
        )

        # Not applied in forward(); available for converting logits into class
        # probabilities. The class axis is given explicitly because relying on
        # Softmax's implicit dimension choice is deprecated.
        self.finalActivation = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the network and return raw, un-normalized class logits.

        Args:
            x: Batch of images; expected to be (batch_size x 3 x 227 x 227) so
                that the flattened features match the first linear layer.

        Returns:
            Tensor of shape (batch_size x num_classes). No softmax is applied:
            nn.CrossEntropyLoss expects raw logits.
        """
        x = self.features(x)

        # Collapse (channels, height, width) into one feature vector per sample
        x = x.flatten(start_dim=1)

        return self.classifier(x)

In [None]:
# Set device: prefer CUDA, then Apple's Metal backend (MPS), then fall back to the CPU.
# Note the module is `torch.backends` (plural); `torch.backend` does not exist
# and would raise AttributeError on every machine without CUDA.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device}")

# Instantiate model, optimizer, and loss function
model = CIFARCNN(len(id2label)).to(device)  # one output per known class
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()  # operates on the raw logits the model returns

In [None]:
# Training Loop for AlexNet
# Each epoch makes one optimization pass over trainDL and then measures
# classification accuracy on testDL.

if trainingFlag:
    total_steps = len(trainDL)

    for epoch in range(num_epochs):
        # Training mode: dropout is active and BatchNorm uses per-batch statistics.
        # Must be re-enabled every epoch because validation switches to eval mode.
        model.train()

        for step, (X, Y) in enumerate(trainDL):
            # Move data to same device as model
            X, Y = X.to(device), Y.to(device)

            # Predict and calculate loss
            Y_pred = model(X)
            loss = loss_fn(Y_pred, Y)

            # Perform backpropagation
            optimizer.zero_grad()  # Gradients accumulate by default; clear the previous step's before computing new ones
            loss.backward()
            optimizer.step()

            # Print progress
            if step % 100 == 0:
                loss_val = loss.item()
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step}/{total_steps}], Loss: {loss_val:.5f}")

        # Validation
        # Eval mode: disables dropout and makes BatchNorm use its running statistics,
        # so predictions are deterministic and test data does not alter the model.
        model.eval()

        with torch.no_grad():
            correct, total = 0, 0

            for X, Y in testDL:
                X, Y = X.to(device), Y.to(device)

                Y_pred = model(X)

                # Predicted class = index of the largest logit in each row
                predicted = Y_pred.argmax(dim=1)

                correct += (predicted == Y).sum().item()
                total += Y.shape[0]

                # Drop references to this batch's tensors before loading the next one
                del X, Y, Y_pred

            print(f"Accuracy: {correct}/{total} ({correct/total * 100:.3f}%)")