# Import Libraries

In [1]:
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

# Transformation

We convert the image into a PyTorch tensor and scale pixel values to the range [0.0, 1.0] by dividing by 255.

We then normalize the tensor image with mean and standard deviation per channel (Red, Green, Blue) using the formula:

$$
\text{normalized_pixel} = \frac{(\text{pixel} - \mu)}{\sigma}
$$
with $\mu = \sigma = 0.5$ for every channel.
This scales the pixel values from $[0, 1]$ to $[-1, 1]$.

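**Why normalize?**
Neural networks train faster and more stably when the input data is centered near zero with a similar scale in every channel (ideally mean ≈ 0, std ≈ 1); mapping each channel to $[-1, 1]$ is a simple approximation of that.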


In [2]:
# Preprocessing applied to every CIFAR-10 image:
#   1. ToTensor   -> float tensor with values in [0, 1]
#   2. Normalize  -> (x - 0.5) / 0.5 per channel, i.e. values in [-1, 1]
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Get Dataset

In [3]:
# Download CIFAR-10 into ./data (if not already present) and apply the same
# preprocessing pipeline to both the training and the test split.
train_data = torchvision.datasets.CIFAR10(
    root='./data', train=True, transform=transform, download=True)
test_data = torchvision.datasets.CIFAR10(
    root='./data', train=False, transform=transform, download=True)

# Mini-batches of 32 images loaded by 2 worker processes.
# Only the training batches are shuffled: shuffling helps SGD, whereas the
# evaluation metric does not depend on order, so the test loader keeps a
# fixed order to make per-sample inspection reproducible.
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=32, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=32, shuffle=False, num_workers=2)

100%|██████████| 170M/170M [00:14<00:00, 11.9MB/s]


**Labels**

In [4]:
# Human-readable names for the 10 CIFAR-10 label indices (index 0 is the airplane class).
class_names = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Model

**Convolution Output Size Formula**

Given:
- **N** = input size (height or width)
- **K** = kernel size
- **P** = padding
- **S** = stride

The output size (per dimension) is:

$$
\text{Output size} = \left\lfloor \frac{N + 2P - K}{S} \right\rfloor + 1
$$

---



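This formula gives the spatial size of the output after a convolution. Padding (`P`) controls how far the input is extended at its borders. With `S = 1` and an odd kernel size `K`, choosing `P = (K - 1) // 2` preserves the input size. The network below uses no padding (`P = 0`), so each 5×5 convolution shrinks every spatial dimension by 4 (e.g. 32 → 28).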


In [5]:
class NeuralNet(nn.Module):
  """Small CNN: two conv + max-pool stages followed by three linear layers.

  Shape comments assume an input of shape (3, 32, 32) per image; the final
  layer produces 10 raw scores (logits) per image.
  """

  def __init__(self):
    super().__init__()
    # Convolutional stages (no padding, stride 1).
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=12, kernel_size=5)   # (3, 32, 32)  -> (12, 28, 28)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)                       # halves height and width; used after each conv
    self.conv2 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=5)  # (12, 14, 14) -> (24, 10, 10), pooled to (24, 5, 5)

    # Fully connected head; its input is the flattened (24, 5, 5) feature map.
    self.fc1 = nn.Linear(in_features=24 * 5 * 5, out_features=120)
    self.fc2 = nn.Linear(in_features=120, out_features=84)
    self.fc3 = nn.Linear(in_features=84, out_features=10)

  def forward(self, x):
    """Return the 10 unnormalised class scores for a batch of images `x`."""
    # conv -> ReLU -> pool, twice
    for conv in (self.conv1, self.conv2):
      x = self.pool(F.relu(conv(x)))

    # Keep the batch dimension, flatten everything else.
    x = torch.flatten(x, start_dim=1)

    # Two hidden layers with ReLU, then a linear output layer (no activation).
    for hidden in (self.fc1, self.fc2):
      x = F.relu(hidden(x))

    return self.fc3(x)

# Train Model

In [6]:
# Create the network instance that the training loop below optimises.
net = NeuralNet()

**SGD and Momentum (Optimizer)**

**SGD (Stochastic Gradient Descent)** updates model parameters using gradients to minimize the loss:

$$
\theta \leftarrow \theta - \eta \cdot \nabla L(\theta)
$$

- η (learning rate), e.g., `lr=0.001`

---

**Momentum** helps speed up SGD and smooth updates by remembering past gradients:

$$
v_t = \mu \cdot v_{t-1} - \eta \cdot \nabla L(\theta)
$$
$$
\theta \leftarrow \theta + v_t
$$

- μ (momentum factor), e.g., `momentum=0.9`



In [7]:
# Cross-entropy loss on the network's raw class scores, minimised with
# SGD using a learning rate of 0.001 and a momentum factor of 0.9.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    params=net.parameters(),
    lr=0.001,
    momentum=0.9,
)

In [8]:
epoch_num = 20  # number of full passes over the training loader

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Move the model's parameters to the chosen device (last expression: shows the module summary).
net.to(device)

NeuralNet(
  (conv1): Conv2d(3, 12, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(12, 24, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=600, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [9]:
for epoch in range(epoch_num):
  print(f'Training Epoch number {epoch}')

  # Training mode, and a fresh loss accumulator for this epoch.
  net.train()
  running_loss = 0.0

  for batch in tqdm(train_loader, desc='Training loop'):
    # Each batch is an (images, labels) pair; move both to the model's device.
    images, labels = (tensor.to(device) for tensor in batch)

    # Gradients accumulate across backward() calls, so reset them first.
    optimizer.zero_grad()

    # Forward pass -> loss -> backward pass -> parameter update.
    outputs = net(images)
    loss = loss_function(outputs, labels)
    loss.backward()
    optimizer.step()

    # .item() extracts the Python float of this batch's loss.
    running_loss += loss.item()

  # Report the mean per-batch loss of the epoch.
  print(f'Loss: {running_loss / len(train_loader):.4f}')

Training Epoch number 0


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 2.1903
Training Epoch number 1


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.7785
Training Epoch number 2


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.5231
Training Epoch number 3


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.3875
Training Epoch number 4


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.2838
Training Epoch number 5


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.1954
Training Epoch number 6


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.1279
Training Epoch number 7


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.0739
Training Epoch number 8


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 1.0229
Training Epoch number 9


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.9811
Training Epoch number 10


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.9385
Training Epoch number 11


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.9078
Training Epoch number 12


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.8708
Training Epoch number 13


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.8420
Training Epoch number 14


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.8123
Training Epoch number 15


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.7835
Training Epoch number 16


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.7555
Training Epoch number 17


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.7278
Training Epoch number 18


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.7038
Training Epoch number 19


Training loop:   0%|          | 0/1563 [00:00<?, ?it/s]

Loss: 0.6795


**Saving the Model**

In [10]:
# Write the trained network's state_dict to 'model.pth' in the working directory.
torch.save(net.state_dict(), 'model.pth')

# Evaluating

In [11]:
# Build a fresh (CPU) instance of the architecture and restore the saved weights into it.
model = NeuralNet()
# map_location='cpu': the checkpoint was saved from `net` after net.to(device), so it
# may contain CUDA tensors. Remapping to the CPU lets the file load on machines without
# a GPU and matches the evaluation below, which feeds CPU tensors to `model`.
model.load_state_dict(torch.load('model.pth', map_location='cpu'))

<All keys matched successfully>

In [12]:
model.eval()  # switch the model to evaluation mode

correct = 0  # number of test images classified correctly
total = 0    # number of test images seen

# No gradients are needed for evaluation.
with torch.no_grad():
  for images, labels in test_loader:
    outputs = model(images)

    # Index of the highest score along the class dimension = predicted label.
    predicted = torch.max(outputs, 1).indices

    total += labels.shape[0]
    correct += predicted.eq(labels).sum().item()

accuracy = 100 * correct / total

print(f'Accuracy is: {accuracy} %')

Accuracy is: 68.43 %


# Test

In [16]:
# Preprocessing for arbitrary images: resize to the 32x32 input the network
# was trained on, then apply the same tensor conversion and normalisation
# as the dataset pipeline.
new_transform = transforms.Compose(
    [
        transforms.Resize(size=(32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

In [17]:
def loadImage(image_path):
  """Load an image file and prepare it as a single-image batch for the model.

  Args:
    image_path: path (or file object) accepted by ``PIL.Image.open``.

  Returns:
    The tensor produced by ``new_transform`` with a batch dimension added at
    position 0, i.e. shape (1, 3, 32, 32).
  """
  # The context manager closes the underlying file once the pixels are read.
  # convert('RGB') guarantees exactly three channels, so grayscale, palette or
  # RGBA files still match the 3-channel Normalize step and the first conv layer.
  with Image.open(image_path) as img:
    rgb_img = img.convert('RGB')

  tensor = new_transform(rgb_img)
  return tensor.unsqueeze(0)  # add the batch dimension expected by the model

In [18]:
# Files to classify; each one becomes a single-image batch tensor via loadImage.
img_paths = ['plain.jpg']
images = list(map(loadImage, img_paths))

In [26]:
model.eval()  # evaluation mode for inference

# Classify every loaded image without tracking gradients.
with torch.no_grad():
  for img in images:
    outputs = model(img)

    # Position of the largest score along the class dimension.
    predicted = outputs.max(dim=1).indices

    print(f'Prediction: {class_names[predicted.item()]}')

Prediction: plain
