In [34]:
import torch
from torch import nn
import matplotlib.pyplot as plt
import torchvision
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import torch.nn.functional as F

In [2]:
train_data = datasets.CIFAR10(
  root="datasets",
  train=True,
  download=True,
  transform=torchvision.transforms.ToTensor(),
  target_transform=None
)
test_data = datasets.CIFAR10(
  root="datasets",
  train=False,
  download=True,
  transform=torchvision.transforms.ToTensor(),
  target_transform=None
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to datasets/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:11<00:00, 14489111.96it/s]


Extracting datasets/cifar-10-python.tar.gz to datasets
Files already downloaded and verified


In [3]:
batch_size = 32
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [4]:
def acc(y_pred, y_true):
  if len(y_pred) != len(y_true):
    print("Error: y_pred and y_true have different lengths. Aborting")
  return torch.eq(y_pred, y_true).sum().item() / len(y_pred)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Conv Model

In [12]:
class ConvModelV1(nn.Module):
  def __init__(self):
    super().__init__()
    self.hidden_units = 10
    self.classes = 10

    self.conv_block = nn.Sequential(
      nn.Conv2d(in_channels=3, out_channels=self.hidden_units, kernel_size=(3,3), stride=1, padding=1),
      nn.ReLU(),
      nn.Conv2d(self.hidden_units, self.hidden_units, (3, 3), stride=1, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=(2,2)),
    )


    self.classifier = nn.Sequential(
      nn.Flatten(),
      nn.Linear(2560, self.classes)
    )


  def forward(self, x):
    return self.classifier(
        self.conv_block(x)
    )


model = ConvModelV1().to(device)
print(model)
print(next(model.parameters()).device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

ConvModelV1(
  (conv_block): Sequential(
    (0): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=2560, out_features=10, bias=True)
  )
)
cuda:0


In [None]:
with torch.inference_mode():
  a = model(next(iter(train_dataloader))[0])
  print(a)

torch.Size([32, 10, 16, 16])
tensor([[ 0.2217, -0.0914, -0.1704,  0.1009,  0.1103,  0.1113,  0.0022, -0.1539,
          0.0413, -0.1189],
        [ 0.2158, -0.1384, -0.1459,  0.0386,  0.0530,  0.0665,  0.0091, -0.1206,
          0.0588, -0.1206],
        [ 0.2822, -0.1378, -0.2089,  0.0933,  0.2186,  0.1051,  0.0212, -0.1931,
          0.0644, -0.1674],
        [ 0.2392, -0.1301, -0.2212,  0.1034,  0.0957,  0.1186,  0.0031, -0.1226,
          0.0542, -0.1736],
        [ 0.3098, -0.0505, -0.2233,  0.0073,  0.1092,  0.0408,  0.0818, -0.1490,
          0.0712, -0.0911],
        [ 0.2597, -0.1017, -0.1593,  0.1160,  0.1165,  0.1186,  0.0376, -0.1650,
          0.0753, -0.1177],
        [ 0.3667, -0.1505, -0.1988,  0.1253,  0.1978,  0.1187,  0.0439, -0.2342,
          0.0702, -0.1779],
        [ 0.3079, -0.1046, -0.1436,  0.0490,  0.0944,  0.0854,  0.0445, -0.2130,
          0.0274, -0.0821],
        [ 0.3239, -0.0933, -0.1703,  0.1353,  0.1101,  0.1185, -0.0048, -0.1624,
          0.0529, 

In [13]:
model.train()
epochs = 3
for epoch in tqdm(range(epochs)):
  print(f"epoch {epoch} ---")
  epoch_loss = 0

  for i, (image, label) in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):

    logits = model(image.to(device))

    loss = loss_fn(logits, label.to(device))

    epoch_loss += loss
    if i % 300 == 0:
      print(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print("---")
  print("epoch loss: ", epoch_loss.item() / len(train_dataloader))


model.eval();

  0%|          | 0/3 [00:00<?, ?it/s]

epoch 0 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

2.3020846843719482
1.826447606086731
1.6687887907028198
1.6600818634033203
1.4306728839874268
1.0920125246047974
---
epoch loss:  1.5251019673704416
epoch 1 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.2762987613677979
1.3685530424118042
1.2887663841247559
1.0882114171981812
1.3721423149108887
0.8839101195335388
---
epoch loss:  1.2130511205164347
epoch 2 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.1497267484664917
1.5501399040222168
1.0183708667755127
0.9944034218788147
1.0090738534927368
1.3511391878128052
---
epoch loss:  1.0977076398052623


In [14]:
with torch.inference_mode():
  avg_acc = 0
  avg_loss = 0
  for i, (image, label) in enumerate(tqdm(test_dataloader)):
    label = label.to(device)

    logits = model(image.to(device))
    loss = loss_fn(logits, label)
    accuracy = acc(torch.argmax(logits, dim=1), label)

    avg_loss += loss
    avg_acc += accuracy
    if i % 30 == 0:
      print(f"{i:5} | {accuracy*100:7.2f}% | {loss.item():.5f}")


  print(avg_acc / len(test_dataloader)*100, avg_loss.item() / len(test_dataloader))

  0%|          | 0/313 [00:00<?, ?it/s]

    0 |   68.75% | 0.83777
   30 |   81.25% | 0.86547
   60 |   50.00% | 1.33944
   90 |   68.75% | 1.12451
  120 |   62.50% | 1.08735
  150 |   62.50% | 1.12434
  180 |   78.12% | 0.69164
  210 |   62.50% | 1.07089
  240 |   71.88% | 0.89892
  270 |   53.12% | 1.30401
  300 |   56.25% | 1.27409
61.2020766773163 1.1120626918804912


## Bigger conv model

In [7]:
class ConvModelV2(nn.Module):
  def __init__(self):
    super().__init__()
    self.hidden_units = 20
    self.classes = 10

    self.conv_block = nn.Sequential(
      nn.Conv2d(in_channels=3, out_channels=self.hidden_units, kernel_size=(2,2), stride=1, padding=0),
      nn.ReLU(),
      nn.Conv2d(self.hidden_units, self.hidden_units, (2, 2), stride=1, padding=0),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=(2,2)),
    )

    self.conv_block2 = nn.Sequential(
      nn.Conv2d(in_channels=self.hidden_units, out_channels=self.hidden_units, kernel_size=(2,2), stride=1, padding=0),
      nn.ReLU(),
      nn.Conv2d(self.hidden_units, self.hidden_units, (2, 2), stride=1, padding=0),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=(2,2)),
    )

    self.classifier = nn.Sequential(
      nn.Flatten(),
      nn.Linear(720, self.classes)
    )


  def forward(self, x):
    return self.classifier(
      self.conv_block2(
        self.conv_block(x)
      )
    )


model = ConvModelV2().to(device)
print(model)
print(next(model.parameters()).device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

ConvModelV2(
  (conv_block): Sequential(
    (0): Conv2d(3, 20, kernel_size=(2, 2), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 20, kernel_size=(2, 2), stride=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block2): Sequential(
    (0): Conv2d(20, 20, kernel_size=(2, 2), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 20, kernel_size=(2, 2), stride=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=720, out_features=10, bias=True)
  )
)
cuda:0


In [None]:
with torch.inference_mode():
  a = model(next(iter(train_dataloader))[0])
  print(a)

tensor([[-0.0175, -0.0654, -0.0197,  0.0054, -0.0372,  0.0137,  0.0453,  0.0593,
         -0.0132, -0.0047],
        [-0.0190, -0.0732, -0.0171,  0.0015, -0.0410,  0.0132,  0.0459,  0.0627,
         -0.0118,  0.0031],
        [-0.0252, -0.0764, -0.0201, -0.0048, -0.0403,  0.0138,  0.0483,  0.0593,
         -0.0030, -0.0018],
        [-0.0265, -0.0678, -0.0248,  0.0007, -0.0435,  0.0078,  0.0362,  0.0609,
         -0.0043, -0.0015],
        [-0.0209, -0.0686, -0.0290,  0.0137, -0.0435,  0.0003,  0.0463,  0.0531,
         -0.0165,  0.0077],
        [-0.0235, -0.0721, -0.0236,  0.0009, -0.0412,  0.0124,  0.0449,  0.0598,
         -0.0129,  0.0072],
        [-0.0117, -0.0704, -0.0232,  0.0031, -0.0511,  0.0069,  0.0476,  0.0650,
         -0.0105,  0.0099],
        [-0.0177, -0.0654, -0.0215,  0.0097, -0.0431,  0.0037,  0.0503,  0.0615,
         -0.0153,  0.0033],
        [-0.0167, -0.0658, -0.0286,  0.0046, -0.0461,  0.0066,  0.0470,  0.0573,
         -0.0079,  0.0027],
        [-0.0227, -

In [10]:
model.train()
epochs = 3
for epoch in tqdm(range(epochs)):
  print(f"epoch {epoch} ---")
  epoch_loss = 0

  for i, (image, label) in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):


    logits = model(image.to(device))

    loss = loss_fn(logits, label.to(device))

    epoch_loss += loss
    if i % 300 == 0:
      print(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print("---")
  print("epoch loss: ", epoch_loss.item() / len(train_dataloader))


model.eval();

  0%|          | 0/3 [00:00<?, ?it/s]

epoch 0 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

2.303969383239746
2.1157214641571045
1.4512872695922852
1.7039430141448975
1.364967942237854
1.7846678495407104
---
epoch loss:  1.692078847268874
epoch 1 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.6020017862319946
1.194014072418213
1.3029109239578247
1.0240739583969116
1.2562499046325684
1.4426417350769043
---
epoch loss:  1.3900083223368522
epoch 2 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

0.8687392473220825
1.3651032447814941
1.0175942182540894
1.4854764938354492
1.2377029657363892
1.3382673263549805
---
epoch loss:  1.2920742706084054


In [11]:
with torch.inference_mode():
  avg_acc = 0
  avg_loss = 0
  for i, (image, label) in enumerate(tqdm(test_dataloader)):
    label = label.to(device)
    logits = model(image.to(device))
    loss = loss_fn(logits, label)
    accuracy = acc(torch.argmax(logits, dim=1), label)

    avg_loss += loss
    avg_acc += accuracy
    if i % 30 == 0:
      print(f"{i:5} | {accuracy*100:7.2f}% | {loss.item():.5f}")


  print(avg_acc / len(test_dataloader)*100, avg_loss.item() / len(test_dataloader))

  0%|          | 0/313 [00:00<?, ?it/s]

    0 |   56.25% | 1.03244
   30 |   62.50% | 1.09159
   60 |   40.62% | 1.38521
   90 |   56.25% | 1.28236
  120 |   56.25% | 1.13706
  150 |   65.62% | 1.28469
  180 |   68.75% | 1.00565
  210 |   59.38% | 1.28600
  240 |   65.62% | 1.03316
  270 |   50.00% | 1.71573
  300 |   56.25% | 1.16833
53.09504792332268 1.3179007338258786


## Architecture similar to LeNet-5

In [37]:
class LikeLeNet5Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.hidden_units_one = 6
    self.hidden_units_two = 16
    self.classes = 10

    self.conv_block = nn.Sequential(
      nn.Conv2d(in_channels=3, out_channels=self.hidden_units_one, kernel_size=(4,4), stride=1, padding=0),
      nn.Sigmoid(),
      nn.MaxPool2d(kernel_size=(2,2)),
      nn.Conv2d(self.hidden_units_one, self.hidden_units_two, (4, 4), stride=1, padding=1),
      nn.Sigmoid(),
      nn.MaxPool2d(kernel_size=(2,2)),
    )

    self.classifier = nn.Sequential(
      nn.Flatten(),
      nn.Linear(self.hidden_units_two * 6 * 6, 120),
      nn.Sigmoid(),
      nn.Linear(120, 84),
      nn.Sigmoid(),
      nn.Linear(84, self.classes),
    )


  def forward(self, x):
    return self.classifier(
      self.conv_block(x)
    )


model = LikeLeNet5Model().to(device)
print(model)
print(next(model.parameters()).device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

LeNet5(
  (conv_block): Sequential(
    (0): Conv2d(3, 6, kernel_size=(4, 4), stride=(1, 1))
    (1): Sigmoid()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 16, kernel_size=(4, 4), stride=(1, 1), padding=(1, 1))
    (4): Sigmoid()
    (5): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=576, out_features=120, bias=True)
    (2): Sigmoid()
    (3): Linear(in_features=120, out_features=84, bias=True)
    (4): Sigmoid()
    (5): Linear(in_features=84, out_features=10, bias=True)
  )
)
cuda:0


In [31]:
with torch.inference_mode():
  a = model(next(iter(train_dataloader))[0].to(device))

In [38]:
model.train()
epochs = 3
for epoch in tqdm(range(epochs)):
  print(f"epoch {epoch} ---")
  epoch_loss = 0

  for i, (image, label) in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):

    logits = model(image.to(device))

    loss = loss_fn(logits, label.to(device))

    epoch_loss += loss
    if i % 300 == 0:
      print(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print("---")
  print("epoch loss: ", epoch_loss.item() / len(train_dataloader))


model.eval();

  0%|          | 0/3 [00:00<?, ?it/s]

epoch 0 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

2.2821221351623535
2.293759346008301
2.103975534439087
1.9150667190551758
2.0252902507781982
1.9091668128967285
---
epoch loss:  2.0779852009856845
epoch 1 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.9403049945831299
2.0616042613983154
2.0246095657348633
1.8236210346221924
1.9843543767929077
1.6893304586410522
---
epoch loss:  1.8939314419385798
epoch 2 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.4997074604034424
1.888482928276062
1.6562221050262451
1.9387191534042358
1.6977989673614502
1.778357744216919
---
epoch loss:  1.7925012433521272


In [39]:
with torch.inference_mode():
  avg_acc = 0
  avg_loss = 0
  for i, (image, label) in enumerate(tqdm(test_dataloader)):
    label = label.to(device)
    logits = model(image.to(device))
    loss = loss_fn(logits, label)
    accuracy = acc(torch.argmax(logits, dim=1), label)

    avg_loss += loss
    avg_acc += accuracy
    if i % 30 == 0:
      print(f"{i:5} | {accuracy*100:7.2f}% | {loss.item():.5f}")


  print(avg_acc / len(test_dataloader)*100, avg_loss.item() / len(test_dataloader))

  0%|          | 0/313 [00:00<?, ?it/s]

    0 |   50.00% | 1.48474
   30 |   50.00% | 1.39480
   60 |   31.25% | 1.84966
   90 |   31.25% | 1.83691
  120 |   34.38% | 1.67803
  150 |   40.62% | 1.63219
  180 |   62.50% | 1.43763
  210 |   43.75% | 1.71195
  240 |   50.00% | 1.37172
  270 |   37.50% | 1.77401
  300 |   46.88% | 1.50646
37.00079872204473 1.7134465714232228


In [40]:
class AnotherLeNet5LikeModel(nn.Module):
  def __init__(self):
    super().__init__()
      self.conv_block = nn.Sequential(
        nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, stride=1, padding=0),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
        nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
      )

      self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(16*5*5, 120),
        nn.ReLU(),
        nn.Linear(120, 84),
        nn.ReLU(),
        nn.Linear(84, 10),
      )


  def forward(self, x):
    return self.classifier(
      self.conv_block(x)
    )

model = AnotherLeNet5LikeModel().to(device)
print(model)
print(next(model.parameters()).device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters())

AnotherLeNet5LikeModel(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool1): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (pool2): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
cuda:0


In [41]:
model.train()
epochs = 3
for epoch in tqdm(range(epochs)):
  print(f"epoch {epoch} ---")
  epoch_loss = 0

  for i, (image, label) in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):

    logits = model(image.to(device))

    loss = loss_fn(logits, label.to(device))

    epoch_loss += loss
    if i % 300 == 0:
      print(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print("---")
  print("epoch loss: ", epoch_loss.item() / len(train_dataloader))


model.eval();

  0%|          | 0/3 [00:00<?, ?it/s]

epoch 0 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

2.3195013999938965
1.9647594690322876
1.4856141805648804
1.8416186571121216
1.584322214126587
1.5057258605957031
---
epoch loss:  1.777040097168906
epoch 1 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.3409059047698975
1.4685232639312744
1.437370777130127
1.444777250289917
1.3102904558181763
1.5581027269363403
---
epoch loss:  1.497771181971769
epoch 2 ---


  0%|          | 0/1563 [00:00<?, ?it/s]

1.5023056268692017
1.059338927268982
1.1642560958862305
1.5081483125686646
1.2046085596084595
1.3792827129364014
---
epoch loss:  1.3944757989943217


In [42]:
with torch.inference_mode():
  avg_acc = 0
  avg_loss = 0
  for i, (image, label) in enumerate(tqdm(test_dataloader)):
    label = label.to(device)
    logits = model(image.to(device))
    loss = loss_fn(logits, label)
    accuracy = acc(torch.argmax(logits, dim=1), label)

    avg_loss += loss
    avg_acc += accuracy
    if i % 30 == 0:
      print(f"{i:5} | {accuracy*100:7.2f}% | {loss.item():.5f}")


  print(avg_acc / len(test_dataloader)*100, avg_loss.item() / len(test_dataloader))

  0%|          | 0/313 [00:00<?, ?it/s]

    0 |   59.38% | 1.14196
   30 |   71.88% | 1.08590
   60 |   31.25% | 1.46343
   90 |   56.25% | 1.43438
  120 |   40.62% | 1.33855
  150 |   56.25% | 1.40860
  180 |   75.00% | 0.98232
  210 |   56.25% | 1.37613
  240 |   65.62% | 0.88519
  270 |   43.75% | 1.68513
  300 |   50.00% | 1.36036
51.06829073482429 1.348482662115615
