# Assignment 1 - Code Example - Part A

This code baseline is inspired by and modified from [this great tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

This code can achieve an accuracy of approximately 86.50% on CIFAR-10. Please set up the environment and run your experiments starting from this baseline. You are expected to achieve an accuracy higher than this baseline.

In [1]:
# import some necessary packages
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms

Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
if torch.cuda.is_available():
    print("CUDA Version:", torch.version.cuda)
    print("CUDA Device:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

print(f"PyTorch Version: {torch.__version__}")
import torchvision
print(f"torchvision Version: {torchvision.__version__}")

CUDA Version: 12.1
CUDA Device: Tesla T4
PyTorch Version: 2.5.1+cu121
torchvision Version: 0.20.1+cu121


## 1. Baseline Model
### 1.1 Settings

In [3]:
# prepare datasets
# preprocessing pipeline for input images
transformation = dict()
for data_type in ("train", "test"):
    is_train = data_type=="train"
    transformation[data_type] = tv_transforms.Compose(([
        tv_transforms.RandomRotation(degrees=15),
        tv_transforms.RandomHorizontalFlip(),
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ] if is_train else []) + 
    [
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
    
num_workers = 2
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type=="train"
    dataset[data_type] = tv_datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=transformation[data_type],
    )
    loader[data_type] = torch.utils.data.DataLoader(
        dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
    )


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:03<00:00, 48.5MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [2]:
# some experimental setup
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_epochs = 128
batch_size = 64
print_every = 200

optim_name = "Adam"
optim_kwargs = dict(
    lr=3e-4,
    weight_decay=1e-6,
)

In [4]:
# our network architecture
net = nn.Sequential(
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 10),
)

# move to device
net.to(device)

# print the number of parameters
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

number of parameters: 7.28M


### 1.2 Start Training

In [5]:
# the network optimizer
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# loss function
criterion = nn.CrossEntropyLoss()

# training loop
net.train()
for epoch in range(num_epochs):

    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)

        pred = net(img)
        loss = criterion(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0

print("Finished Training")

[epoch=  1, iter=  200] loss: 2.196
[epoch=  1, iter=  400] loss: 1.976
[epoch=  1, iter=  600] loss: 1.884
[epoch=  2, iter=  200] loss: 1.684
[epoch=  2, iter=  400] loss: 1.612
[epoch=  2, iter=  600] loss: 1.545
[epoch=  3, iter=  200] loss: 1.445
[epoch=  3, iter=  400] loss: 1.403
[epoch=  3, iter=  600] loss: 1.363
[epoch=  4, iter=  200] loss: 1.293
[epoch=  4, iter=  400] loss: 1.256
[epoch=  4, iter=  600] loss: 1.248
[epoch=  5, iter=  200] loss: 1.178
[epoch=  5, iter=  400] loss: 1.156
[epoch=  5, iter=  600] loss: 1.149
[epoch=  6, iter=  200] loss: 1.075
[epoch=  6, iter=  400] loss: 1.081
[epoch=  6, iter=  600] loss: 1.094
[epoch=  7, iter=  200] loss: 1.006
[epoch=  7, iter=  400] loss: 1.005
[epoch=  7, iter=  600] loss: 1.000
[epoch=  8, iter=  200] loss: 0.973
[epoch=  8, iter=  400] loss: 0.963
[epoch=  8, iter=  600] loss: 0.931
[epoch=  9, iter=  200] loss: 0.907
[epoch=  9, iter=  400] loss: 0.891
[epoch=  9, iter=  600] loss: 0.905
[epoch= 10, iter=  200] loss

### 1.3 Evaluating its accuracy

In [6]:
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["test"]:
        img, target = img.to(device), target.to(device)
        
        # make prediction
        pred = net(img)
        
        # accumulate
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

Accuracy of the network on the 10000 test images: 86.97%


## 2. Variant 1
Changing the NN architecture.

In [19]:
# prepare datasets
# preprocessing pipeline for input images
transformation = dict()
for data_type in ("train", "test"):
    is_train = data_type=="train"
    transformation[data_type] = tv_transforms.Compose(([
        tv_transforms.RandomRotation(degrees=15),
        tv_transforms.RandomHorizontalFlip(),
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ] if is_train else []) + 
    [
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
    
num_workers = 2
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type=="train"
    dataset[data_type] = tv_datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=transformation[data_type],
    )
    loader[data_type] = torch.utils.data.DataLoader(
        dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
    )


Files already downloaded and verified
Files already downloaded and verified


In [20]:
# some experimental setup
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_epochs = 128
batch_size = 64
print_every = 200

optim_name = "Adam"
optim_kwargs = dict(
    lr=3e-4,
    weight_decay=1e-6,
)

In [21]:
# our network architecture
# 定义基本的残差块
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != self.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, self.expansion * out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * out_channels)
            )

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = self.relu(out)
        return out

# 定义ResNet34模型
class ResNet34(nn.Module):
    def __init__(self, num_classes=1000):
        super(ResNet34, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(64, 3, stride=1)
        self.layer2 = self._make_layer(128, 4, stride=2)
        self.layer3 = self._make_layer(256, 6, stride=2)
        self.layer4 = self._make_layer(512, 3, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * BasicBlock.expansion, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(BasicBlock(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * BasicBlock.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out


net = ResNet34(num_classes=10)

# move to device
net.to(device)

# print the number of parameters
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

number of parameters: 21.29M


In [None]:
# the network optimizer
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# loss function
criterion = nn.CrossEntropyLoss()

# training loop
net.train()
for epoch in range(num_epochs):

    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)

        pred = net(img)
        loss = criterion(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0

print("Finished Training")

[epoch=  1, iter=  200] loss: 1.871
[epoch=  1, iter=  400] loss: 1.577
[epoch=  1, iter=  600] loss: 1.471
[epoch=  2, iter=  200] loss: 1.320
[epoch=  2, iter=  400] loss: 1.275
[epoch=  2, iter=  600] loss: 1.216
[epoch=  3, iter=  200] loss: 1.145
[epoch=  3, iter=  400] loss: 1.115
[epoch=  3, iter=  600] loss: 1.105
[epoch=  4, iter=  200] loss: 1.019
[epoch=  4, iter=  400] loss: 1.037
[epoch=  4, iter=  600] loss: 0.998
[epoch=  5, iter=  200] loss: 0.950
[epoch=  5, iter=  400] loss: 0.956
[epoch=  5, iter=  600] loss: 0.948
[epoch=  6, iter=  200] loss: 0.892
[epoch=  6, iter=  400] loss: 0.888
[epoch=  6, iter=  600] loss: 0.873
[epoch=  7, iter=  200] loss: 0.837
[epoch=  7, iter=  400] loss: 0.829
[epoch=  7, iter=  600] loss: 0.829
[epoch=  8, iter=  200] loss: 0.806
[epoch=  8, iter=  400] loss: 0.801
[epoch=  8, iter=  600] loss: 0.783
[epoch=  9, iter=  200] loss: 0.760
[epoch=  9, iter=  400] loss: 0.764
[epoch=  9, iter=  600] loss: 0.762
[epoch= 10, iter=  200] loss

In [None]:
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["test"]:
        img, target = img.to(device), target.to(device)
        
        # make prediction
        pred = net(img)
        
        # accumulate
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

## 3. Variant 2
Changing the optimizer and learning rate scheduler.

In [3]:
# prepare datasets
# preprocessing pipeline for input images
transformation = dict()
for data_type in ("train", "test"):
    is_train = data_type=="train"
    transformation[data_type] = tv_transforms.Compose(([
        tv_transforms.RandomRotation(degrees=15),
        tv_transforms.RandomHorizontalFlip(),
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ] if is_train else []) + 
    [
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
    
num_workers = 2
batch_size = 64
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type=="train"
    dataset[data_type] = tv_datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=transformation[data_type],
    )
    loader[data_type] = torch.utils.data.DataLoader(
        dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
    )


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 29.7MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [14]:
# some experimental setup
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_epochs = 128
print_every = 200

lr = 3e-4
optim_name = "AdamW"
optim_kwargs = dict(
    lr=lr,
    weight_decay=0.01, # 1e-6 试试0.1/0.01
)

### Results
AdamW lr=3e-4,weight_decay=0.01 scheduler = lr_scheduler.StepLR(optimizer, step_size=15,gamma=0.1) last_loss = 0.464 acc = 83.82%

AdamW lr=3e-4,weight_decay=0.01
constant + cooldown last_loss = 1.015 acc = 68.65%


In [15]:
# our network architecture
net = nn.Sequential(
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 10),
)

# move to device
net.to(device)

# print the number of parameters
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

number of parameters: 7.28M


In [16]:
# the network optimizer
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# loss function
criterion = nn.CrossEntropyLoss()

# the network lr_scheduler
# scheduler = lr_scheduler.StepLR(optimizer, step_size=15,gamma=0.1)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=num_epochs) #或者20？
# constant + cooldown
def update_learn_rate(optimizer, alpha):
    for param_group in optimizer.param_groups:
        param_group['lr'] = alpha

In [17]:
# training loop
net.train()
for epoch in range(num_epochs):
    
    running_loss = 0.0
    alpha = lr
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        # 调整学习率
        if epoch <= 30 and (epoch + 1) % 5 == 0:
            alpha *= 0.98
            update_learn_rate(optimizer, alpha)  # 更新学习率
        elif epoch > 30 and epoch <= 70 and (epoch + 1) % 5 == 0:
            alpha *= 0.95
            update_learn_rate(optimizer, alpha)
        elif epoch > 70 and epoch <= 100 and (epoch + 1) % 5 == 0:
            alpha *= 0.925
            update_learn_rate(optimizer, alpha)
        elif epoch > 100 and (epoch + 1) % 5 == 0:
            alpha *= 0.5
            update_learn_rate(optimizer, alpha)

        pred = net(img)
        loss = criterion(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
    scheduler.step()
print("Finished Training")

[epoch=  1, iter=  200] loss: 2.236
[epoch=  1, iter=  400] loss: 2.015
[epoch=  1, iter=  600] loss: 1.858
[epoch=  2, iter=  200] loss: 1.672
[epoch=  2, iter=  400] loss: 1.597
[epoch=  2, iter=  600] loss: 1.556
[epoch=  3, iter=  200] loss: 1.446
[epoch=  3, iter=  400] loss: 1.410
[epoch=  3, iter=  600] loss: 1.377
[epoch=  4, iter=  200] loss: 1.300
[epoch=  4, iter=  400] loss: 1.266
[epoch=  4, iter=  600] loss: 1.227
[epoch=  5, iter=  200] loss: 1.118
[epoch=  5, iter=  400] loss: 1.119
[epoch=  5, iter=  600] loss: 1.106
[epoch=  6, iter=  200] loss: 1.109
[epoch=  6, iter=  400] loss: 1.095
[epoch=  6, iter=  600] loss: 1.104
[epoch=  7, iter=  200] loss: 1.102
[epoch=  7, iter=  400] loss: 1.097
[epoch=  7, iter=  600] loss: 1.099
[epoch=  8, iter=  200] loss: 1.097
[epoch=  8, iter=  400] loss: 1.101
[epoch=  8, iter=  600] loss: 1.109
[epoch=  9, iter=  200] loss: 1.096
[epoch=  9, iter=  400] loss: 1.090
[epoch=  9, iter=  600] loss: 1.105
[epoch= 10, iter=  200] loss

In [18]:
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["test"]:
        img, target = img.to(device), target.to(device)
        
        # make prediction
        pred = net(img)
        
        # accumulate
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

Accuracy of the network on the 10000 test images: 68.65%


## 4. Variant 3
Changing the data augementation.  
Ref: https://zhuanlan.zhihu.com/p/49180361

In [4]:
# prepare datasets
# preprocessing pipeline for input images
transformation = dict()
for data_type in ("train", "test"):
    is_train = data_type=="train"
    transformation[data_type] = tv_transforms.Compose(([
        tv_transforms.RandomRotation(degrees=15),
        tv_transforms.RandomHorizontalFlip(),
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ] if is_train else []) + 
    [
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
    
num_workers = 2
batch_size = 64
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type=="train"
    dataset[data_type] = tv_datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=transformation[data_type],
    )
    loader[data_type] = torch.utils.data.DataLoader(
        dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
    )


Files already downloaded and verified
Files already downloaded and verified


In [7]:
# some experimental setup
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_epochs = 128
print_every = 200

optim_name = "AdamW"
optim_kwargs = dict(
    lr=3e-4,
    weight_decay=1e-6,
)

In [8]:
# our network architecture
net = nn.Sequential(
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 10),
)

# move to device
net.to(device)

# print the number of parameters
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

number of parameters: 7.28M


In [9]:
# the network optimizer
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# loss function
criterion = nn.CrossEntropyLoss()

# the network lr_scheduler
scheduler = lr_scheduler.StepLR(optimizer, step_size=15,gamma=0.1)

In [None]:
# training loop
net.train()
for epoch in range(num_epochs):
    scheduler.step()
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)

        pred = net(img)
        loss = criterion(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
    
print("Finished Training")

[epoch=  1, iter=  200] loss: 2.192
[epoch=  1, iter=  400] loss: 1.951
[epoch=  1, iter=  600] loss: 1.837
[epoch=  2, iter=  200] loss: 1.655
[epoch=  2, iter=  400] loss: 1.617
[epoch=  2, iter=  600] loss: 1.533
[epoch=  3, iter=  200] loss: 1.448
[epoch=  3, iter=  400] loss: 1.424
[epoch=  3, iter=  600] loss: 1.388
[epoch=  4, iter=  200] loss: 1.308
[epoch=  4, iter=  400] loss: 1.305
[epoch=  4, iter=  600] loss: 1.256
[epoch=  5, iter=  200] loss: 1.214
[epoch=  5, iter=  400] loss: 1.191
[epoch=  5, iter=  600] loss: 1.142
[epoch=  6, iter=  200] loss: 1.128
[epoch=  6, iter=  400] loss: 1.099
[epoch=  6, iter=  600] loss: 1.081
[epoch=  7, iter=  200] loss: 1.055
[epoch=  7, iter=  400] loss: 1.037
[epoch=  7, iter=  600] loss: 1.026
[epoch=  8, iter=  200] loss: 0.981
[epoch=  8, iter=  400] loss: 0.974
[epoch=  8, iter=  600] loss: 0.981
[epoch=  9, iter=  200] loss: 0.916
[epoch=  9, iter=  400] loss: 0.930
[epoch=  9, iter=  600] loss: 0.918
[epoch= 10, iter=  200] loss

In [None]:
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["test"]:
        img, target = img.to(device), target.to(device)
        
        # make prediction
        pred = net(img)
        
        # accumulate
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")