In [1]:
import torch
from torchvision import datasets, transforms


path = "data"

cifar10 = datasets.CIFAR10(
    path, train=True, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4915, 0.4823, 0.4468),
                             (0.2470, 0.2435, 0.2616))
    ]))
cifar10_val = datasets.CIFAR10(
    path, train=False, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4915, 0.4823, 0.4468),
                             (0.2470, 0.2435, 0.2616))
    ]))

In [2]:
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']
cifar2 = [(img, label_map[label])
          for img, label in cifar10
          if label in [0, 2]]
cifar2_val = [(img, label_map[label])
              for img, label in cifar10_val
              if label in [0, 2]]

In [3]:
device = torch.device("mps" if torch.backends.mps.is_available() 
                      else "cuda" if torch.cuda.is_available() else "cpu")
device

device(type='mps')

In [4]:
import torch.nn.functional as F
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)

        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        self.tanh3 = nn.Tanh()

        self.fc1 = nn.Linear(8*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = self.pool1(self.tanh1(self.conv1(x)))
        out = self.pool2(self.tanh2(self.conv2(out)))
        out = out.view(-1, 8*8*8)
        out = self.tanh3(self.fc1(out))
        out = self.fc2(out)
        return out

In [5]:
model = Net().to(device)
numel_list = [p.numel() for p in model.parameters()]
sum(numel_list), numel_list

(18090, [432, 16, 1152, 8, 16384, 32, 64, 2])

In [6]:
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(8*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(F.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(F.tanh(self.conv2(out)), 2)
        out = out.view(-1, 8*8*8)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [7]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    cifar2, batch_size=64, shuffle=True,
    generator=torch.Generator().manual_seed(42)
)

val_loader = DataLoader(
    cifar2_val, batch_size=64, shuffle=False,
    generator=torch.Generator().manual_seed(42)
)

In [8]:
model = Net()
model(cifar2[0][0])


tensor([[ 0.1874, -0.1851]], grad_fn=<AddmmBackward0>)

In [9]:
import datetime
import torch.optim as optim

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, Train Loss {train_loss / len(train_loader)}")


2025-08-12 10:12:30.909250 Epoch 1, Train Loss 0.5918112795823699
2025-08-12 10:12:34.735330 Epoch 10, Train Loss 0.3288821424268613
2025-08-12 10:12:38.581586 Epoch 20, Train Loss 0.28984743242810485
2025-08-12 10:12:42.248288 Epoch 30, Train Loss 0.2649611172023093
2025-08-12 10:12:46.196647 Epoch 40, Train Loss 0.24246223940021672
2025-08-12 10:12:50.062158 Epoch 50, Train Loss 0.22538514359361805
2025-08-12 10:12:53.752036 Epoch 60, Train Loss 0.20812008311604238
2025-08-12 10:12:57.316554 Epoch 70, Train Loss 0.19473780682132502
2025-08-12 10:13:01.320570 Epoch 80, Train Loss 0.18073457821159605
2025-08-12 10:13:05.123898 Epoch 90, Train Loss 0.16830869837646273
2025-08-12 10:13:08.995319 Epoch 100, Train Loss 0.15329014974984395


In [10]:
def accuracy(model: nn.Module, criterion, optimizer, data_loader: DataLoader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for imgs, labels in data_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)

            pred = torch.argmax(outputs, dim=1)

            total += labels.shape[0]
            correct += (pred == labels).sum()
        print(correct / total)


In [11]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9431, device='mps:0')
tensor(0.8970, device='mps:0')


In [12]:
torch.save(model.state_dict(), "data/birds_vs_airplanes.pt")

In [13]:
model = Net()
model.load_state_dict(torch.load("data/birds_vs_airplanes.pt"))

<All keys matched successfully>

In [14]:
class NetWidth(nn.Module):
    def __init__(self, channel: int):
        super().__init__()

        self.conv1 = nn.Conv2d(3, channel, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channel, 16, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(16*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(F.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(F.tanh(self.conv2(out)), 2)
        out = out.view(-1, 16*8*8)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [15]:
model = NetWidth(32).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum(p.pow(2.0).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 10:13:10.090672 Epoch 1,  Train Loss 0.5600710063223626
2025-08-12 10:13:15.603009 Epoch 10,  Train Loss 0.35240418544620467
2025-08-12 10:13:21.653633 Epoch 20,  Train Loss 0.3079237469062684
2025-08-12 10:13:27.704296 Epoch 30,  Train Loss 0.27679746792574594
2025-08-12 10:13:33.661900 Epoch 40,  Train Loss 0.2527210355564288
2025-08-12 10:13:39.728241 Epoch 50,  Train Loss 0.22979798319802922
2025-08-12 10:13:45.506576 Epoch 60,  Train Loss 0.21202301988556127
2025-08-12 10:13:51.292396 Epoch 70,  Train Loss 0.1980568641310285
2025-08-12 10:13:57.208883 Epoch 80,  Train Loss 0.182036443358394
2025-08-12 10:14:03.161061 Epoch 90,  Train Loss 0.17018896664024158
2025-08-12 10:14:09.046950 Epoch 100,  Train Loss 0.15694105686845294


In [16]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9615, device='mps:0')
tensor(0.9020, device='mps:0')


In [17]:
model = NetWidth(64).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 10:14:10.050920 Epoch 1,  Train Loss 0.5727217782075238
2025-08-12 10:14:16.536800 Epoch 10,  Train Loss 0.34687424512805454
2025-08-12 10:14:23.613457 Epoch 20,  Train Loss 0.3023368180937068
2025-08-12 10:14:30.653289 Epoch 30,  Train Loss 0.27167767476124366
2025-08-12 10:14:37.799678 Epoch 40,  Train Loss 0.24472511270243652
2025-08-12 10:14:45.029960 Epoch 50,  Train Loss 0.22415560426985381
2025-08-12 10:14:52.040430 Epoch 60,  Train Loss 0.20565528237515954
2025-08-12 10:14:59.388173 Epoch 70,  Train Loss 0.18950387275522682
2025-08-12 10:15:06.838876 Epoch 80,  Train Loss 0.17461752312578213
2025-08-12 10:15:14.092162 Epoch 90,  Train Loss 0.15920183304578636
2025-08-12 10:15:21.303663 Epoch 100,  Train Loss 0.14840538106906187


In [18]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9554, device='mps:0')
tensor(0.8895, device='mps:0')


In [19]:
class NetDropout(nn.Module):
    def __init__(self, channel_size: int):
        super().__init__()
        self.channel_size = channel_size

        self.conv1 = nn.Conv2d(3, self.channel_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.channel_size, self.channel_size // 2, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(self.channel_size*8*8 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        out = F.tanh(self.conv1(x))
        out = F.max_pool2d(self.dropout(out), kernel_size=2)
        out = F.tanh(self.conv2(out))
        out = F.max_pool2d(self.dropout(out), kernel_size=2)
        out = out.view(-1, self.channel_size*8*8 // 2)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [20]:
model = NetDropout(64).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 10:15:22.549906 Epoch 1,  Train Loss 0.590435059397084
2025-08-12 10:15:28.979867 Epoch 10,  Train Loss 0.35816009333179255
2025-08-12 10:15:35.767618 Epoch 20,  Train Loss 0.31502072826312605
2025-08-12 10:15:42.830229 Epoch 30,  Train Loss 0.28279403174758716
2025-08-12 10:15:49.896384 Epoch 40,  Train Loss 0.26404883294917975
2025-08-12 10:15:57.226014 Epoch 50,  Train Loss 0.2407248307755039
2025-08-12 10:16:04.591848 Epoch 60,  Train Loss 0.22944018083393194
2025-08-12 10:16:11.667756 Epoch 70,  Train Loss 0.21601320262167864
2025-08-12 10:16:18.470614 Epoch 80,  Train Loss 0.1991027661474647
2025-08-12 10:16:25.703233 Epoch 90,  Train Loss 0.18825936417101294
2025-08-12 10:16:32.320858 Epoch 100,  Train Loss 0.18021356409332553


In [21]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.8479, device='mps:0')
tensor(0.7900, device='mps:0')


In [22]:
class NetBatchNorm(nn.Module):
    def __init__(self, channel_size: int):
        super().__init__()
        self.channel_size = channel_size

        self.conv1 = nn.Conv2d(3, self.channel_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.channel_size, self.channel_size // 2, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(self.channel_size*8*8//2, 32)
        self.fc2 = nn.Linear(32, 2)

        self.batch_norm1 = nn.BatchNorm2d(self.channel_size)
        self.batch_norm2 = nn.BatchNorm2d(self.channel_size // 2)

    def forward(self, x):
        out = self.batch_norm1(self.conv1(x))
        out = F.max_pool2d(F.tanh(out), 2)
        out = self.batch_norm2(self.conv2(out))
        out = F.max_pool2d(F.tanh(out), 2)
        out = out.view(-1, self.channel_size*8*8//2)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [23]:
model = NetBatchNorm(64).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 10:16:33.704816 Epoch 1,  Train Loss 0.5765107570180468
2025-08-12 10:16:41.715743 Epoch 10,  Train Loss 0.38334842757054954
2025-08-12 10:16:50.858252 Epoch 20,  Train Loss 0.3113369100792393
2025-08-12 10:16:59.686001 Epoch 30,  Train Loss 0.25396672470174775
2025-08-12 10:17:09.141269 Epoch 40,  Train Loss 0.21266671806384044
2025-08-12 10:17:17.893540 Epoch 50,  Train Loss 0.17404496821628254
2025-08-12 10:17:26.437496 Epoch 60,  Train Loss 0.1524861442625143
2025-08-12 10:17:35.221805 Epoch 70,  Train Loss 0.15110270707470597
2025-08-12 10:17:44.127946 Epoch 80,  Train Loss 0.12680904529276926
2025-08-12 10:17:53.661436 Epoch 90,  Train Loss 0.11981440468388758
2025-08-12 10:18:02.285013 Epoch 100,  Train Loss 0.11448996893729374


In [24]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9975, device='mps:0')
tensor(0.8810, device='mps:0')


In [26]:
class NetDepth(nn.Module):
    def __init__(self, size_channel):
        super().__init__()
        self.size_channel = size_channel

        self.conv1 = nn.Conv2d(3, self.size_channel,
                               kernel_size=3, padding=1)  # これによってH_out, W_outはもとのH, Wと同じになる
        self.conv2 = nn.Conv2d(self.size_channel, self.size_channel // 2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(self.size_channel // 2, self.size_channel // 4, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(self.size_channel*4*4//4, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        out = F.max_pool2d(F.relu(self.conv2(out)), kernel_size=2)
        out = F.max_pool2d(F.relu(self.conv3(out)), kernel_size=2)
        out = out.view(-1, 4*4*self.size_channel // 4)
        out = F.relu(self.fc1(out))
        out = F.relu(out)
        return out

In [46]:
model = NetDepth(64).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 11:22:15.356326 Epoch 1,  Train Loss 0.9985493649343017
2025-08-12 11:22:25.478655 Epoch 10,  Train Loss 0.390664311446202
2025-08-12 11:22:36.541396 Epoch 20,  Train Loss 0.3433063129900367
2025-08-12 11:22:47.626836 Epoch 30,  Train Loss 0.3149289628312846
2025-08-12 11:22:58.683283 Epoch 40,  Train Loss 0.2869580114723011
2025-08-12 11:23:09.709388 Epoch 50,  Train Loss 0.2678053154117742
2025-08-12 11:23:21.017933 Epoch 60,  Train Loss 0.24629823284544003
2025-08-12 11:23:32.261971 Epoch 70,  Train Loss 0.23102516486386585
2025-08-12 11:23:43.430490 Epoch 80,  Train Loss 0.21284462312224564
2025-08-12 11:23:54.621229 Epoch 90,  Train Loss 0.20501488580065927
2025-08-12 11:24:05.754583 Epoch 100,  Train Loss 0.18480163760435808


In [47]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9315, device='mps:0')
tensor(0.8900, device='mps:0')


In [32]:
class NetRes(nn.Module):
    def __init__(self, size_channel):
        super().__init__()
        self.size_channel = size_channel

        self.conv1 = nn.Conv2d(3, self.size_channel,
                               kernel_size=3, padding=1)  # これによってH_out, W_outはもとのH, Wと同じになる
        self.conv2 = nn.Conv2d(self.size_channel, self.size_channel // 2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(self.size_channel // 2, self.size_channel // 2, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(self.size_channel*4*4//2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        out = F.max_pool2d(F.relu(self.conv2(out)), kernel_size=2)
        out = F.max_pool2d(F.relu(self.conv3(out)) + out, kernel_size=2)
        out = out.view(-1, 4*4*self.size_channel // 2)
        out = F.relu(self.fc1(out))
        out = F.relu(out)
        return out

In [33]:
model = NetRes(64).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 10:36:50.396431 Epoch 1,  Train Loss 0.7249555359980103
2025-08-12 10:36:57.860998 Epoch 10,  Train Loss 0.36252541375008357
2025-08-12 10:37:05.783859 Epoch 20,  Train Loss 0.3170982123750031
2025-08-12 10:37:13.336205 Epoch 30,  Train Loss 0.2867797552400334
2025-08-12 10:37:20.966451 Epoch 40,  Train Loss 0.25437080338122736
2025-08-12 10:37:29.817282 Epoch 50,  Train Loss 0.23323178993668525
2025-08-12 10:37:38.361251 Epoch 60,  Train Loss 0.2123444275871204
2025-08-12 10:37:46.701598 Epoch 70,  Train Loss 0.19446334560775452
2025-08-12 10:37:55.264156 Epoch 80,  Train Loss 0.18002608878787157
2025-08-12 10:38:03.379643 Epoch 90,  Train Loss 0.16844888482314008
2025-08-12 10:38:11.268166 Epoch 100,  Train Loss 0.14966326040826786


In [34]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.9751, device='mps:0')
tensor(0.9045, device='mps:0')


In [41]:
class ResBlock(nn.Module):
    def __init__(self, channel_size):
        super().__init__()

        self.conv = nn.Conv2d(channel_size, channel_size, kernel_size=3, padding=1, bias=False)

        self.batch_norm = nn.BatchNorm2d(channel_size)
        torch.nn.init.kaiming_normal_(self.conv.weight, nonlinearity="relu")
        torch.nn.init.constant_(self.batch_norm.weight, 0.5)
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        out = self.conv(x)
        out = self.batch_norm(out)
        out = F.relu(out)
        return out + x

In [42]:
class NetResDeep(nn.Module):
    def __init__(self, channel_size, block_size):
        super().__init__()
        self.channel_size = channel_size

        self.conv1 = nn.Conv2d(3, channel_size, kernel_size=3, padding=1)

        self.resblocks = nn.Sequential(
            *(block_size * [ResBlock(channel_size)])
        )

        self.fc1 = nn.Linear(8*8*channel_size, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(F.relu(self.conv1(x)), 2)
        out = self.resblocks(out)
        out = F.max_pool2d(out, 2)
        out = out.view(-1, 8*8*self.channel_size)
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out

In [44]:
model = NetResDeep(64, 10).to(device)
optimizer = optim.SGD(model.parameters(), lr=3e-3)
criterion = nn.CrossEntropyLoss()

num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    for imgs, labels in train_loader:
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)

        loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum()
                      for p in model.parameters())
        loss = loss + 0.001*l2_norm
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
                f"Train Loss {train_loss / len(train_loader)}")

2025-08-12 11:17:48.034752 Epoch 1,  Train Loss 0.6660507004352132
2025-08-12 11:18:05.203418 Epoch 10,  Train Loss 0.41646351992704306
2025-08-12 11:18:24.165075 Epoch 20,  Train Loss 0.33987804108364567
2025-08-12 11:18:43.115134 Epoch 30,  Train Loss 0.25750148059076566
2025-08-12 11:19:02.118823 Epoch 40,  Train Loss 0.2139415982042908
2025-08-12 11:19:21.097710 Epoch 50,  Train Loss 0.19840348374312092
2025-08-12 11:19:40.235077 Epoch 60,  Train Loss 0.17599982554745522
2025-08-12 11:19:59.719357 Epoch 70,  Train Loss 0.2034333986081895
2025-08-12 11:20:18.776717 Epoch 80,  Train Loss 0.1729945426533936
2025-08-12 11:20:37.739307 Epoch 90,  Train Loss 0.1647917657710944
2025-08-12 11:20:56.671001 Epoch 100,  Train Loss 0.1604459656841436


In [45]:
accuracy(model, criterion, optimizer, train_loader)
accuracy(model, criterion, optimizer, val_loader)

tensor(0.8153, device='mps:0')
tensor(0.7975, device='mps:0')


In [49]:
help(nn.BatchNorm2d)

Help on class BatchNorm2d in module torch.nn.modules.batchnorm:

class BatchNorm2d(_BatchNorm)
 |  BatchNorm2d(num_features: int, eps: float = 1e-05, momentum: Optional[float] = 0.1, affine: bool = True, track_running_stats: bool = True, device=None, dtype=None) -> None
 |
 |  Applies Batch Normalization over a 4D input.
 |
 |  4D is a mini-batch of 2D inputs
 |  with additional channel dimension. Method described in the paper
 |  `Batch Normalization: Accelerating Deep Network Training by Reducing
 |  Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
 |
 |  .. math::
 |
 |      y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
 |
 |  The mean and standard-deviation are calculated per-dimension over
 |  the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
 |  of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
 |  to 1 and the elements of :math:`\beta` are set to 0.