In [1]:
import torch
from torchvision import datasets, transforms


path = "data"

# Mean and std arguments handed to transforms.Normalize for both splits.
norm_mean = (0.4915, 0.4823, 0.4468)
norm_std = (0.2470, 0.2435, 0.2616)

# Training split: converted to tensors, then normalized.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
cifar10 = datasets.CIFAR10(path, train=True, download=True,
                           transform=train_transform)

# Validation split (train=False) goes through the same two steps.
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
cifar10_val = datasets.CIFAR10(path, train=False, download=True,
                               transform=val_transform)

In [None]:
# Normalize requires `mean` and `std`; calling it with no arguments raises a
# TypeError and stops a top-to-bottom run. Show the transform configured with
# the same statistics used for the datasets above.
transforms.Normalize((0.4915, 0.4823, 0.4468),
                     (0.2470, 0.2435, 0.2616))

In [2]:
# Reduce CIFAR-10 to two classes: original labels 0 and 2 are kept and
# remapped to 0 and 1 through label_map.
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']
kept_labels = (0, 2)

cifar2 = []
for img, label in cifar10:
    if label in kept_labels:
        cifar2.append((img, label_map[label]))

cifar2_val = []
for img, label in cifar10_val:
    if label in kept_labels:
        cifar2_val.append((img, label_map[label]))

In [3]:
# Backend preference order: MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='mps')

In [4]:
import torch.nn.functional as F
import torch.nn as nn

class Net(nn.Module):
    """Small CNN assembled from ``nn`` submodules.

    Two conv -> tanh -> max-pool stages (3 -> 16 -> 8 channels) feed a
    32-unit tanh layer and a final 2-output linear layer. ``fc1`` takes
    8*8*8 features, which matches 32x32 inputs after the two 2x poolings.
    """

    def __init__(self):
        super().__init__()
        # Convolutions use 3x3 kernels with padding=1.
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)

        # One pooling module per stage, each with a 2x2 window.
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        # Separate activation modules for the two conv stages and fc1.
        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        self.tanh3 = nn.Tanh()

        self.fc1 = nn.Linear(8*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Run ``x`` through both conv stages and the two linear layers."""
        stage1 = self.conv1(x)
        stage1 = self.tanh1(stage1)
        stage1 = self.pool1(stage1)

        stage2 = self.conv2(stage1)
        stage2 = self.tanh2(stage2)
        stage2 = self.pool2(stage2)

        # Flatten to rows of fc1.in_features (= 8*8*8) values.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = self.tanh3(self.fc1(flat))
        return self.fc2(hidden)

In [5]:
# Build the network on the selected device and count elements per parameter tensor.
model = Net()
model = model.to(device)

numel_list = []
for p in model.parameters():
    numel_list.append(p.numel())

# Displayed as (total parameter count, per-tensor counts).
(sum(numel_list), numel_list)

(18090, [432, 16, 1152, 8, 16384, 32, 64, 2])

In [6]:
import torch.nn.functional as F


class Net(nn.Module):
    """Same two-conv / two-linear network, using the functional API.

    Only layers that hold parameters are registered as submodules; tanh and
    max-pooling are called through ``torch.nn.functional`` in ``forward``.
    """

    def __init__(self):
        super().__init__()
        # 3 -> 16 -> 8 channels, 3x3 kernels with padding=1.
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)

        # Classifier head: 8*8*8 flattened features -> 32 -> 2.
        self.fc1 = nn.Linear(8*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the ``fc2`` output for input ``x``."""
        stage1 = F.tanh(self.conv1(x))
        stage1 = F.max_pool2d(stage1, 2)

        stage2 = F.tanh(self.conv2(stage1))
        stage2 = F.max_pool2d(stage2, 2)

        # Flatten to rows of fc1.in_features (= 8*8*8) values.
        flat = stage2.view(-1, self.fc1.in_features)
        hidden = F.tanh(self.fc1(flat))
        return self.fc2(hidden)

In [52]:
from torch.utils.data import DataLoader

# Each loader gets its own generator seeded with 42; batches hold 64 samples.
train_generator = torch.Generator()
train_generator.manual_seed(42)
train_loader = DataLoader(
    cifar2,
    batch_size=64,
    shuffle=True,  # only the training data is shuffled
    generator=train_generator,
)

val_generator = torch.Generator()
val_generator.manual_seed(42)
val_loader = DataLoader(
    cifar2_val,
    batch_size=64,
    shuffle=False,
    generator=val_generator,
)

In [53]:
# Smoke test: run one image through an untrained network.
model = Net()
# cifar2[0][0] is a single image without a batch axis; add one explicitly so
# the call does not depend on the layers accepting unbatched input. The
# result has one row, as before.
model(cifar2[0][0].unsqueeze(0))


tensor([[-0.1049, -0.0315]], grad_fn=<AddmmBackward0>)

In [54]:
import datetime
import torch.optim as optim

# Baseline run: a fresh Net trained with plain SGD and cross-entropy loss.
num_epochs = 100
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(num_epochs):
    train_loss = 0.0  # running sum of per-batch losses for this epoch

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss = train_loss + loss.item()

    # Report the mean batch loss on the first epoch and on every tenth epoch.
    is_log_epoch = epoch == 0 or (epoch + 1) % 10 == 0
    if is_log_epoch:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, Train Loss {train_loss / len(train_loader)}")


2025-08-11 21:51:33.707841 Epoch 1, Train Loss 0.5703767980359922
2025-08-11 21:51:37.456472 Epoch 10, Train Loss 0.34042285325800536
2025-08-11 21:51:41.235358 Epoch 20, Train Loss 0.2998743922847092
2025-08-11 21:51:45.366739 Epoch 30, Train Loss 0.276204632251126
2025-08-11 21:51:49.581746 Epoch 40, Train Loss 0.253474707910969
2025-08-11 21:51:53.803373 Epoch 50, Train Loss 0.23362333564811452
2025-08-11 21:51:58.104837 Epoch 60, Train Loss 0.21777245549449495
2025-08-11 21:52:02.358042 Epoch 70, Train Loss 0.20558011517593056
2025-08-11 21:52:06.566790 Epoch 80, Train Loss 0.1929951187958763
2025-08-11 21:52:10.959438 Epoch 90, Train Loss 0.18141046130828037
2025-08-11 21:52:15.266507 Epoch 100, Train Loss 0.16724360046113373


In [55]:
def accuracy(model: nn.Module, criterion, optimizer, data_loader: DataLoader):
    """Print the fraction of samples in ``data_loader`` that ``model`` gets right.

    The predicted class of each sample is the argmax of the model output
    along dim 1. The ratio is printed, not returned.

    Args:
        model: Network to evaluate.
        criterion: Unused; kept so existing call sites keep working.
        optimizer: Unused; kept so existing call sites keep working.
        data_loader: Iterable yielding ``(imgs, labels)`` batches.
    """
    # Move batches to wherever the model's parameters live instead of a
    # notebook-level device, so a model left on the CPU can be evaluated too.
    # A model without parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct = 0
    total = 0

    # Remember the current mode so evaluation does not leave the model
    # switched to eval() behind the caller's back.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for imgs, labels in data_loader:
                if target_device is not None:
                    imgs = imgs.to(target_device)
                    labels = labels.to(target_device)
                outputs = model(imgs)

                pred = torch.argmax(outputs, dim=1)

                total += labels.shape[0]
                correct += (pred == labels).sum()
    finally:
        model.train(was_training)

    print(correct / total)


In [56]:
# Print accuracy for the training data first, then for the validation data.
for loader in (train_loader, val_loader):
    accuracy(model, criterion, optimizer, loader)

tensor(0.9355, device='mps:0')
tensor(0.8895, device='mps:0')


In [57]:
# Persist the model's state dict (not the module object itself).
trained_state = model.state_dict()
torch.save(trained_state, "data/birds_vs_airplanes.pt")

In [58]:
# Rebuild the architecture, then restore the saved weights into it.
model = Net()
# The checkpoint was written from a model living on `device`. Mapping its
# tensors to the CPU (where this fresh Net lives) lets the file load on a
# machine that lacks that accelerator backend.
model.load_state_dict(
    torch.load("data/birds_vs_airplanes.pt", map_location="cpu")
)

<All keys matched successfully>

In [63]:
class NetWidth(nn.Module):
    """Two-stage CNN whose convolution widths are configurable.

    Args:
        channel: Number of output channels of ``conv1``.
        conv2_channel: Number of output channels of ``conv2``. Defaults to
            16, the value that used to be hard-coded, so ``NetWidth(n)``
            builds the same layers as before.

    ``fc1`` takes ``conv2_channel * 8 * 8`` features, which matches 32x32
    inputs after the two 2x max-poolings.
    """

    def __init__(self, channel: int, conv2_channel: int = 16):
        super().__init__()

        self.conv1 = nn.Conv2d(3, channel, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channel, conv2_channel, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(conv2_channel*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the ``fc2`` output for input ``x``."""
        out = F.max_pool2d(F.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(F.tanh(self.conv2(out)), 2)
        # Flatten using the width fc1 was built with, so the reshape always
        # agrees with the configured conv2 width.
        out = out.view(-1, self.fc1.in_features)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [64]:
# NetWidth(32) trained with SGD; an explicit L2 penalty over all parameters
# is added to the cross-entropy loss.
num_epochs = 100
model = NetWidth(32).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(num_epochs):
    train_loss = 0.0  # running sum of the penalized per-batch losses

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)

        # Cross-entropy plus 0.001 times the sum of squared parameter values.
        data_loss = criterion(outputs, labels)
        l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
        loss = data_loss + 0.001*l2_norm

        loss.backward()
        optimizer.step()
        train_loss = train_loss + loss.item()

    # Log on the first epoch and on every tenth epoch.
    is_log_epoch = (epoch + 1) % 10 == 0 or epoch == 0
    if is_log_epoch:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
              f"Train Loss {train_loss / len(train_loader)}")

2025-08-11 21:54:00.303983 Epoch 1,  Train Loss 0.5841960660211599
2025-08-11 21:54:06.021937 Epoch 10,  Train Loss 0.3507759719137933
2025-08-11 21:54:12.304698 Epoch 20,  Train Loss 0.3119001829889929
2025-08-11 21:54:18.666029 Epoch 30,  Train Loss 0.28346171481594157
2025-08-11 21:54:25.084196 Epoch 40,  Train Loss 0.2611427181845258
2025-08-11 21:54:31.294888 Epoch 50,  Train Loss 0.237025527485237
2025-08-11 21:54:37.501029 Epoch 60,  Train Loss 0.22267879801950638
2025-08-11 21:54:43.756725 Epoch 70,  Train Loss 0.20558584524188073
2025-08-11 21:54:49.953732 Epoch 80,  Train Loss 0.18976376499909503
2025-08-11 21:54:56.274946 Epoch 90,  Train Loss 0.17583720343317955
2025-08-11 21:55:02.709130 Epoch 100,  Train Loss 0.1636143454415783


In [65]:
# Print accuracy for the training data first, then for the validation data.
for loader in (train_loader, val_loader):
    accuracy(model, criterion, optimizer, loader)

tensor(0.9565, device='mps:0')
tensor(0.8965, device='mps:0')


In [67]:
# NetWidth(64) trained with SGD; an explicit L2 penalty over all parameters
# is added to the cross-entropy loss.
num_epochs = 100
model = NetWidth(64).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(num_epochs):
    train_loss = 0.0  # running sum of the penalized per-batch losses

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)

        # Cross-entropy plus 0.001 times the sum of squared parameter values.
        data_loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum() for p in model.parameters())
        loss = data_loss + 0.001*l2_norm

        loss.backward()
        optimizer.step()
        train_loss = train_loss + loss.item()

    # Log on the first epoch and on every tenth epoch.
    is_log_epoch = (epoch + 1) % 10 == 0 or epoch == 0
    if is_log_epoch:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
              f"Train Loss {train_loss / len(train_loader)}")

2025-08-11 21:56:12.564025 Epoch 1,  Train Loss 0.5729605356219468
2025-08-11 21:56:18.802191 Epoch 10,  Train Loss 0.34454719456517774
2025-08-11 21:56:25.843951 Epoch 20,  Train Loss 0.3066430930878706
2025-08-11 21:56:32.771288 Epoch 30,  Train Loss 0.2768849319523307
2025-08-11 21:56:39.772623 Epoch 40,  Train Loss 0.25399489710285406
2025-08-11 21:56:46.714423 Epoch 50,  Train Loss 0.23087473118760785
2025-08-11 21:56:53.765499 Epoch 60,  Train Loss 0.21044086370688336
2025-08-11 21:57:00.791801 Epoch 70,  Train Loss 0.1935175801538358
2025-08-11 21:57:07.686752 Epoch 80,  Train Loss 0.17906437249510151
2025-08-11 21:57:14.778193 Epoch 90,  Train Loss 0.16627111357108804
2025-08-11 21:57:21.588903 Epoch 100,  Train Loss 0.15376685184847777


In [68]:
# Print accuracy for the training data first, then for the validation data.
for loader in (train_loader, val_loader):
    accuracy(model, criterion, optimizer, loader)

tensor(0.9644, device='mps:0')
tensor(0.8925, device='mps:0')


In [77]:
class NetDropout(nn.Module):
    """Two-stage CNN with dropout after each tanh, before each max-pool.

    Args:
        channel_size: Number of output channels of ``conv1``. ``conv2``
            outputs ``channel_size // 2`` channels.

    ``fc1`` takes ``(channel_size // 2) * 8 * 8`` features, which matches
    32x32 inputs after the two 2x max-poolings.
    """

    def __init__(self, channel_size: int):
        super().__init__()
        self.channel_size = channel_size
        # Channels leaving conv2. The flattened width is derived from this
        # value; the earlier ``channel_size*8*8 // 2`` only agreed with it
        # for even channel_size and mis-sized fc1 for odd values.
        half_channels = self.channel_size // 2

        self.conv1 = nn.Conv2d(3, self.channel_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.channel_size, half_channels, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(half_channels*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

        # A single dropout module (p=0.2) is reused after both conv stages.
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return the ``fc2`` output for input ``x``."""
        out = F.tanh(self.conv1(x))
        out = F.max_pool2d(self.dropout(out), kernel_size=2)
        out = F.tanh(self.conv2(out))
        out = F.max_pool2d(self.dropout(out), kernel_size=2)
        # Flatten using the width fc1 was built with.
        out = out.view(-1, self.fc1.in_features)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [78]:
# NetDropout(64) trained with SGD; an explicit L2 penalty over all parameters
# is added to the cross-entropy loss.
num_epochs = 100
model = NetDropout(64).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(num_epochs):
    train_loss = 0.0  # running sum of the penalized per-batch losses

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)

        # Cross-entropy plus 0.001 times the sum of squared parameter values.
        data_loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum() for p in model.parameters())
        loss = data_loss + 0.001*l2_norm

        loss.backward()
        optimizer.step()
        train_loss = train_loss + loss.item()

    # Log on the first epoch and on every tenth epoch.
    is_log_epoch = (epoch + 1) % 10 == 0 or epoch == 0
    if is_log_epoch:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
              f"Train Loss {train_loss / len(train_loader)}")

2025-08-11 22:33:34.617346 Epoch 1,  Train Loss 0.5741230517056337
2025-08-11 22:33:40.711948 Epoch 10,  Train Loss 0.35681581895822173
2025-08-11 22:33:47.319737 Epoch 20,  Train Loss 0.3113867296914386
2025-08-11 22:33:54.002521 Epoch 30,  Train Loss 0.28294802234051336
2025-08-11 22:34:00.713004 Epoch 40,  Train Loss 0.2621251658364466
2025-08-11 22:34:07.335212 Epoch 50,  Train Loss 0.24592957878188723
2025-08-11 22:34:14.004562 Epoch 60,  Train Loss 0.22858043105169468
2025-08-11 22:34:20.535402 Epoch 70,  Train Loss 0.21547605097293854
2025-08-11 22:34:27.172476 Epoch 80,  Train Loss 0.1968964502500121
2025-08-11 22:34:33.959354 Epoch 90,  Train Loss 0.18962460694609173
2025-08-11 22:34:40.603311 Epoch 100,  Train Loss 0.17678017254657807


In [79]:
# Print accuracy for the training data first, then for the validation data.
for loader in (train_loader, val_loader):
    accuracy(model, criterion, optimizer, loader)

tensor(0.8803, device='mps:0')
tensor(0.8520, device='mps:0')


In [84]:
class NetBatchNorm(nn.Module):
    """Two-stage CNN with batch normalization between each conv and its tanh.

    Args:
        channel_size: Number of output channels of ``conv1``. ``conv2``
            outputs ``channel_size // 2`` channels.

    ``fc1`` takes ``(channel_size // 2) * 8 * 8`` features, which matches
    32x32 inputs after the two 2x max-poolings.
    """

    def __init__(self, channel_size: int):
        super().__init__()
        self.channel_size = channel_size
        # Channels leaving conv2. The flattened width is derived from this
        # value; the earlier ``channel_size*8*8//2`` only agreed with it for
        # even channel_size and mis-sized fc1 for odd values.
        half_channels = self.channel_size // 2

        self.conv1 = nn.Conv2d(3, self.channel_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.channel_size, half_channels, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(half_channels*8*8, 32)
        self.fc2 = nn.Linear(32, 2)

        # One BatchNorm2d per conv layer, sized to that layer's output channels.
        self.batch_norm1 = nn.BatchNorm2d(self.channel_size)
        self.batch_norm2 = nn.BatchNorm2d(half_channels)

    def forward(self, x):
        """Return the ``fc2`` output for input ``x``."""
        out = self.batch_norm1(self.conv1(x))
        out = F.max_pool2d(F.tanh(out), 2)
        out = self.batch_norm2(self.conv2(out))
        out = F.max_pool2d(F.tanh(out), 2)
        # Flatten using the width fc1 was built with.
        out = out.view(-1, self.fc1.in_features)
        out = F.tanh(self.fc1(out))
        out = self.fc2(out)
        return out

In [85]:
# NetBatchNorm(64) trained with SGD; an explicit L2 penalty over all
# parameters is added to the cross-entropy loss.
num_epochs = 100
model = NetBatchNorm(64).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(num_epochs):
    train_loss = 0.0  # running sum of the penalized per-batch losses

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)

        # Cross-entropy plus 0.001 times the sum of squared parameter values.
        data_loss = criterion(outputs, labels)
        l2_norm = sum((p ** 2).sum() for p in model.parameters())
        loss = data_loss + 0.001*l2_norm

        loss.backward()
        optimizer.step()
        train_loss = train_loss + loss.item()

    # Log on the first epoch and on every tenth epoch.
    is_log_epoch = (epoch + 1) % 10 == 0 or epoch == 0
    if is_log_epoch:
        print(f"{datetime.datetime.now()} Epoch {epoch + 1}, ",
              f"Train Loss {train_loss / len(train_loader)}")

2025-08-11 22:51:13.083200 Epoch 1,  Train Loss 0.5621445676323714
2025-08-11 22:51:21.040803 Epoch 10,  Train Loss 0.38095596916736313
2025-08-11 22:51:29.713741 Epoch 20,  Train Loss 0.30905323746098073
2025-08-11 22:51:38.223359 Epoch 30,  Train Loss 0.2510499076288977
2025-08-11 22:51:46.658219 Epoch 40,  Train Loss 0.20655566150215782
2025-08-11 22:51:56.081034 Epoch 50,  Train Loss 0.17033429206556575
2025-08-11 22:52:05.339261 Epoch 60,  Train Loss 0.1514132310440586
2025-08-11 22:52:13.846107 Epoch 70,  Train Loss 0.15066134236800444
2025-08-11 22:52:22.374216 Epoch 80,  Train Loss 0.12592637790426328
2025-08-11 22:52:30.960123 Epoch 90,  Train Loss 0.12960255843628743
2025-08-11 22:52:39.659727 Epoch 100,  Train Loss 0.19561852434068727


In [86]:
# Print accuracy for the training data first, then for the validation data.
for loader in (train_loader, val_loader):
    accuracy(model, criterion, optimizer, loader)

tensor(0.9581, device='mps:0')
tensor(0.8725, device='mps:0')
