In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms as T
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from model import Lenet
from data_loader import get_data_loaders
from utils import save_checkpoint

writer = SummaryWriter("runs/fashion_mnist_experiment")

In [3]:
train_loader, val_loader, test_loader = get_data_loaders(data_dir="../data")

Train size: 48000
Validation size: 12000
Test size: 10000


In [4]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
print(f"Using {device} device")

Using mps device


In [6]:
model = Lenet()
model.layer_summary((1, 1, 28, 28))

conv1      Conv2d          output shape:torch.Size([1, 6, 28, 28])
pool1      AvgPool2d       output shape:torch.Size([1, 6, 14, 14])
conv2      Conv2d          output shape:torch.Size([1, 16, 10, 10])
pool2      AvgPool2d       output shape:torch.Size([1, 16, 5, 5])
flatten    Flatten         output shape:torch.Size([1, 400])
fc1        Linear          output shape:torch.Size([1, 120])
fc2        Linear          output shape:torch.Size([1, 84])
fc3        Linear          output shape:torch.Size([1, 10])


In [7]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
def train(dataloader, model, loss_fn, optimizer, epoch):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Forward pass
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log to Tensorboard
        writer.add_scalar('Loss/train', loss.item(), epoch * len(dataloader) + batch)

        if batch % 100 == 0:
            loss, current = loss.item(), int((batch * len(X)) / size)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [10]:
def evaluate(dataloader, model, loss_fn, epoch, phase="Validation"):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    total_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            total_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    avg_loss = total_loss / num_batches
    accuracy = correct / size
    print(f"{phase} Accuracy: {100*accuracy:.2f}%, Avg loss: {avg_loss:.6f}")

    # Log to TensorBoard
    writer.add_scalar(f'Loss/{phase}', avg_loss, epoch)
    writer.add_scalar(f'Accuracy/{phase}', accuracy, epoch)
    
    return avg_loss, accuracy


In [11]:
epoch = 10

model = model.to(device)

for t in range(epoch):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer, epoch=t)
    eval_loss, eval_acc = evaluate(val_loader, model, loss_fn, epoch=t)
    save_checkpoint(t, model, optimizer, eval_loss)

print("Done!")
writer.close()

Epoch 1
-------------------------------
loss: 2.371521  [    0/48000]
loss: 2.175277  [    0/48000]
loss: 1.334751  [    0/48000]
loss: 1.120082  [    0/48000]
loss: 0.767620  [    0/48000]
loss: 0.920900  [    0/48000]
loss: 1.008110  [    0/48000]
loss: 0.816122  [    0/48000]
Validation Accuracy: 71.71%, Avg loss: 0.772578
Checkpoint saved at checkpoints/lenet_epoch_0.pth
Epoch 2
-------------------------------
loss: 0.682111  [    0/48000]
loss: 0.758710  [    0/48000]
loss: 0.735206  [    0/48000]
loss: 0.795315  [    0/48000]
loss: 0.655404  [    0/48000]
loss: 0.617429  [    0/48000]
loss: 0.540230  [    0/48000]
loss: 0.558959  [    0/48000]
Validation Accuracy: 74.85%, Avg loss: 0.632044
Checkpoint saved at checkpoints/lenet_epoch_1.pth
Epoch 3
-------------------------------
loss: 0.615358  [    0/48000]
loss: 0.470082  [    0/48000]
loss: 0.483386  [    0/48000]
loss: 0.538534  [    0/48000]
loss: 0.638388  [    0/48000]
loss: 0.524421  [    0/48000]
loss: 0.475866  [    0/4

In [12]:
import tensorboard
tensorboard.notebook.list() 

No known TensorBoard instances running.


In [None]:
tensorboard.notebook.start("--logdir=runs/fashion_mnist_experiment")

In [14]:
tensorboard.notebook.display(port=6006)

Selecting TensorBoard with logdir runs/fashion_mnist_experiment (started 0:00:11 ago; port 6006, pid 95494).


In [20]:
import os
os.system("pkill -f tensorboard")

0