Notebook for running MNIST classifier, template for working with the cluster

In [6]:
import numpy as np
import time
import torch
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets, transforms
from torch import nn, optim
import gzip

In [11]:
!nvidia-smi 

Mon Sep  6 10:05:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM3...  On   | 00000000:1E:00.0 Off |                    0 |
| N/A   48C    P0   174W / 350W |  15086MiB / 32510MiB |     98%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:23:00.0 Off |                    0 |
| N/A   40C    P0   285W / 350W |   8395MiB / 32510MiB |     93%      Default |
|       

In [12]:
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
torch.torch.cuda.set_device(device)
print("Using device " + str(torch.torch.cuda.current_device()) + "/" + str(torch.cuda.device_count())
      +", name: " + str(torch.cuda.get_device_name(0)))
#device = torch.torch.cuda.current_device()

Using device 6/16, name: Tesla V100-SXM3-32GB


In [3]:
# Needed for loading data
#%pip install idx2numpy
#%pip install ipywidgets
#%pip install tensorboard

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Load data
import idx2numpy
data_dir = "/home/arvid/data/mnist/"
train_imgs = idx2numpy.convert_from_file(data_dir+"train-images-idx3-ubyte")
train_labels = idx2numpy.convert_from_file(data_dir+"train-labels-idx1-ubyte")
test_imgs = idx2numpy.convert_from_file(data_dir+"t10k-images-idx3-ubyte")
test_labels = idx2numpy.convert_from_file(data_dir+"t10k-labels-idx1-ubyte")

# Convert to torch tensors
train_imgs = torch.tensor(train_imgs, dtype=torch.float64)/255
test_imgs = torch.tensor(test_imgs, dtype=torch.float64)/255
train_labels = torch.tensor(train_labels,dtype=torch.int32)
test_labels = torch.tensor(test_labels,dtype=torch.int32)

# Shuffle entire train set
if True:
    perm = torch.randperm(train_imgs.shape[0])
    train_imgs, train_labels = train_imgs[perm,:,:], train_labels[perm]

# Place in dict, use 70/30 train eval split
num_train = int(train_imgs.shape[0]*0.7)
data = {"train": {"images": train_imgs[:num_train], "labels": train_labels[:num_train]},
        "eval": {"images": train_imgs[num_train:], "labels": train_labels[num_train:]},
        "test": {"images": test_imgs, "labels": test_labels}}



In [14]:
# Define ImageDataset Class

from torch.utils.data import Dataset
from torchvision.transforms import Compose, Resize, ToTensor, RandomHorizontalFlip, RandomRotation, ToPILImage
class ImageDataset(Dataset):

    def __init__(self, data, split, im_size=28):
        self.split = split
        self.data = data[self.split]
        
        if split == "train":

            self.transform = Compose([
                Resize((im_size, im_size)),
                RandomHorizontalFlip(p=0.5),
                RandomRotation(degrees=45),
            ])
         
        elif self.split == "eval" or split == "test":
            self.transform = Compose([
                Resize((im_size, im_size)),
            ])
        
        
        if split == 'train':
            self.transform = Compose([
                Resize((im_size, im_size)),
                RandomHorizontalFlip(p=0.5),
                RandomRotation(degrees=45),
                ToTensor(),
            ])

        elif split == 'eval':
            self.transform = Compose([
                Resize((im_size, im_size)),
                ToTensor(),
            ])

        elif split == 'test':
            self.transform = Compose([
                Resize((im_size, im_size)),
                ToTensor(),
            ])

    def __len__(self):
        return self.data["images"].shape[0]

    def __getitem__(self, index):
        return {
            'image': self.data["images"][index,:,:].clone().detach().unsqueeze(0).tile([3,1,1]).to(dtype=torch.float),
            'label': self.data["labels"][index].clone().detach()
        }

In [15]:
# Prepare dataset & dataloader

from torch.utils.data import DataLoader

datasets = {}
dataloaders = {}
for split in ['train', 'eval', 'test']:
    datasets[split] = ImageDataset(data=data,
                                   split=split)

    dataloaders[split] = DataLoader(datasets[split],
                                    batch_size=32,
                                    shuffle=(split != 'test'),
                                    num_workers=4,
                                    pin_memory=False)# Was True before.

In [16]:
# Define model class

from torch import nn
from torchvision import models


class Net(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        model = models.resnet18(pretrained=True, progress=True)
        #print(model)

        # Adapt last layer to multi-label task
        model.fc = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=model.fc.in_features, out_features=128),
            nn.Linear(in_features=128, out_features=n_classes)
        )

        self.base_model = model
        self.sigm = nn.Sigmoid()


    def forward(self, x):
        return self.sigm(self.base_model(x))

In [17]:
# Define Trainer class

import time
import tqdm
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
class Trainer():

    @staticmethod
    def train_model(model, dataloaders, objective, optimizer, device, run, num_epochs=25):

        # Prepare
        since = time.time()
        best_model_wts = model.state_dict()
        best_loss = 999999
        running_losses = {'train': [], 'eval': []}
        
        # Loop
        writer = SummaryWriter('runs/mnist/'+run+"/")
        for epoch in range(num_epochs):

            # Progress
            print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
            print('-' * 10)

            for split in ['train', 'eval']:
                torch.set_grad_enabled(split == 'train')
                model.train(split=='train')
                total_loss = 0.0

                # Load and train/eval on split
                for data in tqdm.tqdm(dataloaders[split],colour='ffffff'):

                    # Put batch on device
                    inputs, labels = data['image'].to(device), data['label'].to(device, dtype=torch.int64)
                    labels = F.one_hot(labels, num_classes=10).to(dtype=torch.float)

                    # Forward and calc loss
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    
                    loss = objective(outputs, labels)
                    # Backprop if train
                    if split == 'train':
                        loss.backward()
                        optimizer.step()

                    total_loss += loss.item()
                
                
                
                # Record
                epoch_loss = total_loss / dataloaders[split].dataset.__len__()
                running_losses[split].append(epoch_loss)
                print('{} Loss: {:.4f}'.format(split, epoch_loss))
                writer.add_scalar('Loss/'+split, epoch_loss, epoch+1)
                
                # Deep copy if best loss
                if split == 'eval' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = model.state_dict()



        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best Validation Loss: {:4f}'.format(best_loss))


        # load best model weights
        model.load_state_dict(best_model_wts)
        return model

In [18]:
# Init model
model = Net(n_classes=10).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
objective = torch.nn.BCELoss()

trained_model = Trainer.train_model(model=model,
                                    dataloaders=dataloaders,
                                    objective=objective,
                                    optimizer=optimizer,
                                    device=device,
                                    run="name_of_run",
                                    num_epochs=25)



Epoch 0/24
----------


  self.colour = colour
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
100%|███████████████████████████████████████| 1313/1313 [01:01<00:00, 21.49it/s]


train Loss: 0.0034


100%|█████████████████████████████████████████| 563/563 [00:07<00:00, 79.53it/s]


eval Loss: 0.0004

Epoch 1/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:20<00:00, 16.25it/s]


train Loss: 0.0006


100%|█████████████████████████████████████████| 563/563 [00:07<00:00, 71.53it/s]


eval Loss: 0.0003

Epoch 2/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:03<00:00, 20.78it/s]


train Loss: 0.0004


100%|█████████████████████████████████████████| 563/563 [00:07<00:00, 71.94it/s]


eval Loss: 0.0002

Epoch 3/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:15<00:00, 17.29it/s]


train Loss: 0.0003


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 66.02it/s]


eval Loss: 0.0002

Epoch 4/24
----------


100%|███████████████████████████████████████| 1313/1313 [00:59<00:00, 21.88it/s]


train Loss: 0.0002


100%|█████████████████████████████████████████| 563/563 [00:07<00:00, 77.02it/s]


eval Loss: 0.0002

Epoch 5/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:24<00:00, 15.62it/s]


train Loss: 0.0001


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 64.34it/s]


eval Loss: 0.0002

Epoch 6/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:00<00:00, 21.59it/s]


train Loss: 0.0001


100%|█████████████████████████████████████████| 563/563 [00:06<00:00, 82.57it/s]


eval Loss: 0.0002

Epoch 7/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:12<00:00, 18.05it/s]


train Loss: 0.0001


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 61.92it/s]


eval Loss: 0.0002

Epoch 8/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:19<00:00, 16.59it/s]


train Loss: 0.0001


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 64.70it/s]


eval Loss: 0.0002

Epoch 9/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:49<00:00, 12.01it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 62.05it/s]


eval Loss: 0.0002

Epoch 10/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:25<00:00, 15.36it/s]


train Loss: 0.0001


100%|█████████████████████████████████████████| 563/563 [00:11<00:00, 48.08it/s]


eval Loss: 0.0001

Epoch 11/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:44<00:00, 12.58it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 62.62it/s]


eval Loss: 0.0001

Epoch 12/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:37<00:00, 13.43it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:11<00:00, 47.83it/s]


eval Loss: 0.0001

Epoch 13/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:32<00:00, 14.23it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 61.48it/s]


eval Loss: 0.0001

Epoch 14/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:46<00:00, 12.35it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:10<00:00, 51.49it/s]


eval Loss: 0.0001

Epoch 15/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:24<00:00, 15.56it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 57.02it/s]


eval Loss: 0.0001

Epoch 16/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:47<00:00, 12.25it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 62.43it/s]


eval Loss: 0.0002

Epoch 17/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:30<00:00, 14.47it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:11<00:00, 47.51it/s]


eval Loss: 0.0001

Epoch 18/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:39<00:00, 13.25it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 62.22it/s]


eval Loss: 0.0001

Epoch 19/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:44<00:00, 12.54it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:12<00:00, 46.18it/s]


eval Loss: 0.0001

Epoch 20/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:27<00:00, 14.97it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 61.80it/s]


eval Loss: 0.0001

Epoch 21/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:23<00:00, 15.71it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:09<00:00, 61.13it/s]


eval Loss: 0.0002

Epoch 22/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:24<00:00, 15.58it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 64.66it/s]


eval Loss: 0.0001

Epoch 23/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:23<00:00, 15.67it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 63.23it/s]


eval Loss: 0.0002

Epoch 24/24
----------


100%|███████████████████████████████████████| 1313/1313 [01:24<00:00, 15.62it/s]


train Loss: 0.0000


100%|█████████████████████████████████████████| 563/563 [00:08<00:00, 65.03it/s]

eval Loss: 0.0002
Training complete in 39m 35s
Best Validation Loss: 0.000133





In [None]:
import os
torch.save(model.state_dict(),"/home/arvid/models/mnist/model_weights")

In [2]:
%pip install tensorboard

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
%load_ext tensorboard
%tensorboard --logdir /home/arvid/code/mnist_test/runs/can_specify/

Reusing TensorBoard on port 6006 (pid 81658), started 2 days, 18:05:31 ago. (Use '!kill 81658' to kill it.)