Notebook for running an MNIST classifier; it also serves as a template for working with the cluster.

In [None]:
import numpy as np
import time
import torch
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets, transforms
from torch import nn, optim
import gzip

In [None]:
!nvidia-smi 

In [None]:
# Select the compute device: prefer GPU index 6, fall back to GPU 0 when that
# index does not exist, and to the CPU when CUDA is not available at all.
preferred_gpu = 6
if torch.cuda.is_available():
    gpu_index = preferred_gpu if preferred_gpu < torch.cuda.device_count() else 0
    device = torch.device('cuda:{}'.format(gpu_index))
    # Make the chosen GPU the current CUDA device
    torch.cuda.set_device(device)
    # Report the name of the GPU that was actually selected
    print("Using device " + str(torch.cuda.current_device()) + "/" + str(torch.cuda.device_count())
          + ", name: " + str(torch.cuda.get_device_name(device)))
else:
    # The torch.cuda.* calls above are only valid when CUDA is available
    device = torch.device('cpu')
    print("Using device cpu")

In [None]:
# Needed for loading data
#%pip install idx2numpy
#%pip install ipywidgets
#%pip install tensorboard

In [None]:
# Load data
import idx2numpy
data_dir = "/home/arvid/data/mnist/"

# Read each IDX file and convert it straight to a torch tensor:
# images become float64 divided by 255, labels become int32.
train_imgs = torch.tensor(
    idx2numpy.convert_from_file(data_dir+"train-images-idx3-ubyte"),
    dtype=torch.float64)/255
train_labels = torch.tensor(
    idx2numpy.convert_from_file(data_dir+"train-labels-idx1-ubyte"),
    dtype=torch.int32)
test_imgs = torch.tensor(
    idx2numpy.convert_from_file(data_dir+"t10k-images-idx3-ubyte"),
    dtype=torch.float64)/255
test_labels = torch.tensor(
    idx2numpy.convert_from_file(data_dir+"t10k-labels-idx1-ubyte"),
    dtype=torch.int32)

# Shuffle entire train set; images and labels share the same permutation
perm = torch.randperm(train_imgs.shape[0])
train_imgs = train_imgs[perm,:,:]
train_labels = train_labels[perm]

# Place in dict: the first 70% of the shuffled train set is used for
# training, the remaining 30% for evaluation
num_train = int(train_imgs.shape[0]*0.7)
data = {
    "train": {
        "images": train_imgs[:num_train],
        "labels": train_labels[:num_train],
    },
    "eval": {
        "images": train_imgs[num_train:],
        "labels": train_labels[num_train:],
    },
    "test": {
        "images": test_imgs,
        "labels": test_labels,
    },
}



In [None]:
# Define ImageDataset Class

from torch.utils.data import Dataset
from torchvision.transforms import Compose, Resize, ToTensor, RandomHorizontalFlip, RandomRotation, ToPILImage
class ImageDataset(Dataset):
    """Dataset serving one split of the in-memory image/label dictionary.

    Args:
        data: mapping from split name to a dict with "images" and "labels"
            entries; only ``data[split]`` is kept.
        split: name of the split to serve ('train', 'eval' or 'test').
        im_size: side length handed to the Resize step of the transform.

    NOTE(review): ``self.transform`` is built in ``__init__`` but
    ``__getitem__`` does not apply it, so samples are returned without any
    resizing or augmentation.
    """

    def __init__(self, data, split, im_size=28):
        self.split = split
        self.data = data[self.split]

        if split == 'train':
            # Training pipeline: resize plus random flip/rotation augmentation
            self.transform = Compose([
                Resize((im_size, im_size)),
                RandomHorizontalFlip(p=0.5),
                RandomRotation(degrees=45),
                ToTensor(),
            ])
        elif split in ('eval', 'test'):
            # Evaluation and test share the same pipeline without augmentation
            self.transform = Compose([
                Resize((im_size, im_size)),
                ToTensor(),
            ])

    def __len__(self):
        """Return the number of images in this split."""
        return self.data["images"].shape[0]

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict with 'image' and 'label'.

        The image is copied, given a leading channel axis that is tiled three
        times, and cast to ``torch.float``; the label is a detached copy.
        """
        image = self.data["images"][index,:,:].clone().detach()
        # Replicate the single channel three times
        image = image.unsqueeze(0).tile([3,1,1]).to(dtype=torch.float)
        return {
            'image': image,
            'label': self.data["labels"][index].clone().detach()
        }

In [None]:
# Prepare dataset & dataloader

from torch.utils.data import DataLoader

# One dataset and one dataloader per split, keyed by split name
datasets, dataloaders = {}, {}
for split in ('train', 'eval', 'test'):
    split_dataset = ImageDataset(data=data, split=split)
    # Every split except 'test' is reshuffled by its loader
    reshuffle = split != 'test'

    datasets[split] = split_dataset
    dataloaders[split] = DataLoader(
        split_dataset,
        batch_size=32,
        shuffle=reshuffle,
        num_workers=4,
        pin_memory=False,  # Was True before.
    )

In [None]:
# Define model class

from torch import nn
from torchvision import models


class Net(nn.Module):
    """ResNet-18 based classifier with a replaced final layer.

    The stock ``fc`` layer is swapped for
    Dropout(0.2) -> Linear(in_features, 128) -> Linear(128, n_classes),
    and the network output is passed through a sigmoid.

    Args:
        n_classes: number of outputs of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        backbone = models.resnet18(pretrained=True, progress=True)

        # Read the input width of the stock classifier before replacing it
        feature_dim = backbone.fc.in_features
        head_layers = [
            nn.Dropout(p=0.2),
            nn.Linear(in_features=feature_dim, out_features=128),
            nn.Linear(in_features=128, out_features=n_classes),
        ]
        backbone.fc = nn.Sequential(*head_layers)

        self.base_model = backbone
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Run the backbone on ``x`` and squash its output with a sigmoid."""
        raw_scores = self.base_model(x)
        return self.sigm(raw_scores)

In [None]:
# Define Trainer class

import time
import tqdm
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
class Trainer():
    """Groups the training loop used by this notebook."""

    @staticmethod
    def train_model(model, dataloaders, objective, optimizer, device, run, num_epochs=25):
        """Train ``model`` and return it loaded with its best evaluated weights.

        Every epoch runs one pass over ``dataloaders['train']`` (with
        backpropagation) followed by one pass over ``dataloaders['eval']``
        (gradients disabled). The per-sample mean loss of each pass is printed
        and logged to TensorBoard under ``runs/mnist/<run>/``. The weights
        that reached the lowest eval loss are loaded back into ``model``.

        Args:
            model: module called as ``model(inputs)``; the size of its last
                output dimension is used as the number of one-hot classes.
            dataloaders: mapping with 'train' and 'eval' entries, each
                yielding dict batches with 'image' and 'label' tensors.
            objective: loss called as ``objective(outputs, one_hot_labels)``.
                Assumed to return the mean over the batch (as the default
                reduction of the torch losses does) - TODO confirm for
                custom objectives.
            optimizer: optimizer updating the parameters of ``model``.
            device: device every batch is moved to.
            run: name of the TensorBoard run sub-directory.
            num_epochs: number of train/eval epochs.

        Returns:
            The same ``model`` object, loaded with the best weights.
        """
        import copy  # function-scope import: only used to snapshot weights

        # Prepare
        since = time.time()
        # state_dict() hands out references to the live parameters, so the
        # snapshot must be deep-copied or it would follow later updates.
        best_model_wts = copy.deepcopy(model.state_dict())
        best_loss = float('inf')

        # Loop
        writer = SummaryWriter('runs/mnist/'+run+"/")
        try:
            for epoch in range(num_epochs):

                # Progress
                print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
                print('-' * 10)

                for split in ['train', 'eval']:
                    is_train = split == 'train'
                    model.train(is_train)
                    total_loss = 0.0
                    num_samples = 0

                    # Context manager form: the previous grad mode is restored
                    # afterwards instead of staying disabled after training.
                    with torch.set_grad_enabled(is_train):
                        for data in tqdm.tqdm(dataloaders[split], colour='#ffffff'):

                            # Put batch on device
                            inputs = data['image'].to(device)
                            labels = data['label'].to(device, dtype=torch.int64)

                            # Forward and calc loss
                            if is_train:
                                optimizer.zero_grad()
                            outputs = model(inputs)
                            targets = F.one_hot(labels, num_classes=outputs.shape[-1]).to(dtype=torch.float)
                            loss = objective(outputs, targets)

                            # Backprop if train
                            if is_train:
                                loss.backward()
                                optimizer.step()

                            # Weight the batch-mean loss by the batch size so
                            # that the epoch loss is a true per-sample mean.
                            batch_size = inputs.size(0)
                            total_loss += loss.item() * batch_size
                            num_samples += batch_size

                    # Record (max() guards against an empty dataloader)
                    epoch_loss = total_loss / max(num_samples, 1)
                    print('{} Loss: {:.4f}'.format(split, epoch_loss))
                    writer.add_scalar('Loss/'+split, epoch_loss, epoch+1)

                    # Deep copy if best loss
                    if split == 'eval' and epoch_loss < best_loss:
                        best_loss = epoch_loss
                        best_model_wts = copy.deepcopy(model.state_dict())
        finally:
            # Flush and release the event file even if training is interrupted
            writer.close()

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best Validation Loss: {:.4f}'.format(best_loss))

        # load best model weights
        model.load_state_dict(best_model_wts)
        return model

In [None]:
# Build the classifier and move it to the selected device
model = Net(n_classes=10)
model = model.to(device)

# Binary cross-entropy loss, optimised with Adam
objective = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Run the training loop; it returns the model loaded with its best weights
trained_model = Trainer.train_model(
    model, dataloaders, objective, optimizer, device,
    run="name_of_run",
    num_epochs=25,
)


In [None]:
import os
# Save the model weights; create the target directory first so that
# torch.save does not fail when it is missing.
weights_path = "/home/arvid/models/mnist/model_weights"
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [None]:
%pip install tensorboard

In [None]:
%load_ext tensorboard
%tensorboard --logdir /home/arvid/code/mnist_test/runs/can_specify/