In [65]:
!nvidia-smi

Sat Aug  8 18:54:57 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    36W / 250W |    903MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time

In [59]:
class LeNet(nn.Module):
  
  def __init__(self, classes):
    super(LeNet, self).__init__()

    self.extract = nn.Sequential(
        nn.Conv2d(in_channels=3, out_channels=20, kernel_size=5, stride=1), #LeNet-5 used 6 filters, I've used 20 filters
        nn.BatchNorm2d(20),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Dropout(p = 0.2),
        nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, stride=1), #LeNet-5 used 16 filters, I've used 50 filters
        nn.BatchNorm2d(50),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Dropout(p = 0.2),        
    )

    self.classify = nn.Sequential(
        nn.Linear(in_features=1250, out_features=500), #I've used 500 FC channels instead of 120, expect CIFAR to contain more distinct features than MNIST
        nn.BatchNorm1d(500),
        nn.ReLU(),
        nn.Linear(in_features=500, out_features=classes)
    )

  def forward(self, x):
    x = self.extract(x)
    x = torch.flatten(x, 1)
    logits = self.classify(x)
    return logits

In [60]:
train_transform = transforms.Compose([ transforms.RandomCrop(32, padding=4),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.4247, 0.4150, 0.3840], [0.2827, 0.2777, 0.2844])]) #Calculated using https://github.com/Armour/pytorch-nn-practice/blob/master/utils/meanstd.py
test_transform = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize([0.4942, 0.4851, 0.4504], [0.2467, 0.2429, 0.2616])]) 

trainset = torchvision.datasets.CIFAR10(root='/content/data', train=True, download=True, transform=train_transform)
testset = torchvision.datasets.CIFAR10(root='/content/data', train=False, download=True, transform=test_transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=16)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=True, num_workers=16)

dataset_sizes = {'train':len(trainset), 'val':len(testset)}
dataloaders = {'train':trainloader, 'val':testloader}

class_names = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Files already downloaded and verified
Files already downloaded and verified


In [56]:
def online_mean_and_sd(loader):

  """Compute the mean and sd in an online fashion

        Var[x] = E[X^2] - E^2[X]
    """
  cnt = 0
  fst_moment = torch.empty(3)
  snd_moment = torch.empty(3)

  for images, _ in loader:

      b, c, h, w = images.shape
      nb_pixels = b * h * w
      sum_ = torch.sum(images, dim=[0, 2, 3])
      sum_of_square = torch.sum(images ** 2, dim=[0, 2, 3])
      fst_moment = (cnt * fst_moment + sum_) / (cnt + nb_pixels)
      snd_moment = (cnt * snd_moment + sum_of_square) / (cnt + nb_pixels)

      cnt += nb_pixels

  return fst_moment, torch.sqrt(snd_moment - fst_moment ** 2)

food101_mean, food101_std = online_mean_and_sd(dataloaders["train"])
print(food101_mean, food101_std)

tensor([-0.0006, -0.0007, -0.0007]) tensor([1.0002, 1.0003, 0.9999])


In [39]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):

    for epoch in range(num_epochs):
        since = time.time()
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and test phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to test mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            
            time_elapsed = time.time() - since
            print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))            
            
            torch.save(model.state_dict(), '/content/CIFAR10_LeNet_Epoch_' + str(epoch+1) + '.pth')

        print()
    return model

In [63]:
model = LeNet(10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.15, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [None]:
prev_state = torch.load('/content/CIFAR10_LeNet_Epoch_35.pth')
model.load_state_dict(prev_state)

In [64]:
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=50)

Epoch 1/50
----------
train Loss: 1.7390 Acc: 0.3816
Training complete in 0m 7s
val Loss: 1.4100 Acc: 0.4894
Training complete in 0m 9s

Epoch 2/50
----------
train Loss: 1.4184 Acc: 0.4870
Training complete in 0m 7s
val Loss: 1.2388 Acc: 0.5546
Training complete in 0m 9s

Epoch 3/50
----------
train Loss: 1.2859 Acc: 0.5368
Training complete in 0m 7s
val Loss: 1.1484 Acc: 0.5896
Training complete in 0m 9s

Epoch 4/50
----------
train Loss: 1.1920 Acc: 0.5766
Training complete in 0m 7s
val Loss: 1.0809 Acc: 0.6142
Training complete in 0m 9s

Epoch 5/50
----------
train Loss: 1.1252 Acc: 0.5993
Training complete in 0m 7s
val Loss: 1.0422 Acc: 0.6352
Training complete in 0m 9s

Epoch 6/50
----------
train Loss: 1.0672 Acc: 0.6204
Training complete in 0m 7s
val Loss: 0.9991 Acc: 0.6549
Training complete in 0m 9s

Epoch 7/50
----------
train Loss: 1.0249 Acc: 0.6373
Training complete in 0m 7s
val Loss: 1.0033 Acc: 0.6536
Training complete in 0m 9s

Epoch 8/50
----------
train Loss: 0.9931 