In [None]:
!nvidia-smi

Sat Aug  8 19:29:15 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    31W / 250W |    927MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time

In [51]:
class LeNet(nn.Module):
  
  def __init__(self, classes):
    super(LeNet, self).__init__()

    self.extract = nn.Sequential(
        nn.Conv2d(1, 10, 5), #LeNet-5 used 6 filters, I've used 10 filters
        nn.BatchNorm2d(10),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Dropout(p = 0.2),
        nn.Conv2d(10,20,5), #LeNet-5 used 16 filters, I've used 20 filters
        nn.BatchNorm2d(20),
        nn.ReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Dropout(p = 0.2),        
    )

    self.classify = nn.Sequential(
        nn.Linear(in_features=320, out_features=50), 
        #nn.BatchNorm1d(320),
        nn.ReLU(),
        nn.Linear(in_features=50, out_features=classes)
    )

  def forward(self, x):
    x = self.extract(x)
    x = torch.flatten(x, 1)
    logits = self.classify(x)
    return logits

In [46]:
train_transform=transforms.Compose([
           transforms.RandomHorizontalFlip(),                        
           transforms.ToTensor(),
           transforms.Normalize((0.1307,), (0.3081,))
           ])

test_transform=transforms.Compose([
           transforms.ToTensor(),
           transforms.Normalize((0.1307,), (0.3081,))
           ])

trainset = torchvision.datasets.MNIST(root='/content/data', train=True, download=True, transform=train_transform)
testset = torchvision.datasets.MNIST(root='/content/data', train=False, download=True, transform=test_transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=16)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=True, num_workers=16)

dataset_sizes = {'train':len(trainset), 'val':len(testset)}
dataloaders = {'train':trainloader, 'val':testloader}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [47]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):

    for epoch in range(num_epochs):
        since = time.time()
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and test phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to test mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            
            time_elapsed = time.time() - since
            print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))            
            
            torch.save(model.state_dict(), '/content/MNIST_LeNet_Epoch_' + str(epoch+1) + '.pth')

        print()
    return model

In [48]:
model = LeNet(10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.15, momentum=0.9, nesterov=True)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [49]:
# prev_state = torch.load('/content/CIFAR10_LeNet_Epoch_35.pth')
# model.load_state_dict(prev_state)

In [50]:
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=50)

Epoch 1/50
----------
train Loss: 0.3015 Acc: 0.9041
Training complete in 0m 5s
val Loss: 0.0988 Acc: 0.9683
Training complete in 0m 6s

Epoch 2/50
----------
train Loss: 0.1353 Acc: 0.9571
Training complete in 0m 5s
val Loss: 0.0705 Acc: 0.9774
Training complete in 0m 6s

Epoch 3/50
----------
train Loss: 0.1072 Acc: 0.9664
Training complete in 0m 5s
val Loss: 0.0578 Acc: 0.9803
Training complete in 0m 6s

Epoch 4/50
----------
train Loss: 0.0942 Acc: 0.9699
Training complete in 0m 5s
val Loss: 0.0460 Acc: 0.9852
Training complete in 0m 6s

Epoch 5/50
----------
train Loss: 0.0904 Acc: 0.9715
Training complete in 0m 5s
val Loss: 0.0490 Acc: 0.9841
Training complete in 0m 6s

Epoch 6/50
----------
train Loss: 0.0803 Acc: 0.9748
Training complete in 0m 5s
val Loss: 0.0462 Acc: 0.9848
Training complete in 0m 6s

Epoch 7/50
----------
train Loss: 0.0795 Acc: 0.9749
Training complete in 0m 5s
val Loss: 0.0424 Acc: 0.9862
Training complete in 0m 6s

Epoch 8/50
----------
train Loss: 0.0753 