In [14]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
import matplotlib.pyplot as plt
from torchsummary import summary
import os

In [None]:
# os.getcwd()
# os.chdir("/Users/aditya/Documents/self_learning/ERA V3/week 7/micro_cnn")

'/Users/aditya/Documents/self_learning/ERA V3/week 7/micro_cnn'

### Functions & utilities

In [7]:
from src.utils import plot_random_mnist_images

In [8]:
from tqdm import tqdm
import torch.nn.functional as F

def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and report the epoch-level loss and accuracy.

    Args:
        model: Network to train. Its output is passed to ``F.nll_loss``, so it
            is expected to produce per-class log-probabilities.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer whose gradients are zeroed and which is stepped
            once per batch.
        epoch: Epoch number; used only in the printed summary line.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean
        per-sample loss and ``accuracy`` is a percentage, both computed over
        the samples actually processed in this epoch.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no samples.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    pbar = tqdm(train_loader)

    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        n_batch = target.size(0)

        # Forward pass, loss, backward pass and parameter update.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running total and
        # the progress bar instead of calling .item() twice per batch.
        batch_loss = loss.item()
        # nll_loss returns the batch mean, so weight it by the batch size to
        # accumulate a per-sample sum (the last batch may be smaller).
        train_loss += batch_loss * n_batch

        # Accuracy uses the predictions from the forward pass above, i.e. the
        # weights as they were before this batch's optimizer step.
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        total += n_batch

        # Update progress bar
        pbar.set_description(desc=f"loss={batch_loss:.4f} batch_id={batch_idx}")

    # Normalise by the number of samples actually seen so that loss and
    # accuracy share a denominator even when the loader drops the last batch
    # or samples only part of the dataset.
    avg_loss = train_loss / total
    accuracy = 100.0 * correct / total

    print(f"\nEpoch {epoch}: Train set: Average loss: {avg_loss:.4f}, Accuracy: {correct}/{total} ({accuracy:.2f}%)\n")
    return avg_loss, accuracy


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            
            # Compute loss
            test_loss += F.nll_loss(output, target, reduction="sum").item()
            
            # Compute accuracy
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    # Compute average loss and accuracy for the test set
    test_loss /= len(test_loader.dataset)
    accuracy = 100.0 * correct / len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), accuracy
        )
    )
    return test_loss, accuracy

In [None]:
## Pick the compute device: a CUDA GPU when one is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [None]:
## Checking number of cores available
# NOTE(review): presumably used to pick the DataLoader `num_workers` value
# in the data-loading cell - confirm.
import multiprocessing
multiprocessing.cpu_count()

8

### Loading MNIST data

In [11]:
# Seed the random number generators so runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
# `device` is a torch.device, which does not compare equal to the plain string
# 'cuda'; check its `.type` so the CUDA generator is actually seeded on GPU runs.
if device.type == 'cuda':
    torch.cuda.manual_seed(SEED)

batch_size = 512
# Worker processes and pinned host memory are only requested when running on a GPU.
kwargs = {'num_workers': 8, 'pin_memory': True} if use_cuda else {}

# One preprocessing pipeline shared by both splits: convert to a tensor, then
# normalise with the mean/std constants (0.1307, 0.3081).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=mnist_transform),
    batch_size=batch_size, shuffle=True, **kwargs)

# Evaluation metrics do not depend on batch order, so the test split is not shuffled.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=mnist_transform),
    batch_size=batch_size, shuffle=False, **kwargs)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:37<00:00, 264670.41it/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 111024.78it/s]


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:18<00:00, 86875.94it/s] 


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 1006845.77it/s]

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw






### Creating Base Model
- The aim of this step is to first create a base model that adheres to the constraint of fewer than 8k parameters
- No major data transformation is used except for normalization
- We will closely monitor the train and test accuracy and loss to understand the model performance
- In assignment 6 we observed one key point about the **LR scheduler**: it significantly **improves the training loss convergence**. We will use the same approach here

In [13]:
from src.model import MiniCNN_1

In [15]:
# Instantiate the base model on the selected device and print a layer-by-layer
# summary (output shapes and parameter counts) for a single 1x28x28 input.
model = MiniCNN_1().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              72
              ReLU-2            [-1, 8, 28, 28]               0
            Conv2d-3            [-1, 8, 28, 28]             576
              ReLU-4            [-1, 8, 28, 28]               0
            Conv2d-5            [-1, 8, 28, 28]             576
              ReLU-6            [-1, 8, 28, 28]               0
         MaxPool2d-7            [-1, 8, 14, 14]               0
            Conv2d-8            [-1, 4, 14, 14]              32
            Conv2d-9            [-1, 8, 14, 14]             288
             ReLU-10            [-1, 8, 14, 14]               0
           Conv2d-11           [-1, 16, 14, 14]           1,152
             ReLU-12           [-1, 16, 14, 14]               0
           Conv2d-13           [-1, 16, 14, 14]           2,304
             ReLU-14           [-1, 16,

#### Layerwise Details

| Layer | Input Size | Output Size | Receptive Field | Jump In | Jump Out |
|-------|------------|-------------|-----------------|----------|-----------|
| Conv1 | 28x28x1 | 28x28x8 | 3 | 1 | 1 |
| Conv2 | 28x28x8 | 28x28x8 | 5 | 1 | 1 |
| Conv3 | 28x28x8 | 28x28x8 | 7 | 1 | 1 |
| MaxPool1 | 28x28x8 | 14x14x8 | 8 | 1 | 2 |
| Transition-Conv1 | 14x14x8 | 14x14x4 | 8 | 2 | 2 |
| Conv4 | 14x14x4 | 14x14x8 | 12 | 2 | 2 |
| Conv5 | 14x14x8 | 14x14x16 | 16 | 2 | 2 |
| Conv6 | 14x14x16 | 14x14x16 | 20 | 2 | 2 |
| MaxPool2 | 14x14x16 | 7x7x16 | 22 | 2 | 4 |
| Transition-Conv2 | 7x7x16 | 7x7x3 | 22 | 4 | 4 |
| FC | 7x7x3 | 10 | - | - | - |


In [16]:
# Number of passes over the training set.
EPOCHS = 15

optimizer = optim.Adam(model.parameters(), lr=0.01)
# mode="max" because the scheduler is stepped with an accuracy (higher is
# better): the learning rate is multiplied by `factor` once the metric has not
# improved for more than `patience` consecutive epochs, down to `min_lr`.
# The deprecated `verbose` argument is not passed; the current learning rate
# is printed explicitly at the end of every epoch instead.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=0.1,
    patience=2,
    min_lr=1e-6,
)

# Per-epoch metrics, kept so the curves can be inspected or plotted afterwards.
history = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": [], "lr": []}

for epoch in range(1, EPOCHS + 1):
    print(f"********* Epoch = {epoch} *********")
    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch)
    test_loss, acc = test(model, device, test_loader)

    # NOTE(review): the scheduler is driven by test-set accuracy, so the test
    # set influences training; consider a separate validation split.
    scheduler.step(acc)

    current_lr = optimizer.param_groups[0]["lr"]
    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["test_loss"].append(test_loss)
    history["test_acc"].append(acc)
    history["lr"].append(current_lr)
    print(f"Learning rate after epoch {epoch}: {current_lr:g}")



********* Epoch = 1 *********


loss=0.1123 batch_id=117: 100%|██████████| 118/118 [00:21<00:00,  5.40it/s]



Epoch 1: Train set: Average loss: 0.4793, Accuracy: 50720/60000 (84.53%)


Test set: Average loss: 0.0884, Accuracy: 9699/10000 (96.99%)

********* Epoch = 2 *********


loss=0.0897 batch_id=117: 100%|██████████| 118/118 [00:21<00:00,  5.40it/s]



Epoch 2: Train set: Average loss: 0.0841, Accuracy: 58471/60000 (97.45%)


Test set: Average loss: 0.0583, Accuracy: 9810/10000 (98.10%)

********* Epoch = 3 *********


loss=0.0231 batch_id=117: 100%|██████████| 118/118 [00:21<00:00,  5.37it/s]



Epoch 3: Train set: Average loss: 0.0652, Accuracy: 58801/60000 (98.00%)


Test set: Average loss: 0.0518, Accuracy: 9839/10000 (98.39%)

********* Epoch = 4 *********


loss=0.1406 batch_id=117: 100%|██████████| 118/118 [00:23<00:00,  5.10it/s]



Epoch 4: Train set: Average loss: 0.0543, Accuracy: 59012/60000 (98.35%)


Test set: Average loss: 0.0370, Accuracy: 9871/10000 (98.71%)

********* Epoch = 5 *********


loss=0.0267 batch_id=117: 100%|██████████| 118/118 [00:23<00:00,  5.08it/s]



Epoch 5: Train set: Average loss: 0.0457, Accuracy: 59178/60000 (98.63%)


Test set: Average loss: 0.0376, Accuracy: 9867/10000 (98.67%)

********* Epoch = 6 *********


loss=0.0315 batch_id=117: 100%|██████████| 118/118 [00:23<00:00,  5.00it/s]



Epoch 6: Train set: Average loss: 0.0402, Accuracy: 59227/60000 (98.71%)


Test set: Average loss: 0.0568, Accuracy: 9817/10000 (98.17%)

********* Epoch = 7 *********


loss=0.0690 batch_id=117: 100%|██████████| 118/118 [00:23<00:00,  4.95it/s]



Epoch 7: Train set: Average loss: 0.0363, Accuracy: 59305/60000 (98.84%)


Test set: Average loss: 0.0427, Accuracy: 9874/10000 (98.74%)

********* Epoch = 8 *********


loss=0.0153 batch_id=117: 100%|██████████| 118/118 [00:23<00:00,  4.93it/s]



Epoch 8: Train set: Average loss: 0.0367, Accuracy: 59333/60000 (98.89%)


Test set: Average loss: 0.0384, Accuracy: 9873/10000 (98.73%)

********* Epoch = 9 *********


loss=0.0154 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.85it/s]



Epoch 9: Train set: Average loss: 0.0337, Accuracy: 59375/60000 (98.96%)


Test set: Average loss: 0.0444, Accuracy: 9858/10000 (98.58%)

********* Epoch = 10 *********


loss=0.0037 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.85it/s]



Epoch 10: Train set: Average loss: 0.0291, Accuracy: 59439/60000 (99.06%)


Test set: Average loss: 0.0424, Accuracy: 9875/10000 (98.75%)

********* Epoch = 11 *********


loss=0.0116 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.84it/s]



Epoch 11: Train set: Average loss: 0.0279, Accuracy: 59461/60000 (99.10%)


Test set: Average loss: 0.0413, Accuracy: 9877/10000 (98.77%)

********* Epoch = 12 *********


loss=0.0253 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.80it/s]



Epoch 12: Train set: Average loss: 0.0291, Accuracy: 59444/60000 (99.07%)


Test set: Average loss: 0.0332, Accuracy: 9906/10000 (99.06%)

********* Epoch = 13 *********


loss=0.0024 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.80it/s]



Epoch 13: Train set: Average loss: 0.0253, Accuracy: 59521/60000 (99.20%)


Test set: Average loss: 0.0413, Accuracy: 9874/10000 (98.74%)

********* Epoch = 14 *********


loss=0.0757 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.79it/s]



Epoch 14: Train set: Average loss: 0.0274, Accuracy: 59452/60000 (99.09%)


Test set: Average loss: 0.0491, Accuracy: 9856/10000 (98.56%)

********* Epoch = 15 *********


loss=0.0372 batch_id=117: 100%|██████████| 118/118 [00:24<00:00,  4.80it/s]



Epoch 15: Train set: Average loss: 0.0281, Accuracy: 59467/60000 (99.11%)


Test set: Average loss: 0.0402, Accuracy: 9884/10000 (98.84%)



In [17]:
# Show the learning rate currently held by the scheduler after training
# (the recorded output is a list containing a single value).
scheduler.get_last_lr()

[0.001]

#### Observations
- Max Train accuracy: 99.20% (Epoch 13)
- Max Test accuracy: 99.06% (Epoch 12)
- The model reached a maximum accuracy of 99.20% on the training data, but it is still a long way behind the 99.4% mark
- There is a significant gap between train and test accuracy
