In [1]:
import os
# Working directory for the notebook. The default is the original EC2 checkout;
# set ERA_V3_PROJECT_DIR to run the notebook from a different location without
# editing this cell (relative paths such as '../data' are resolved from here).
PROJECT_DIR = os.environ.get(
    'ERA_V3_PROJECT_DIR', '/home/ubuntu/era_v3/era_v3_session7_assignment/'
)
os.chdir(PROJECT_DIR)
os.getcwd()


'/home/ubuntu/era_v3/era_v3_session7_assignment'

In [2]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
import matplotlib.pyplot as plt
from torchsummary import summary
import os

In [3]:
from torchvision import datasets, transforms

### Utilities

In [4]:
from tqdm import tqdm
import torch.nn.functional as F

def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and report the mean loss and accuracy.

    Args:
        model: Network whose forward pass returns per-class log-probabilities
            (the loss used here is ``F.nll_loss``).
        device: Device that each batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the printed summary.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-sample mean loss and the
        accuracy in percent, both computed over the samples actually seen
        during this epoch.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    pbar = tqdm(train_loader)

    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)

        # Forward loss, backward pass and parameter update
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        batch_size = target.size(0)

        # nll_loss returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average when the last batch is smaller.
        train_loss += batch_loss * batch_size

        # Compute accuracy
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        total += batch_size

        # Update progress bar
        pbar.set_description(desc=f"loss={batch_loss:.4f} batch_id={batch_idx}")

    if total == 0:
        raise ValueError("train_loader yielded no samples; cannot compute epoch metrics")

    # Average over the samples actually processed rather than
    # len(train_loader.dataset): the two differ when the loader uses
    # drop_last=True or a sampler that visits only part of the dataset.
    avg_loss = train_loss / total
    accuracy = 100.0 * correct / total

    print(f"\nEpoch {epoch}: Train set: Average loss: {avg_loss:.4f}, Accuracy: {correct}/{total} ({accuracy:.2f}%)\n")
    return avg_loss, accuracy


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            
            # Compute loss
            test_loss += F.nll_loss(output, target, reduction="sum").item()
            
            # Compute accuracy
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    # Compute average loss and accuracy for the test set
    test_loss /= len(test_loader.dataset)
    accuracy = 100.0 * correct / len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), accuracy
        )
    )
    return test_loss, accuracy

In [5]:
## Select the compute device: the GPU when CUDA is usable, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

cuda


In [6]:
## Report the CPU count seen by this kernel (a useful upper bound when choosing DataLoader `num_workers`)
import multiprocessing
multiprocessing.cpu_count()

4

### Loading and Transforming Data

In [7]:
# Seed the RNGs so weight initialisation and augmentation are repeatable.
torch.manual_seed(42)
# Use the boolean flag: `device` is a torch.device object, so comparing it
# against the string 'cuda' is not a reliable way to detect the GPU.
if use_cuda:
    torch.cuda.manual_seed(42)
batch_size = 1024
# Never request more loader workers than there are CPUs; extra workers only
# contend for the same cores (and PyTorch warns about it).
num_workers = min(8, os.cpu_count() or 1)
kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}

# NOTE(review): RandomAffine runs after Normalize, so the corners exposed by
# the rotation are filled with 0 in *normalised* space, which is not the
# normalised background value. Consider moving the augmentation before
# Normalize; left unchanged here to keep the recorded results comparable.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
    transforms.RandomAffine(
        degrees=15,  # Random rotation of up to +/-15 degrees
    ),
    transforms.RandomErasing(p=0.1),
])
# Evaluation data is only converted and normalised, never augmented.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=train_transform),
    batch_size=batch_size, shuffle=True, **kwargs)
# Shuffling the evaluation set has no effect on the aggregated metrics, so
# keep a fixed order for deterministic iteration.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=test_transform),
    batch_size=batch_size, shuffle=False, **kwargs)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 134MB/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 12.2MB/s]

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 125MB/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.21MB/s]


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw





### Final Model

In [8]:
class MiniCNN_3(nn.Module):
    """Small CNN classifier for 1x28x28 images with 10 output classes.

    Layout: two blocks of three 3x3 conv -> BatchNorm -> ReLU stages (each
    block followed by dropout), each block followed by 2x2 max-pooling and a
    1x1 "transition" convolution that squeezes the channels down to 4, then a
    single linear layer on the flattened 4x7x7 feature map.

    Inline comments use the notation ``size,channels_in>size,channels_out``
    with ``RF`` = receptive field and ``J`` = jump (cumulative stride).
    """

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return [3x3 same-padded bias-free conv, BatchNorm, ReLU] as a list."""
        return [
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                padding=1,
                bias=False,  # BatchNorm right after makes a conv bias redundant
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    def __init__(self):
        super().__init__()
        # Layer order inside each Sequential is unchanged, so state_dict keys
        # (block_1.0.weight, ...) stay compatible with existing checkpoints.
        self.block_1 = nn.Sequential(
            *self._conv_bn_relu(1, 8),   # 28,1>28,8|RF:3,J:1
            *self._conv_bn_relu(8, 8),   # 28,8>28,8|RF:5,J:1
            *self._conv_bn_relu(8, 16),  # 28,8>28,16|RF:7,J:1
            nn.Dropout(0.15),
        )
        self.pool_1 = nn.MaxPool2d(2, 2)  # 28,16>14,16|RF:8,J:2
        self.transition_1 = nn.Conv2d(
            in_channels=16, out_channels=4, kernel_size=(1, 1), padding=0, bias=False
        )  # 14,16>14,4|RF:8,J:2
        self.block_2 = nn.Sequential(
            *self._conv_bn_relu(4, 8),    # 14,4>14,8|RF:12,J:2
            *self._conv_bn_relu(8, 16),   # 14,8>14,16|RF:16,J:2
            *self._conv_bn_relu(16, 16),  # 14,16>14,16|RF:20,J:2
            nn.Dropout(0.1),
        )
        self.pool_2 = nn.MaxPool2d(2, 2)  # 14,16>7,16|RF:22,J:4
        self.transition_2 = nn.Conv2d(
            in_channels=16, out_channels=4, kernel_size=(1, 1), padding=0, bias=False
        )  # 7,16>7,4|RF:22,J:4
        self.fc = nn.Linear(4 * 7 * 7, 10)  # 4*7*7>10

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``.

        Args:
            x: Input batch of shape ``(batch, 1, 28, 28)``.

        Raises:
            RuntimeError: If the input's spatial size does not reduce to a
                4x7x7 feature map (raised by the final linear layer).
        """
        x = self.block_1(x)
        x = self.pool_1(x)
        x = self.transition_1(x)
        x = self.block_2(x)
        x = self.pool_2(x)
        x = self.transition_2(x)
        # Flatten per sample. `view(-1, 4*7*7)` would silently fold a wrong
        # spatial size into the batch dimension instead of failing.
        x = x.flatten(1)
        x = self.fc(x)
        return F.log_softmax(x, dim=-1)

In [14]:
# Build the network, move it onto the selected device, then print a
# layer-by-layer summary for a single 1x28x28 input.
model = MiniCNN_3()
model = model.to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              72
       BatchNorm2d-2            [-1, 8, 28, 28]              16
              ReLU-3            [-1, 8, 28, 28]               0
            Conv2d-4            [-1, 8, 28, 28]             576
       BatchNorm2d-5            [-1, 8, 28, 28]              16
              ReLU-6            [-1, 8, 28, 28]               0
            Conv2d-7           [-1, 16, 28, 28]           1,152
       BatchNorm2d-8           [-1, 16, 28, 28]              32
              ReLU-9           [-1, 16, 28, 28]               0
          Dropout-10           [-1, 16, 28, 28]               0
        MaxPool2d-11           [-1, 16, 14, 14]               0
           Conv2d-12            [-1, 4, 14, 14]              64
           Conv2d-13            [-1, 8, 14, 14]             288
      BatchNorm2d-14            [-1, 8,

In [10]:
# Training hyper-parameters, named so they are easy to find and tune.
LEARNING_RATE = 0.01
NUM_EPOCHS = 15

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cut the LR by 10x when the monitored accuracy has not improved for more
# than `patience` epochs. `verbose=True` was dropped: the argument is
# deprecated in current PyTorch, and the LR is already printed every epoch
# through get_last_lr() below.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=0.1,
    patience=1,
    min_lr=1e-6,
)

# Per-epoch metrics, kept so they can be inspected or plotted afterwards.
history = []

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"********* Epoch = {epoch} *********")
    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch)
    test_loss, acc = test(model, device, test_loader)
    # NOTE(review): the schedule is driven by test-set accuracy, so the test
    # set influences training; a held-out validation split would be cleaner.
    scheduler.step(acc)
    print("LR = ", scheduler.get_last_lr())
    history.append(
        {
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "test_loss": test_loss,
            "test_acc": acc,
        }
    )



********* Epoch = 1 *********


loss=0.1870 batch_id=58: 100%|██████████| 59/59 [00:30<00:00,  1.96it/s]


Epoch 1: Train set: Average loss: 0.3856, Accuracy: 52828/60000 (88.05%)







Test set: Average loss: 0.0768, Accuracy: 9765/10000 (97.65%)

LR =  [0.01]
********* Epoch = 2 *********


loss=0.0896 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.43it/s]


Epoch 2: Train set: Average loss: 0.1262, Accuracy: 57600/60000 (96.00%)







Test set: Average loss: 0.0573, Accuracy: 9831/10000 (98.31%)

LR =  [0.01]
********* Epoch = 3 *********


loss=0.0878 batch_id=58: 100%|██████████| 59/59 [00:12<00:00,  4.54it/s]


Epoch 3: Train set: Average loss: 0.1006, Accuracy: 58078/60000 (96.80%)







Test set: Average loss: 0.0559, Accuracy: 9826/10000 (98.26%)

LR =  [0.01]
********* Epoch = 4 *********


loss=0.1152 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.53it/s]


Epoch 4: Train set: Average loss: 0.0923, Accuracy: 58166/60000 (96.94%)







Test set: Average loss: 0.0454, Accuracy: 9865/10000 (98.65%)

LR =  [0.01]
********* Epoch = 5 *********


loss=0.0687 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.47it/s]


Epoch 5: Train set: Average loss: 0.0844, Accuracy: 58383/60000 (97.31%)







Test set: Average loss: 0.0388, Accuracy: 9867/10000 (98.67%)

LR =  [0.01]
********* Epoch = 6 *********


loss=0.0731 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.53it/s]


Epoch 6: Train set: Average loss: 0.0789, Accuracy: 58462/60000 (97.44%)







Test set: Average loss: 0.0382, Accuracy: 9880/10000 (98.80%)

LR =  [0.01]
********* Epoch = 7 *********


loss=0.0570 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.50it/s]


Epoch 7: Train set: Average loss: 0.0749, Accuracy: 58505/60000 (97.51%)







Test set: Average loss: 0.0327, Accuracy: 9891/10000 (98.91%)

LR =  [0.01]
********* Epoch = 8 *********


loss=0.0520 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.42it/s]


Epoch 8: Train set: Average loss: 0.0715, Accuracy: 58588/60000 (97.65%)







Test set: Average loss: 0.0270, Accuracy: 9917/10000 (99.17%)

LR =  [0.01]
********* Epoch = 9 *********


loss=0.0654 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.27it/s]


Epoch 9: Train set: Average loss: 0.0678, Accuracy: 58641/60000 (97.73%)







Test set: Average loss: 0.0259, Accuracy: 9917/10000 (99.17%)

LR =  [0.01]
********* Epoch = 10 *********


loss=0.0674 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.41it/s]


Epoch 10: Train set: Average loss: 0.0676, Accuracy: 58660/60000 (97.77%)







Test set: Average loss: 0.0289, Accuracy: 9899/10000 (98.99%)

LR =  [0.001]
********* Epoch = 11 *********


loss=0.0554 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.40it/s]


Epoch 11: Train set: Average loss: 0.0552, Accuracy: 58913/60000 (98.19%)







Test set: Average loss: 0.0219, Accuracy: 9936/10000 (99.36%)

LR =  [0.001]
********* Epoch = 12 *********


loss=0.0657 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.22it/s]


Epoch 12: Train set: Average loss: 0.0517, Accuracy: 58984/60000 (98.31%)







Test set: Average loss: 0.0206, Accuracy: 9933/10000 (99.33%)

LR =  [0.001]
********* Epoch = 13 *********


loss=0.0367 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.45it/s]


Epoch 13: Train set: Average loss: 0.0508, Accuracy: 58994/60000 (98.32%)







Test set: Average loss: 0.0197, Accuracy: 9942/10000 (99.42%)

LR =  [0.001]
********* Epoch = 14 *********


loss=0.0445 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.51it/s]


Epoch 14: Train set: Average loss: 0.0484, Accuracy: 59031/60000 (98.39%)







Test set: Average loss: 0.0194, Accuracy: 9939/10000 (99.39%)

LR =  [0.001]
********* Epoch = 15 *********


loss=0.0502 batch_id=58: 100%|██████████| 59/59 [00:13<00:00,  4.51it/s]


Epoch 15: Train set: Average loss: 0.0474, Accuracy: 59047/60000 (98.41%)







Test set: Average loss: 0.0197, Accuracy: 9939/10000 (99.39%)

LR =  [0.0001]


### EC2 details

In [16]:
# Record the GPU model, driver/CUDA versions and current memory usage of the host.
!nvidia-smi

Fri Dec 13 11:34:10 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       On  |   00000000:00:1E.0 Off |                    0 |
| N/A   30C    P0             31W /   70W |     773MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [19]:
# Dump the BIOS (DMI type 0) record to identify the host platform; needs sudo on the machine.
!sudo dmidecode -t bios

# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 2.7 present.

Handle 0x0000, DMI type 0, 24 bytes
BIOS Information
	Vendor: Amazon EC2
	Version: 1.0
	Release Date: 10/16/2017
	Address: 0xF0000
	Runtime Size: 64 kB
	ROM Size: 64 kB
	Characteristics:
		PCI is supported
		EDD is supported
		ACPI is supported
		System is a virtual machine
	BIOS Revision: 1.0

