- Traget: Added Regularization like Dropout to reduce overfitting.
- Results: 
    - Parameters : 7348
    - Best Train accuracy : 99.07
    - Best Test accuracy : 99.40
- Analysis:
    - Since dropout was added, the gap between the train and test accuracy is decreased when comapred to the previous batch_norm code.
    - This prevents overfitting of the model, as some of the neurons' output was set to null

# Import Libraries

In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import models
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### Calculating the mean and std dev. of the dataset

In [2]:
tensor_transforms = transforms.Compose([transforms.ToTensor()])

exp = datasets.MNIST('./data', train=True, download=True, transform=tensor_transforms)

In [3]:
exp_train_data = exp.train_data
exp_test_data = exp.test_data
print(exp_train_data.shape)
print(exp_test_data.shape)


torch.Size([60000, 28, 28])
torch.Size([60000, 28, 28])


In [4]:
exp_train_data = exp.transform(exp_train_data.numpy())
print('[Train]')
print(' - Numpy Shape:', exp.train_data.cpu().numpy().shape)
print(' - Tensor Shape:', exp.train_data.size())
print(' - min:', torch.min(exp_train_data))
print(' - max:', torch.max(exp_train_data))
print(' - mean:', torch.mean(exp_train_data))
print(' - std:', torch.std(exp_train_data))
print(' - var:', torch.var(exp_train_data))

[Train]
 - Numpy Shape: (60000, 28, 28)
 - Tensor Shape: torch.Size([60000, 28, 28])
 - min: tensor(0.)
 - max: tensor(1.)
 - mean: tensor(0.1307)
 - std: tensor(0.3081)
 - var: tensor(0.0949)


## Data Transformations

We first start with defining our data transformations. We need to think what our data is and how can we augment it to correct represent images which it might not see otherwise.


In [5]:
# Train transformations (No other augmentations are added)
train_transforms = transforms.Compose([
    transforms.ToTensor(),  # Convert to tensor first before erasing 
    transforms.Normalize((0.1307,), (0.3081,)) # Normalize it with mean and std dev of train_data. 
])

# Test transformations
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # use the train data's mean and std dev
])



# Dataset and Creating Train/Test Split

In [6]:
train = datasets.MNIST('./data', train=True, download=True, transform=train_transforms) # downloading the train data and applying train transforms
test = datasets.MNIST('./data', train=False, download=True, transform=test_transforms) # downloading the test data and applying test transforms

# Dataloader Arguments & Test/Train Dataloaders


In [7]:
SEED = 42

# To check if we are using GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# For reproducibility
torch.manual_seed(SEED)

if device == "cuda":
    torch.cuda.manual_seed(SEED)


dataloader_args = dict(shuffle=True, batch_size=128, num_workers=-1, pin_memory=True) if device == "cuda" else dict(shuffle=True, batch_size=64)

# train dataloader
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)

# test dataloader
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

Using device: cpu


# The model


In [8]:

# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         dropout_value = 0.05
        
#         # Input Block - RF: 3
#         self.convblock1 = nn.Sequential(
#             nn.Conv2d(1, 8, 3, padding=1, bias=False),  # RF: 3
#             nn.BatchNorm2d(8),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         # Conv Block 1 - RF: 5
#         self.convblock2 = nn.Sequential(
#             nn.Conv2d(8, 8, 3, padding=1, bias=False),  # RF: 5
#             nn.BatchNorm2d(8),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         # Transition Block 1 - RF: 6
#         self.pool1 = nn.MaxPool2d(2, 2)  # RF: 6
#         self.convblock3 = nn.Sequential(
#             nn.Conv2d(8, 12, 1, bias=False),  # RF: 6
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         # Conv Block 2 - RF: 14
#         self.convblock4 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 10
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 14
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         # Transition Block 2 - RF: 16
#         self.pool2 = nn.MaxPool2d(2, 2)  # RF: 16
#         self.convblock5 = nn.Sequential(
#             nn.Conv2d(12, 12, 1, bias=False),  # RF: 16
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         # Conv Block 3 - RF: 28
#         self.convblock6 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 20
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 24
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Dropout(dropout_value),
#             nn.Conv2d(12, 10, 3, padding=1, bias=False),  # RF: 28
#             nn.BatchNorm2d(10),
#             nn.ReLU(),
#             nn.Dropout(dropout_value)
#         )

#         self.gap = nn.AdaptiveAvgPool2d(1)

#     def forward(self, x):
#         x = self.convblock1(x)
#         x = self.convblock2(x)
#         x = self.pool1(x)
#         x = self.convblock3(x)
#         x = self.convblock4(x)
#         x = self.pool2(x)
#         x = self.convblock5(x)
#         x = self.convblock6(x)
#         x = self.gap(x)
#         x = x.view(-1, 10)
#         return F.log_softmax(x, dim=-1)
    

       



# Model Params


In [9]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
# model = Net().to(device)
model = models.Model_3_Dropout().to(device)
summary(model, input_size=(1, 28, 28))

cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              72
       BatchNorm2d-2            [-1, 8, 28, 28]              16
              ReLU-3            [-1, 8, 28, 28]               0
           Dropout-4            [-1, 8, 28, 28]               0
            Conv2d-5            [-1, 8, 28, 28]             576
       BatchNorm2d-6            [-1, 8, 28, 28]              16
              ReLU-7            [-1, 8, 28, 28]               0
           Dropout-8            [-1, 8, 28, 28]               0
         MaxPool2d-9            [-1, 8, 14, 14]               0
           Conv2d-10           [-1, 12, 14, 14]              96
      BatchNorm2d-11           [-1, 12, 14, 14]              24
             ReLU-12           [-1, 12, 14, 14]               0
          Dropout-13           [-1, 12, 14, 14]               0
           Conv2d-14           [-1,


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Training and Testing

Looking at logs can be boring, so we'll introduce **tqdm** progressbar to get cooler logs.

Let's write train and test functions

In [10]:
model = models.Model_3_Dropout().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4) # defining the optimizer with leraning rate of 0.01
# scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
#     optimizer,
#     T_0=4,  # Initial restart interval
#     T_mult=1,  # Multiplier for restart interval
#     eta_min=1e-6  # Minimum learning rate
# )

# Using the OneCycleLR scheduler for dynamic learning rate adjustment.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,  # Maximum learning rate during the cycle.
    epochs=15,  # Total number of epochs for training.
    steps_per_epoch=len(train_loader),  # Number of steps in one epoch (based on train loader size).
    pct_start=0.2,  # Percentage of the cycle for increasing the learning rate.
    div_factor=10,  # Factor by which the initial learning rate is divided from max_lr.
    final_div_factor=100,  # Factor by which the learning rate is reduced at the end of the cycle.
)



In [11]:
from tqdm import tqdm

train_losses = []
test_losses = []
train_acc = []
test_acc = []

def train(model, device, train_loader, optimizer, epoch):
  model.train()
  pbar = tqdm(train_loader)
  correct = 0
  processed = 0
  for batch_idx, (data, target) in enumerate(pbar):
    # get samples
    data, target = data.to(device), target.to(device)

    # Init
    optimizer.zero_grad()
    # In PyTorch, we need to set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
    # Because of this, when you start your training loop, ideally you should zero out the gradients so that you do the parameter update correctly.

    # Predict
    y_pred = model(data)

    # Calculate loss
    loss = F.nll_loss(y_pred, target)
    train_losses.append(loss)

    # Backpropagation
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Update pbar-tqdm

    pred = y_pred.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct += pred.eq(target.view_as(pred)).sum().item()
    processed += len(data)

    pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
    train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

In [12]:
EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, epoch)
    
    test(model, device, test_loader)
    # scheduler.step() # If in case of Cosine Annealing Warm, or based on other scheduler, scheduler.step.() has to uncommented. 
    

EPOCH: 0


Loss=0.1784517765045166 Batch_id=937 Accuracy=88.12: 100%|██████████| 938/938 [00:46<00:00, 20.16it/s] 



Test set: Average loss: 0.1593, Accuracy: 9719/10000 (97.19%)

EPOCH: 1


Loss=0.07454880326986313 Batch_id=937 Accuracy=96.39: 100%|██████████| 938/938 [00:47<00:00, 19.61it/s] 



Test set: Average loss: 0.0834, Accuracy: 9786/10000 (97.86%)

EPOCH: 2


Loss=0.11423830687999725 Batch_id=937 Accuracy=96.92: 100%|██████████| 938/938 [00:48<00:00, 19.52it/s] 



Test set: Average loss: 0.1005, Accuracy: 9699/10000 (96.99%)

EPOCH: 3


Loss=0.08282037824392319 Batch_id=937 Accuracy=97.32: 100%|██████████| 938/938 [00:50<00:00, 18.58it/s] 



Test set: Average loss: 0.0552, Accuracy: 9832/10000 (98.32%)

EPOCH: 4


Loss=0.08844122290611267 Batch_id=937 Accuracy=97.72: 100%|██████████| 938/938 [00:48<00:00, 19.28it/s] 



Test set: Average loss: 0.0410, Accuracy: 9860/10000 (98.60%)

EPOCH: 5


Loss=0.08420288562774658 Batch_id=937 Accuracy=97.86: 100%|██████████| 938/938 [00:47<00:00, 19.83it/s]  



Test set: Average loss: 0.0401, Accuracy: 9880/10000 (98.80%)

EPOCH: 6


Loss=0.1796770989894867 Batch_id=937 Accuracy=98.02: 100%|██████████| 938/938 [00:53<00:00, 17.44it/s]   



Test set: Average loss: 0.0309, Accuracy: 9904/10000 (99.04%)

EPOCH: 7


Loss=0.0014318001922219992 Batch_id=937 Accuracy=98.36: 100%|██████████| 938/938 [00:50<00:00, 18.48it/s]



Test set: Average loss: 0.0336, Accuracy: 9898/10000 (98.98%)

EPOCH: 8


Loss=0.09268324822187424 Batch_id=937 Accuracy=98.43: 100%|██████████| 938/938 [00:50<00:00, 18.50it/s]  



Test set: Average loss: 0.0247, Accuracy: 9923/10000 (99.23%)

EPOCH: 9


Loss=0.00857910793274641 Batch_id=937 Accuracy=98.62: 100%|██████████| 938/938 [00:47<00:00, 19.70it/s]  



Test set: Average loss: 0.0255, Accuracy: 9912/10000 (99.12%)

EPOCH: 10


Loss=0.004643022082746029 Batch_id=937 Accuracy=98.73: 100%|██████████| 938/938 [00:46<00:00, 20.01it/s] 



Test set: Average loss: 0.0242, Accuracy: 9924/10000 (99.24%)

EPOCH: 11


Loss=0.1825295388698578 Batch_id=937 Accuracy=98.86: 100%|██████████| 938/938 [00:46<00:00, 20.24it/s]   



Test set: Average loss: 0.0237, Accuracy: 9934/10000 (99.34%)

EPOCH: 12


Loss=0.004552979487925768 Batch_id=937 Accuracy=99.00: 100%|██████████| 938/938 [00:47<00:00, 19.78it/s] 



Test set: Average loss: 0.0185, Accuracy: 9938/10000 (99.38%)

EPOCH: 13


Loss=0.11577491462230682 Batch_id=937 Accuracy=99.07: 100%|██████████| 938/938 [00:51<00:00, 18.23it/s]  



Test set: Average loss: 0.0190, Accuracy: 9940/10000 (99.40%)

EPOCH: 14


Loss=0.005473337601870298 Batch_id=937 Accuracy=99.05: 100%|██████████| 938/938 [00:50<00:00, 18.40it/s] 



Test set: Average loss: 0.0204, Accuracy: 9935/10000 (99.35%)

