- Traget: Add Batch Normalization to increase the model's accuracy
- Results: 
    - Parameters : 7348
    - Best Train accuracy : 99.66
    - Best Test accuracy : 99.32
- Analysis: 
    - Efficiency of the model is increased
    - But the model is still overfitting

# Import Libraries

In [14]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
%matplotlib inline
import matplotlib.pyplot as plt
import models
import warnings
warnings.filterwarnings("ignore")

### Calculating the mean and std dev. of the dataset

In [15]:
tensor_transforms = transforms.Compose([transforms.ToTensor()])

exp = datasets.MNIST('./data', train=True, download=True, transform=tensor_transforms)

In [16]:
exp_train_data = exp.train_data
exp_test_data = exp.test_data
print(exp_train_data.shape)
print(exp_test_data.shape)


torch.Size([60000, 28, 28])
torch.Size([60000, 28, 28])


In [17]:
exp_train_data = exp.transform(exp_train_data.numpy())
print('[Train]')
print(' - Numpy Shape:', exp.train_data.cpu().numpy().shape)
print(' - Tensor Shape:', exp.train_data.size())
print(' - min:', torch.min(exp_train_data))
print(' - max:', torch.max(exp_train_data))
print(' - mean:', torch.mean(exp_train_data))
print(' - std:', torch.std(exp_train_data))
print(' - var:', torch.var(exp_train_data))

[Train]
 - Numpy Shape: (60000, 28, 28)
 - Tensor Shape: torch.Size([60000, 28, 28])
 - min: tensor(0.)
 - max: tensor(1.)
 - mean: tensor(0.1307)
 - std: tensor(0.3081)
 - var: tensor(0.0949)


## Data Transformations

We first start with defining our data transformations. We need to think what our data is and how can we augment it to correct represent images which it might not see otherwise.


In [18]:
# Train transformations (No other augmentations are added)
train_transforms = transforms.Compose([
    transforms.ToTensor(),  # Convert to tensor first before erasing 
    transforms.Normalize((0.1307,), (0.3081,)) # Normalize it with mean and std dev of train_data. 
])

# Test transformations
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # use the train data's mean and std dev
])



# Dataset and Creating Train/Test Split

In [19]:
train = datasets.MNIST('./data', train=True, download=True, transform=train_transforms) # downloading the train data and applying train transforms
test = datasets.MNIST('./data', train=False, download=True, transform=test_transforms) # downloading the test data and applying test transforms

# Dataloader Arguments & Test/Train Dataloaders


In [11]:
SEED = 42

# To check if we are using GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# For reproducibility
torch.manual_seed(SEED)

if device == "cuda":
    torch.cuda.manual_seed(SEED)


dataloader_args = dict(shuffle=True, batch_size=128, num_workers=-1, pin_memory=True) if device == "cuda" else dict(shuffle=True, batch_size=64)

# train dataloader
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)

# test dataloader
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

Using device: cpu


# The model


In [20]:


# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
        
#         # Input Block - RF: 3
#         self.convblock1 = nn.Sequential(
#             nn.Conv2d(1, 8, 3, padding=1, bias=False),  # RF: 3
#             nn.BatchNorm2d(8),
#             nn.ReLU(),
#         )

#         # Conv Block 1 - RF: 5
#         self.convblock2 = nn.Sequential(
#             nn.Conv2d(8, 8, 3, padding=1, bias=False),  # RF: 5
#             nn.BatchNorm2d(8),
#             nn.ReLU(),
#         )

#         # Transition Block 1 - RF: 6
#         self.pool1 = nn.MaxPool2d(2, 2)  # RF: 6
#         self.convblock3 = nn.Sequential(
#             nn.Conv2d(8, 12, 1, bias=False),  # RF: 6
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#         )

#         # Conv Block 2 - RF: 14
#         self.convblock4 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 10
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 14
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#         )

#         # Transition Block 2 - RF: 16
#         self.pool2 = nn.MaxPool2d(2, 2)  # RF: 16
#         self.convblock5 = nn.Sequential(
#             nn.Conv2d(12, 12, 1, bias=False),  # RF: 16
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#         )

#         # Conv Block 3 - RF: 28
#         self.convblock6 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 20
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 24
#             nn.BatchNorm2d(12),
#             nn.ReLU(),
#             nn.Conv2d(12, 10, 3, padding=1, bias=False),  # RF: 28
#             nn.BatchNorm2d(10),
#             nn.ReLU(),
#         )

#         self.gap = nn.AdaptiveAvgPool2d(1)

#     def forward(self, x):
#         x = self.convblock1(x)
#         x = self.convblock2(x)
#         x = self.pool1(x)
#         x = self.convblock3(x)
#         x = self.convblock4(x)
#         x = self.pool2(x)
#         x = self.convblock5(x)
#         x = self.convblock6(x)
#         x = self.gap(x)
#         x = x.view(-1, 10)
#         return F.log_softmax(x, dim=-1)
    

       



# Model Params


In [21]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
# model = Net().to(device)
model = models.Model_2_BatchNorm().to(device)
summary(model, input_size=(1, 28, 28))

cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              72
       BatchNorm2d-2            [-1, 8, 28, 28]              16
              ReLU-3            [-1, 8, 28, 28]               0
            Conv2d-4            [-1, 8, 28, 28]             576
       BatchNorm2d-5            [-1, 8, 28, 28]              16
              ReLU-6            [-1, 8, 28, 28]               0
         MaxPool2d-7            [-1, 8, 14, 14]               0
            Conv2d-8           [-1, 12, 14, 14]              96
       BatchNorm2d-9           [-1, 12, 14, 14]              24
             ReLU-10           [-1, 12, 14, 14]               0
           Conv2d-11           [-1, 12, 14, 14]           1,296
      BatchNorm2d-12           [-1, 12, 14, 14]              24
             ReLU-13           [-1, 12, 14, 14]               0
           Conv2d-14           [-1,


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Training and Testing

Looking at logs can be boring, so we'll introduce **tqdm** progressbar to get cooler logs.

Let's write train and test functions

In [22]:
model = models.Model_2_BatchNorm().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4) # defining the optimizer with leraning rate of 0.01
# scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
#     optimizer,
#     T_0=4,  # Initial restart interval
#     T_mult=1,  # Multiplier for restart interval
#     eta_min=1e-6  # Minimum learning rate
# )

# Using the OneCycleLR scheduler for dynamic learning rate adjustment.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,  # Maximum learning rate during the cycle.
    epochs=15,  # Total number of epochs for training.
    steps_per_epoch=len(train_loader),  # Number of steps in one epoch (based on train loader size).
    pct_start=0.2,  # Percentage of the cycle for increasing the learning rate.
    div_factor=10,  # Factor by which the initial learning rate is divided from max_lr.
    final_div_factor=100,  # Factor by which the learning rate is reduced at the end of the cycle.
)



In [23]:
from tqdm import tqdm

train_losses = []
test_losses = []
train_acc = []
test_acc = []

def train(model, device, train_loader, optimizer, epoch):
  model.train()
  pbar = tqdm(train_loader)
  correct = 0
  processed = 0
  for batch_idx, (data, target) in enumerate(pbar):
    # get samples
    data, target = data.to(device), target.to(device)

    # Init
    optimizer.zero_grad()
    # In PyTorch, we need to set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
    # Because of this, when you start your training loop, ideally you should zero out the gradients so that you do the parameter update correctly.

    # Predict
    y_pred = model(data)

    # Calculate loss
    loss = F.nll_loss(y_pred, target)
    train_losses.append(loss)

    # Backpropagation
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Update pbar-tqdm

    pred = y_pred.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct += pred.eq(target.view_as(pred)).sum().item()
    processed += len(data)

    pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
    train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

In [24]:
EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, epoch)
    
    test(model, device, test_loader)
    # scheduler.step() # If in case of Cosine Annealing Warm, or based on other scheduler, scheduler.step.() has to uncommented. 
    

EPOCH: 0


Loss=0.30982446670532227 Batch_id=937 Accuracy=91.07: 100%|██████████| 938/938 [00:45<00:00, 20.48it/s]



Test set: Average loss: 0.1771, Accuracy: 9733/10000 (97.33%)

EPOCH: 1


Loss=0.11411292850971222 Batch_id=937 Accuracy=97.05: 100%|██████████| 938/938 [00:45<00:00, 20.53it/s] 



Test set: Average loss: 0.1067, Accuracy: 9720/10000 (97.20%)

EPOCH: 2


Loss=0.1885312795639038 Batch_id=937 Accuracy=97.54: 100%|██████████| 938/938 [00:43<00:00, 21.46it/s]  



Test set: Average loss: 0.1000, Accuracy: 9738/10000 (97.38%)

EPOCH: 3


Loss=0.16393055021762848 Batch_id=937 Accuracy=97.80: 100%|██████████| 938/938 [00:37<00:00, 25.04it/s] 



Test set: Average loss: 0.0620, Accuracy: 9813/10000 (98.13%)

EPOCH: 4


Loss=0.06097063794732094 Batch_id=937 Accuracy=98.18: 100%|██████████| 938/938 [00:33<00:00, 27.63it/s]  



Test set: Average loss: 0.1297, Accuracy: 9597/10000 (95.97%)

EPOCH: 5


Loss=0.1434718370437622 Batch_id=937 Accuracy=98.42: 100%|██████████| 938/938 [00:33<00:00, 28.13it/s]   



Test set: Average loss: 0.0526, Accuracy: 9827/10000 (98.27%)

EPOCH: 6


Loss=0.021315909922122955 Batch_id=937 Accuracy=98.50: 100%|██████████| 938/938 [00:33<00:00, 27.61it/s] 



Test set: Average loss: 0.0285, Accuracy: 9911/10000 (99.11%)

EPOCH: 7


Loss=0.040815889835357666 Batch_id=937 Accuracy=98.73: 100%|██████████| 938/938 [00:41<00:00, 22.46it/s] 



Test set: Average loss: 0.0336, Accuracy: 9902/10000 (99.02%)

EPOCH: 8


Loss=0.02586159110069275 Batch_id=937 Accuracy=98.96: 100%|██████████| 938/938 [00:42<00:00, 21.88it/s]  



Test set: Average loss: 0.0268, Accuracy: 9916/10000 (99.16%)

EPOCH: 9


Loss=0.0024530356749892235 Batch_id=937 Accuracy=99.04: 100%|██████████| 938/938 [00:47<00:00, 19.96it/s]



Test set: Average loss: 0.0254, Accuracy: 9924/10000 (99.24%)

EPOCH: 10


Loss=0.0007032179273664951 Batch_id=937 Accuracy=99.24: 100%|██████████| 938/938 [00:55<00:00, 17.02it/s] 



Test set: Average loss: 0.0212, Accuracy: 9927/10000 (99.27%)

EPOCH: 11


Loss=0.03041321039199829 Batch_id=937 Accuracy=99.37: 100%|██████████| 938/938 [00:50<00:00, 18.55it/s]   



Test set: Average loss: 0.0220, Accuracy: 9931/10000 (99.31%)

EPOCH: 12


Loss=0.030288003385066986 Batch_id=937 Accuracy=99.55: 100%|██████████| 938/938 [00:44<00:00, 20.95it/s]  



Test set: Average loss: 0.0213, Accuracy: 9927/10000 (99.27%)

EPOCH: 13


Loss=0.06601667404174805 Batch_id=937 Accuracy=99.66: 100%|██████████| 938/938 [00:45<00:00, 20.62it/s]   



Test set: Average loss: 0.0195, Accuracy: 9932/10000 (99.32%)

EPOCH: 14


Loss=0.002016632119193673 Batch_id=937 Accuracy=99.66: 100%|██████████| 938/938 [00:43<00:00, 21.47it/s]  



Test set: Average loss: 0.0193, Accuracy: 9932/10000 (99.32%)

