- Traget: Add Batch Normalization to increase the model's accuracy
- Results: 
    - Parameters : 7348
    - Best Train accuracy : 80.05
    - Best Test accuracy : 79.25
- Analysis: 
    - This modes is overfitting
    - The accuracy is too low

# Import Libraries

In [3]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
%matplotlib inline
import matplotlib.pyplot as plt
import models
import warnings
warnings.filterwarnings("ignore")


### Calculating the mean and std dev. of the dataset

In [2]:
tensor_transforms = transforms.Compose([transforms.ToTensor()])

exp = datasets.MNIST('./data', train=True, download=True, transform=tensor_transforms)

In [3]:
exp_train_data = exp.train_data
exp_test_data = exp.test_data
print(exp_train_data.shape)
print(exp_test_data.shape)


torch.Size([60000, 28, 28])
torch.Size([60000, 28, 28])


In [4]:
exp_train_data = exp.transform(exp_train_data.numpy())
print('[Train]')
print(' - Numpy Shape:', exp.train_data.cpu().numpy().shape)
print(' - Tensor Shape:', exp.train_data.size())
print(' - min:', torch.min(exp_train_data))
print(' - max:', torch.max(exp_train_data))
print(' - mean:', torch.mean(exp_train_data))
print(' - std:', torch.std(exp_train_data))
print(' - var:', torch.var(exp_train_data))

[Train]
 - Numpy Shape: (60000, 28, 28)
 - Tensor Shape: torch.Size([60000, 28, 28])
 - min: tensor(0.)
 - max: tensor(1.)
 - mean: tensor(0.1307)
 - std: tensor(0.3081)
 - var: tensor(0.0949)


## Data Transformations

We first start with defining our data transformations. We need to think what our data is and how can we augment it to correct represent images which it might not see otherwise.


In [5]:
# Train transformations (No other augmentations are added)
train_transforms = transforms.Compose([
    transforms.ToTensor(),  # Convert to tensor first before erasing 
    transforms.Normalize((0.1307,), (0.3081,)) # Normalize it with mean and std dev of train_data. 
])

# Test transformations
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # use the train data's mean and std dev
])



# Dataset and Creating Train/Test Split

In [6]:
train = datasets.MNIST('./data', train=True, download=True, transform=train_transforms) # downloading the train data and applying train transforms
test = datasets.MNIST('./data', train=False, download=True, transform=test_transforms) # downloading the test data and applying test transforms

# Dataloader Arguments & Test/Train Dataloaders


In [7]:
SEED = 42

# To check if we are using GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# For reproducibility
torch.manual_seed(SEED)

if device == "cuda":
    torch.cuda.manual_seed(SEED)


dataloader_args = dict(shuffle=True, batch_size=128, num_workers=-1, pin_memory=True) if device == "cuda" else dict(shuffle=True, batch_size=64)

# train dataloader
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)

# test dataloader
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

Using device: cpu


# The model


In [8]:


# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
        
#         # Input Block - RF: 3
#         self.convblock1 = nn.Sequential(
#             nn.Conv2d(1, 8, 3, padding=1, bias=False),  # RF: 3
#             nn.ReLU(),
#         )

#         # Conv Block 1 - RF: 5
#         self.convblock2 = nn.Sequential(
#             nn.Conv2d(8, 8, 3, padding=1, bias=False),  # RF: 5
#             nn.ReLU(),
#         )

#         # Transition Block 1 - RF: 6
#         self.pool1 = nn.MaxPool2d(2, 2)  # RF: 6
#         self.convblock3 = nn.Sequential(
#             nn.Conv2d(8, 12, 1, bias=False),  # RF: 6
#             nn.ReLU(),
#         )

#         # Conv Block 2 - RF: 14
#         self.convblock4 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 10
#             nn.ReLU(),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 14
#             nn.ReLU(),
#         )

#         # Transition Block 2 - RF: 16
#         self.pool2 = nn.MaxPool2d(2, 2)  # RF: 16
#         self.convblock5 = nn.Sequential(
#             nn.Conv2d(12, 12, 1, bias=False),  # RF: 16
#             nn.ReLU(),
#         )

#         # Conv Block 3 - RF: 28
#         self.convblock6 = nn.Sequential(
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 20
#             nn.ReLU(),
#             nn.Conv2d(12, 12, 3, padding=1, bias=False),  # RF: 24
#             nn.ReLU(),
#             nn.Conv2d(12, 10, 3, padding=1, bias=False),  # RF: 28
#             nn.ReLU(),
#         )

#         self.gap = nn.AdaptiveAvgPool2d(1)

#     def forward(self, x):
#         x = self.convblock1(x)
#         x = self.convblock2(x)
#         x = self.pool1(x)
#         x = self.convblock3(x)
#         x = self.convblock4(x)
#         x = self.pool2(x)
#         x = self.convblock5(x)
#         x = self.convblock6(x)
#         x = self.gap(x)
#         x = x.view(-1, 10)
#         return F.log_softmax(x, dim=-1)
    

       



# Model Params


In [9]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
# model = Net().to(device)
model = models.Model_1_Simple_CNN_Architecture().to(device)
summary(model, input_size=(1, 28, 28))

cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              72
              ReLU-2            [-1, 8, 28, 28]               0
            Conv2d-3            [-1, 8, 28, 28]             576
              ReLU-4            [-1, 8, 28, 28]               0
         MaxPool2d-5            [-1, 8, 14, 14]               0
            Conv2d-6           [-1, 12, 14, 14]              96
              ReLU-7           [-1, 12, 14, 14]               0
            Conv2d-8           [-1, 12, 14, 14]           1,296
              ReLU-9           [-1, 12, 14, 14]               0
           Conv2d-10           [-1, 12, 14, 14]           1,296
             ReLU-11           [-1, 12, 14, 14]               0
        MaxPool2d-12             [-1, 12, 7, 7]               0
           Conv2d-13             [-1, 12, 7, 7]             144
             ReLU-14             [-


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Training and Testing

Looking at logs can be boring, so we'll introduce **tqdm** progressbar to get cooler logs.

Let's write train and test functions

In [10]:
model = models.Model_1_Simple_CNN_Architecture().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4) # defining the optimizer with leraning rate of 0.01
# scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
#     optimizer,
#     T_0=4,  # Initial restart interval
#     T_mult=1,  # Multiplier for restart interval
#     eta_min=1e-6  # Minimum learning rate
# )

# Using the OneCycleLR scheduler for dynamic learning rate adjustment.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,  # Maximum learning rate during the cycle.
    epochs=15,  # Total number of epochs for training.
    steps_per_epoch=len(train_loader),  # Number of steps in one epoch (based on train loader size).
    pct_start=0.2,  # Percentage of the cycle for increasing the learning rate.
    div_factor=10,  # Factor by which the initial learning rate is divided from max_lr.
    final_div_factor=100,  # Factor by which the learning rate is reduced at the end of the cycle.
)



In [11]:
from tqdm import tqdm

train_losses = []
test_losses = []
train_acc = []
test_acc = []

def train(model, device, train_loader, optimizer, epoch):
  model.train()
  pbar = tqdm(train_loader)
  correct = 0
  processed = 0
  for batch_idx, (data, target) in enumerate(pbar):
    # get samples
    data, target = data.to(device), target.to(device)

    # Init
    optimizer.zero_grad()
    # In PyTorch, we need to set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
    # Because of this, when you start your training loop, ideally you should zero out the gradients so that you do the parameter update correctly.

    # Predict
    y_pred = model(data)

    # Calculate loss
    loss = F.nll_loss(y_pred, target)
    train_losses.append(loss)

    # Backpropagation
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Update pbar-tqdm

    pred = y_pred.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct += pred.eq(target.view_as(pred)).sum().item()
    processed += len(data)

    pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
    train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

In [12]:
EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, epoch)
    
    test(model, device, test_loader)
    # scheduler.step() # If in case of Cosine Annealing Warm, or based on other scheduler, scheduler.step.() has to uncommented. 
    

EPOCH: 0


Loss=0.6444212794303894 Batch_id=937 Accuracy=63.52: 100%|██████████| 938/938 [00:40<00:00, 22.97it/s] 



Test set: Average loss: 0.7615, Accuracy: 7355/10000 (73.55%)

EPOCH: 1


Loss=0.7068154215812683 Batch_id=937 Accuracy=75.58: 100%|██████████| 938/938 [00:40<00:00, 23.28it/s] 



Test set: Average loss: 0.6018, Accuracy: 7711/10000 (77.11%)

EPOCH: 2


Loss=0.6498732566833496 Batch_id=937 Accuracy=77.66: 100%|██████████| 938/938 [00:40<00:00, 23.33it/s] 



Test set: Average loss: 0.5631, Accuracy: 7800/10000 (78.00%)

EPOCH: 3


Loss=0.3545251488685608 Batch_id=937 Accuracy=78.33: 100%|██████████| 938/938 [00:40<00:00, 23.31it/s] 



Test set: Average loss: 0.5480, Accuracy: 7832/10000 (78.32%)

EPOCH: 4


Loss=0.41919001936912537 Batch_id=937 Accuracy=78.41: 100%|██████████| 938/938 [00:39<00:00, 23.64it/s]



Test set: Average loss: 0.5392, Accuracy: 7852/10000 (78.52%)

EPOCH: 5


Loss=0.3848263919353485 Batch_id=937 Accuracy=78.52: 100%|██████████| 938/938 [00:40<00:00, 23.30it/s] 



Test set: Average loss: 0.5389, Accuracy: 7828/10000 (78.28%)

EPOCH: 6


Loss=0.6066715717315674 Batch_id=937 Accuracy=78.72: 100%|██████████| 938/938 [00:39<00:00, 23.95it/s] 



Test set: Average loss: 0.5222, Accuracy: 7865/10000 (78.65%)

EPOCH: 7


Loss=0.30118823051452637 Batch_id=937 Accuracy=78.89: 100%|██████████| 938/938 [00:39<00:00, 23.54it/s]



Test set: Average loss: 0.5159, Accuracy: 7882/10000 (78.82%)

EPOCH: 8


Loss=0.629810094833374 Batch_id=937 Accuracy=79.17: 100%|██████████| 938/938 [00:39<00:00, 23.83it/s]  



Test set: Average loss: 0.5134, Accuracy: 7880/10000 (78.80%)

EPOCH: 9


Loss=0.30702367424964905 Batch_id=937 Accuracy=79.29: 100%|██████████| 938/938 [00:39<00:00, 23.74it/s]



Test set: Average loss: 0.5041, Accuracy: 7915/10000 (79.15%)

EPOCH: 10


Loss=0.2982425391674042 Batch_id=937 Accuracy=79.54: 100%|██████████| 938/938 [00:39<00:00, 23.56it/s] 



Test set: Average loss: 0.5082, Accuracy: 7903/10000 (79.03%)

EPOCH: 11


Loss=0.43372538685798645 Batch_id=937 Accuracy=79.64: 100%|██████████| 938/938 [00:39<00:00, 23.54it/s]



Test set: Average loss: 0.5019, Accuracy: 7913/10000 (79.13%)

EPOCH: 12


Loss=0.5799878239631653 Batch_id=937 Accuracy=79.86: 100%|██████████| 938/938 [00:37<00:00, 25.23it/s] 



Test set: Average loss: 0.4992, Accuracy: 7913/10000 (79.13%)

EPOCH: 13


Loss=0.363961398601532 Batch_id=937 Accuracy=79.98: 100%|██████████| 938/938 [00:37<00:00, 24.70it/s]  



Test set: Average loss: 0.5006, Accuracy: 7929/10000 (79.29%)

EPOCH: 14


Loss=0.5763484835624695 Batch_id=937 Accuracy=80.05: 100%|██████████| 938/938 [00:39<00:00, 23.99it/s] 



Test set: Average loss: 0.4999, Accuracy: 7929/10000 (79.29%)

