In [69]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
import torch
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from google.colab import drive
warnings.filterwarnings("ignore")
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [71]:
train_data = MNIST(root='./data', train=True, download=True, transform=transform)
test_data = MNIST(root='./data', train=False, download=True, transform=transform)

In [140]:
VALIDATION = 0.2
BATCH_SIZE = 64
EPOCHS = 10

In [73]:
idx = np.arange(len(train_data))
np.random.shuffle(idx)
split = int(np.floor(VALIDATION * len(train_data)))
train_idx, validation_idx = idx[split:], idx[:split]

train_sample = torch.utils.data.sampler.SubsetRandomSampler(train_idx)
validation_sample = torch.utils.data.sampler.SubsetRandomSampler(validation_idx)

# when use sampler the shuffle is ignored
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=train_sample)
validation_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=validation_sample)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [74]:
for data, target in train_loader:
    print(data.shape, target.shape)
    break

torch.Size([64, 1, 28, 28]) torch.Size([64])


# Batch Normalization:
Added Batch Normalization after the linear but before the non linear activation function

In [125]:
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(784, 256)
    self.bn1 = nn.BatchNorm1d(256)

    self.fc2 = nn.Linear(256, 64)
    self.bn2 = nn.BatchNorm1d(64)

    self.fc3 = nn.Linear(64, 32)
    self.bn3 = nn.BatchNorm1d(32)

    self.fc4 = nn.Linear(32, 10)

    self.Droupout = nn.Dropout(0.2)

  def forward(self, x):
    x = x.view(x.shape[0], -1)
    x = self.Droupout(F.relu(self.bn1(self.fc1(x))))
    x = self.Droupout(F.relu(self.bn2(self.fc2(x))))
    x = self.Droupout(F.relu(self.bn3(self.fc3(x))))

    x = self.fc4(x)
    return x

model = Model()

In [130]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Learning Rate Scheduler:
Here we will use the StepLR i.e. Step Learning Rate scheduler. Below is the reference and example from Pytorch doc:

- torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1)
Sets the learning rate of each parameter group to the initial lr decayed by gamma every step_size epochs.

Parameters:

optimizer (Optimizer) – Wrapped optimizer.
step_size (int) – Period of learning rate decay.
gamma (float) – Multiplicative factor of learning rate decay. Default: 0.1.
last_epoch (int) – The index of last epoch. Default: -1.
Example: scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

Assuming optimizer uses lr = 0.05 for all groups
lr = 0.05 if epoch < 5
lr = 0.005 if 5 <= epoch < 10
lr = 0.0005 if 10 <= epoch < 15
Examples describes that the initial learning rate defined in the optimizer step was 0.05 , which will be reducuded after every 5 epochs.Learning Rate will be decreased with the multiplication factor of 0.1(i.e. the value defined in gamma).



In [139]:
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

In [133]:
print(torch.cuda.is_available())

True


In [134]:
device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [135]:
model.to(device)

Model(
  (fc1): Linear(in_features=784, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=256, out_features=64, bias=True)
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (bn3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=32, out_features=10, bias=True)
  (Droupout): Dropout(p=0.2, inplace=False)
)

In [141]:
for epoch in range(EPOCHS):
  train_loss, valid_loss = [], []

  model.train()

  scheduler.step()

  for data, target in train_loader:

    data, target = data.to(device), target.to(device)

    data = data.view(data.shape[0], -1)

    optimizer.zero_grad()

    output = model(data)

    loss_value = loss(output, target)

    loss_value.backward()

    optimizer.step()

    train_loss.append(loss_value.item())

  print ("Epoch:", epoch, "Training Loss: ", np.mean(train_loss))

Epoch: 0 Training Loss:  0.06910441623752316
Epoch: 1 Training Loss:  0.07150576159668466
Epoch: 2 Training Loss:  0.06944089407784243
Epoch: 3 Training Loss:  0.0692833774096022
Epoch: 4 Training Loss:  0.07216225838040312
Epoch: 5 Training Loss:  0.07104084116530915
Epoch: 6 Training Loss:  0.07223497697214286
Epoch: 7 Training Loss:  0.07058095913007856
Epoch: 8 Training Loss:  0.07066613508760929
Epoch: 9 Training Loss:  0.07002611597379049
