In [3]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
import torch
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from google.colab import drive
warnings.filterwarnings("ignore")
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [5]:
train_data = MNIST(root='./data', train=True, download=True, transform=transform)
test_data = MNIST(root='./data', train=False, download=True, transform=transform)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 93.5MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 20.0MB/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 87.0MB/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz





Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 1.54MB/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [11]:
VALIDATION = 0.2
BATCH_SIZE = 64
EPOCHS = 10

In [8]:
idx = np.arange(len(train_data))
np.random.shuffle(idx)
split = int(np.floor(VALIDATION * len(train_data)))
train_idx, validation_idx = idx[split:], idx[:split]

train_sample = torch.utils.data.sampler.SubsetRandomSampler(train_idx)
validation_sample = torch.utils.data.sampler.SubsetRandomSampler(validation_idx)

# when use sampler the shuffle is ignored
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=train_sample)
validation_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=validation_sample)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [9]:
for data, target in train_loader:
    print(data.shape, target.shape)
    break

torch.Size([64, 1, 28, 28]) torch.Size([64])


# Without Dropout or Regularization

In [34]:
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(784, 256)
    self.fc2 = nn.Linear(256, 64)
    self.fc3 = nn.Linear(64, 32)
    self.fc4 = nn.Linear(32, 10)

  def forward(self, x):
    x = x.view(x.shape[0], -1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = self.fc4(x)
    return x

model = Model()

In [35]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [38]:
for epoch in range(EPOCHS):
  train_loss, valid_loss = [], []

  model.train()
  for data, target in train_loader:
    optimizer.zero_grad()
    output = model(data)
    loss_value = loss(output, target)
    loss_value.backward()
    optimizer.step()
    train_loss.append(loss_value.item())

  with torch.no_grad():
    model.eval()
    for data, target in validation_loader:
      output = model(data)
      loss_value = loss(output, target)
      valid_loss.append(loss_value.item())

print ("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 9 Training Loss:  0.04912998636617946 Valid Loss:  0.10182323734067936


# Dropout:
A simple but effective regularization technique where randomly selected neurons are ignored during training. They are “dropped-out” randomly. This means that their contribution to the activation of downstream neurons is temporally removed on the forward pass and any weight updates are not applied to the neuron on the backward pass. Dropout is again used to reduce the 'overfitting problem'. Drop is more useful when we have deep network. We give a dropout probablity(to switch off the weights randomly) in the configuration.

Dropout is generally used during the training phase only and we switch off dropout during test/validation phase.

In [39]:
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(784, 256)
    self.fc2 = nn.Linear(256, 64)
    self.fc3 = nn.Linear(64, 32)
    self.fc4 = nn.Linear(32, 10)

    self.dropout = nn.Dropout(0.2)

  def forward(self, x):
    x = x.view(x.shape[0], -1)

    x = self.dropout(F.relu(self.fc1(x)))
    x = self.dropout(F.relu(self.fc2(x)))
    x = self.dropout(F.relu(self.fc3(x)))

    x = self.fc4(x)
    return x

model = Model()

In [40]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [42]:
for epoch in range(EPOCHS):
  train_loss, valid_loss = [], []

  model.train()
  for data, target in train_loader:
    optimizer.zero_grad()
    output = model(data)
    loss_value = loss(output, target)
    loss_value.backward()
    optimizer.step()
    train_loss.append(loss_value.item())

  with torch.no_grad():
    model.eval()
    for data, target in validation_loader:
      output = model(data)
      loss_value = loss(output, target)
      valid_loss.append(loss_value.item())

  print ("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 0 Training Loss:  0.6221508249839147 Valid Loss:  0.25120305365070383
Epoch: 1 Training Loss:  0.2993074818352858 Valid Loss:  0.18277823303806337
Epoch: 2 Training Loss:  0.23591463672121366 Valid Loss:  0.16205026281680515
Epoch: 3 Training Loss:  0.20448531943808 Valid Loss:  0.13375360706692285
Epoch: 4 Training Loss:  0.18423983048150938 Valid Loss:  0.1379432445193859
Epoch: 5 Training Loss:  0.16925463514402508 Valid Loss:  0.11744712773175474
Epoch: 6 Training Loss:  0.15927540396526457 Valid Loss:  0.12152214043159434
Epoch: 7 Training Loss:  0.14927727395047743 Valid Loss:  0.10844078564030574
Epoch: 8 Training Loss:  0.13820888075853388 Valid Loss:  0.13125023245068385
Epoch: 9 Training Loss:  0.13492762167255085 Valid Loss:  0.10322175402580662


# Few Steps to note:

### **torch.no_grad()**: impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop. We generally don't want backpropagation in validation and test phase.
#### **model.eval()**: This will switch off the dropouts for validation phase.
### **model.train()**: Will bring the model again into traning phase by switching on the dropouts.
### If the loss of traning set and validation sets are very close that means there is less overfitting.

# Test the network

In [74]:
test_loss = 0
class_correct = [0 for i in range(10)]
class_total = [0 for i in range(10)]

model.eval()

for data, target in test_loader:

  output = model(data)

  loss_val = loss(output, target)

  # loss * BATCH_SIZE
  test_loss += loss_val.item()*data.size(0)

  _, pred = torch.max(output, 1)

  correct = pred.eq(target.data.view_as(pred))

  # except range(len(BATCH_SIZE)) will use range(len(target))
  # every batch has 64 data but the last batch may has less than it which will cause an Indexing Issue
  for i in range(len(target)):
    label = target.data[i]
    class_correct[label] += correct[i].item()
    class_total[label] += 1

# calculate and print avg test loss
test_loss = test_loss/len(test_loader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

Test Loss: 0.099662


Test Accuracy (Overall): 97% (9710/10000)


# Adding L2 Regularization to model
**optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.01)**
##### You can specify the weight_decay lamda parameter values while defining the model optimizer.

##### Higher the value of weight_decay higher the shrinkage in the model weights.