<a href="https://colab.research.google.com/github/Bitdribble/dlwpt-code/blob/master/colab/PyTorchCh8_Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# Keep printed tensors compact: at most 2 edge items per dimension, 75 columns per line.
torch.set_printoptions(linewidth=75, edgeitems=2)
# Seed PyTorch's default random number generator so runs are repeatable.
torch.manual_seed(seed=123)

<torch._C.Generator at 0x7f01bb90c610>

In [None]:
# Data preparation: build the untransformed CIFAR-10 datasets rooted at the
# working directory, downloading the archive when requested by download=True.
data_path = '.'
cifar10 = datasets.CIFAR10(root=data_path, train=True, download=True)
# The train=False split is what this notebook uses for validation.
cifar10_val = datasets.CIFAR10(root=data_path, train=False, download=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./cifar-10-python.tar.gz to .
Files already downloaded and verified


In [None]:
# Normalize data: both splits go through the same ToTensor + Normalize pipeline.
# The two tuples are Normalize's (mean, std) arguments, one entry per channel;
# presumably they are the per-channel statistics of the training images -- confirm.
channel_mean = (0.4915, 0.4823, 0.4468)
channel_std = (0.2470, 0.2435, 0.2616)
to_normalized_tensor = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# The files are reused from data_path (download=False).
transformed_cifar10 = datasets.CIFAR10(
    root=data_path, train=True, download=False,
    transform=to_normalized_tensor)
transformed_cifar10_val = datasets.CIFAR10(
    root=data_path, train=False, download=False,
    transform=to_normalized_tensor)

In [None]:
# Restrict data to airplanes and birds
label_map = {0: 0, 2: 1}  # original dataset label -> new binary label
class_names = ['airplane', 'bird']  # indexed by the new binary label
kept_labels = tuple(label_map)  # original labels that survive the filter: (0, 2)


def _subset_and_relabel(dataset):
  """Return a list of (img, new_label) pairs for the samples whose label is kept."""
  pairs = []
  for img, label in dataset:
    if label in kept_labels:
      pairs.append((img, label_map[label]))
  return pairs


cifar2 = _subset_and_relabel(transformed_cifar10)
cifar2_val = _subset_and_relabel(transformed_cifar10_val)

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(f"Training on device {device}.")

Training on device cpu.


In [None]:
def training_loop(n_epochs, device, optimizer, model, loss_fn, train_loader, log_epochs=0):
  """Train `model` for `n_epochs` passes over `train_loader`.

  Args:
    n_epochs: number of full passes over `train_loader`.
    device: device each batch is moved to before the forward pass
      (assumes `model` already lives on that device).
    optimizer: optimizer that updates the model's parameters.
    model: callable mapping a batch of images to outputs.
    loss_fn: callable `(outputs, labels) -> scalar loss tensor`.
    train_loader: sized iterable yielding `(imgs, labels)` batches.
    log_epochs: print the mean per-batch loss every `log_epochs` epochs and
      after the final epoch. A falsy value (default 0) disables logging.
  """
  for epoch in range(1, n_epochs + 1):
    loss_train = 0.0  # running sum of the per-batch losses for this epoch

    for imgs, labels in train_loader:
      imgs = imgs.to(device=device)
      labels = labels.to(device=device)

      outputs = model(imgs)
      loss = loss_fn(outputs, labels)

      # Clear gradients left over from the previous step before backpropagating.
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      loss_train += loss.item()

    # `epoch` is already 1-based, so it is used as-is: the line labelled
    # "Epoch N" is printed after N completed epochs, and the last epoch is
    # always reported.
    if log_epochs and (epoch % log_epochs == 0 or epoch == n_epochs):
      print(f"{datetime.datetime.now()} Epoch {epoch}, "
            f"Training loss {loss_train / len(train_loader):.3f}")

def validate(model, device, train_loader, val_loader):
  """Print the classification accuracy of `model` on the train and val loaders.

  The model is switched to evaluation mode while accuracy is measured, so
  layers that behave differently during training (e.g. dropout, batch norm)
  are scored the way they would be used for inference. The mode the model
  was in on entry is restored before returning.

  Args:
    model: `nn.Module` mapping a batch of images to per-class scores.
    device: device each batch is moved to before the forward pass.
    train_loader: iterable yielding `(imgs, labels)` training batches.
    val_loader: iterable yielding `(imgs, labels)` validation batches.
  """
  was_training = model.training
  model.eval()
  try:
    for name, loader in [("train", train_loader), ("val", val_loader)]:
      correct = 0
      total = 0
      # No gradients are needed to count correct predictions.
      with torch.no_grad():
        for imgs, labels in loader:
          imgs = imgs.to(device=device)
          labels = labels.to(device=device)

          outputs = model(imgs)
          # The predicted class is the index of the largest score along dim 1.
          _, predicted = torch.max(outputs, dim=1)

          total += labels.shape[0]
          correct += int((predicted == labels).sum())

      print(f"Accuracy {name}: {correct / total:.2f}")
  finally:
    # Put the model back into the mode the caller left it in.
    model.train(was_training)

In [None]:
class NetWidth(nn.Module):
  """Small two-stage convolutional classifier with a configurable width.

  Each stage is conv (3x3, padding 1) -> tanh -> 2x2 max-pool, followed by a
  32-unit tanh hidden layer and a 2-way linear output. The first stage has
  `n_chans1` channels and the second `n_chans1 // 2`. The flattened size is
  hard-coded for 8x8 feature maps, i.e. 32x32 inputs after two 2x2 poolings.
  """

  def __init__(self, n_chans1):
    super().__init__()
    self.n_chans1 = n_chans1
    half_chans = n_chans1 // 2

    # Stage 1: 3 input channels -> n_chans1 feature maps, spatial size halved.
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_chans1,
                           kernel_size=3, padding=1)
    self.act1 = nn.Tanh()
    self.pool1 = nn.MaxPool2d(kernel_size=2)

    # Stage 2: n_chans1 -> n_chans1 // 2 feature maps, spatial size halved again.
    self.conv2 = nn.Conv2d(in_channels=n_chans1, out_channels=half_chans,
                           kernel_size=3, padding=1)
    self.act2 = nn.Tanh()
    self.pool2 = nn.MaxPool2d(kernel_size=2)

    # Classifier head on the flattened 8x8 feature maps.
    self.fc1 = nn.Linear(in_features=8 * 8 * half_chans, out_features=32)
    self.act3 = nn.Tanh()
    self.fc2 = nn.Linear(in_features=32, out_features=2)

  def forward(self, x):
    """Return the two per-class scores for each image in batch `x`."""
    stage1 = self.conv1(x)
    stage1 = self.act1(stage1)
    stage1 = self.pool1(stage1)

    stage2 = self.conv2(stage1)
    stage2 = self.act2(stage2)
    stage2 = self.pool2(stage2)

    # Manual flatten (in place of nn.Flatten()): one row per sample.
    flat = stage2.view(-1, 8 * 8 * (self.n_chans1 // 2))
    hidden = self.act3(self.fc1(flat))
    return self.fc2(hidden)

In [None]:
# Helping our model to converge and generalize: Regularization
#
# The first way to regularize (i.e., stabilize) the model is to add a regularization term to the loss. This
# term is crafted so that the weights of the model tend to be small on their own, limiting
# how much training makes them grow. In other words, it is a penalty on larger weight
# values. This makes the loss have a smoother topography, and there’s relatively less to
# gain from fitting individual samples.
#
# The most popular regularization terms of this kind are L2 regularization, which is
# the sum of squares of all weights in the model, and L1 regularization, which is the sum
# of the absolute values of all weights in the model. Both of them are scaled by a
# (small) factor, which is a hyperparameter we set prior to training.
#
# L2 regularization is also referred to as weight decay. The reason for this name is that,
# thinking about SGD and backpropagation, the negative gradient of the L2 regularization
# term with respect to a parameter w_i is - 2 * lambda * w_i, where lambda is the
# aforementioned hyperparameter, simply named weight decay in PyTorch. So, adding L2
# regularization to the loss function is equivalent to decreasing each weight by an
# amount proportional to its current value during the optimization step (hence, the
# name weight decay). Note that weight decay applies to all parameters of the network,
# including biases.
#
# In PyTorch, we could implement regularization pretty easily by adding a term to
# the loss. After computing the loss, whatever the loss function is, we can iterate over
# the parameters of the model, sum their respective squares (for L2) or absolute values
# (for L1), and backpropagate:
def training_loop_l2reg(n_epochs, device, optimizer, model, loss_fn, train_loader, log_epochs=0,
                        l2_lambda=0.001):
  """Train `model` with an explicit L2 penalty added to the loss.

  For every batch the optimized loss is
  `loss_fn(outputs, labels) + l2_lambda * sum(p ** 2 over all model parameters)`.

  Args:
    n_epochs: number of full passes over `train_loader`.
    device: device each batch is moved to before the forward pass
      (assumes `model` already lives on that device).
    optimizer: optimizer that updates the model's parameters.
    model: module providing `parameters()` and mapping images to outputs.
    loss_fn: callable `(outputs, labels) -> scalar loss tensor`.
    train_loader: sized iterable yielding `(imgs, labels)` batches.
    log_epochs: print the mean per-batch loss (penalty included) every
      `log_epochs` epochs and after the final epoch. A falsy value
      (default 0) disables logging.
    l2_lambda: scale factor of the L2 penalty (default 0.001).
  """
  for epoch in range(1, n_epochs + 1):
    loss_train = 0.0  # running sum of the per-batch regularized losses

    for imgs, labels in train_loader:
      imgs = imgs.to(device=device)
      labels = labels.to(device=device)

      outputs = model(imgs)
      loss = loss_fn(outputs, labels)

      # L2 regularization: sum of squares of every parameter (weights and biases).
      l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
      loss = loss + l2_lambda * l2_norm

      # Clear gradients left over from the previous step before backpropagating.
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      loss_train += loss.item()

    # `epoch` is already 1-based, so it is used as-is: the line labelled
    # "Epoch N" is printed after N completed epochs, and the last epoch is
    # always reported.
    if log_epochs and (epoch % log_epochs == 0 or epoch == n_epochs):
      print(f"{datetime.datetime.now()} Epoch {epoch}, "
            f"Training loss {loss_train / len(train_loader):.3f}")

In [None]:
# PyTorch's SGD optimizer exposes a weight_decay argument, corresponding to
# 2 * lambda, that applies the decay directly in the update step as described
# previously -- so the plain training_loop is used here.
model = NetWidth(n_chans1=8)
model = model.to(device=device)
optimizer = optim.SGD(params=model.parameters(), lr=1e-2, weight_decay=1e-3)
train_loader = torch.utils.data.DataLoader(dataset=cifar2, shuffle=True, batch_size=64)
loss_fn = nn.CrossEntropyLoss()

# 100 epochs, with the mean training loss reported every 10 epochs.
training_loop(n_epochs=100, device=device, optimizer=optimizer, model=model,
              loss_fn=loss_fn, train_loader=train_loader, log_epochs=10)

2021-12-20 03:08:17.685341 Epoch 10, Training loss 0.384
2021-12-20 03:08:42.828568 Epoch 20, Training loss 0.316
2021-12-20 03:09:07.945721 Epoch 30, Training loss 0.294
2021-12-20 03:09:32.580693 Epoch 40, Training loss 0.281
2021-12-20 03:09:58.466756 Epoch 50, Training loss 0.270
2021-12-20 03:10:23.113875 Epoch 60, Training loss 0.260
2021-12-20 03:10:47.724807 Epoch 70, Training loss 0.252
2021-12-20 03:11:12.325935 Epoch 80, Training loss 0.247
2021-12-20 03:11:36.947601 Epoch 90, Training loss 0.239
2021-12-20 03:12:01.717532 Epoch 100, Training loss 0.233


In [None]:
# Rebuild both loaders without shuffling for the accuracy pass; note that this
# rebinds train_loader, replacing the shuffled loader used for training.
train_loader = torch.utils.data.DataLoader(dataset=cifar2, shuffle=False, batch_size=64)
val_loader = torch.utils.data.DataLoader(dataset=cifar2_val, shuffle=False, batch_size=64)

validate(model=model, device=device, train_loader=train_loader, val_loader=val_loader)

Accuracy train: 0.90
Accuracy val: 0.87
