[Reference](https://medium.com/@francescofranco_39234/using-l1-l2-and-elasticnet-regularization-with-pytorch-633e79b863e0)

In [1]:
import os
import torch
from torch import nn
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torchvision import transforms

class MLP(nn.Module):
  '''
    Small fully connected network.

    The input is flattened to 28 * 28 * 1 = 784 features and passed through
    two hidden linear layers (64 and 32 units, each followed by ReLU) and a
    final linear layer with 10 outputs.
  '''
  def __init__(self):
    super().__init__()
    # Layers are created in forward order so that parameter initialisation
    # consumes the random number generator in a fixed sequence.
    stack = [
      nn.Flatten(),
      nn.Linear(28 * 28 * 1, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 10),
    ]
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    '''Feed x through the layer stack and return the raw outputs.'''
    outputs = self.layers(x)
    return outputs

  def compute_l1_loss(self, w):
    '''Return the sum of the absolute values of all entries of w.'''
    return w.abs().sum()


if __name__ == '__main__':

  # Fix the random seed so weight initialisation and shuffling are repeatable
  torch.manual_seed(42)

  # Number of passes over the training data
  num_epochs = 5

  # Strength of the L1 penalty. It is constant for the whole run, so it is
  # set once here rather than on every mini-batch.
  # NOTE(review): in the recorded run the non-L1 share of the loss settles at
  # about 2.30 (roughly ln 10, the cross-entropy of a uniform guess over 10
  # classes), which suggests that 1.0 swamps the classification loss.
  # Consider a much smaller value -- confirm.
  l1_weight = 1.0

  # Prepare the MNIST dataset; download=True fetches it into the current
  # working directory
  dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
  trainloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)

  # Initialize the MLP
  mlp = MLP()

  # Define the loss function and optimizer
  loss_function = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

  # Run the training loop
  for epoch in range(num_epochs):

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Iterate over the DataLoader for training data
    for i, (inputs, targets) in enumerate(trainloader):

      # Zero the gradients
      optimizer.zero_grad()

      # Perform forward pass and compute the classification loss
      outputs = mlp(inputs)
      loss = loss_function(outputs, targets)

      # Compute the L1 component over every parameter (weights and biases),
      # flattened into one vector
      flat_parameters = torch.cat([p.view(-1) for p in mlp.parameters()])
      l1 = l1_weight * mlp.compute_l1_loss(flat_parameters)

      # Add L1 loss component
      loss = loss + l1

      # Perform backward pass
      loss.backward()

      # Perform optimization
      optimizer.step()

      # Print statistics every 500 mini-batches; the tensors are converted to
      # Python floats only when they are actually reported
      if i % 500 == 499:
          print('Loss after mini-batch %5d: %.5f (of which %.5f L1 loss)' %
                (i + 1, loss.item(), l1.item()))

  # Process is complete.
  print('Training process has finished.')

100%|██████████| 9.91M/9.91M [00:01<00:00, 6.08MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 160kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.30MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 6.27MB/s]

Starting epoch 1





Loss after mini-batch   500: 65.46807 (of which 63.18528 L1 loss)
Loss after mini-batch  1000: 13.96880 (of which 11.65321 L1 loss)
Loss after mini-batch  1500: 3.68697 (of which 1.38204 L1 loss)
Loss after mini-batch  2000: 2.94059 (of which 0.63800 L1 loss)
Loss after mini-batch  2500: 2.93674 (of which 0.63414 L1 loss)
Loss after mini-batch  3000: 2.94184 (of which 0.63925 L1 loss)
Loss after mini-batch  3500: 2.93983 (of which 0.63725 L1 loss)
Loss after mini-batch  4000: 2.93638 (of which 0.63378 L1 loss)
Loss after mini-batch  4500: 2.93949 (of which 0.63691 L1 loss)
Loss after mini-batch  5000: 2.93907 (of which 0.63648 L1 loss)
Loss after mini-batch  5500: 2.93750 (of which 0.63491 L1 loss)
Loss after mini-batch  6000: 2.93818 (of which 0.63559 L1 loss)
Starting epoch 2
Loss after mini-batch   500: 2.94051 (of which 0.63793 L1 loss)
Loss after mini-batch  1000: 2.93816 (of which 0.63558 L1 loss)
Loss after mini-batch  1500: 2.94031 (of which 0.63773 L1 loss)
Loss after mini-bat

In [2]:
import os
import torch
from torch import nn
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torchvision import transforms

class MLP(nn.Module):
  '''
    Small fully connected network.

    The input is flattened to 28 * 28 * 1 = 784 features and passed through
    two hidden linear layers (64 and 32 units, each followed by ReLU) and a
    final linear layer with 10 outputs.
  '''
  def __init__(self):
    super().__init__()
    # Layers are created in forward order so that parameter initialisation
    # consumes the random number generator in a fixed sequence.
    stack = [
      nn.Flatten(),
      nn.Linear(28 * 28 * 1, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 10),
    ]
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    '''Feed x through the layer stack and return the raw outputs.'''
    outputs = self.layers(x)
    return outputs

  def compute_l2_loss(self, w):
    '''Return the sum of the squares of all entries of w.'''
    return w.square().sum()


if __name__ == '__main__':

  # Fix the random seed so weight initialisation and shuffling are repeatable
  torch.manual_seed(42)

  # Number of passes over the training data
  num_epochs = 5

  # Strength of the L2 penalty. It is constant for the whole run, so it is
  # set once here rather than on every mini-batch.
  # NOTE(review): in the recorded run the non-L2 share of the loss settles at
  # about 2.30 (roughly ln 10, the cross-entropy of a uniform guess over 10
  # classes), which suggests that 1.0 swamps the classification loss.
  # Consider a much smaller value -- confirm.
  l2_weight = 1.0

  # Prepare the MNIST dataset; download=True fetches it into the current
  # working directory
  dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
  trainloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)

  # Initialize the MLP
  mlp = MLP()

  # Define the loss function and optimizer
  loss_function = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

  # Run the training loop
  for epoch in range(num_epochs):

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Iterate over the DataLoader for training data
    for i, (inputs, targets) in enumerate(trainloader):

      # Zero the gradients
      optimizer.zero_grad()

      # Perform forward pass and compute the classification loss
      outputs = mlp(inputs)
      loss = loss_function(outputs, targets)

      # Compute the L2 component over every parameter (weights and biases),
      # flattened into one vector
      flat_parameters = torch.cat([p.view(-1) for p in mlp.parameters()])
      l2 = l2_weight * mlp.compute_l2_loss(flat_parameters)

      # Add l2 loss component
      loss = loss + l2

      # Perform backward pass
      loss.backward()

      # Perform optimization
      optimizer.step()

      # Print statistics every 500 mini-batches; the tensors are converted to
      # Python floats only when they are actually reported
      if i % 500 == 499:
          print('Loss after mini-batch %5d: %.5f (of which %.5f l2 loss)' %
                (i + 1, loss.item(), l2.item()))

  # Process is complete.
  print('Training process has finished.')

Starting epoch 1
Loss after mini-batch   500: 6.90672 (of which 4.62813 l2 loss)
Loss after mini-batch  1000: 3.57362 (of which 1.24972 l2 loss)
Loss after mini-batch  1500: 2.59496 (of which 0.29211 l2 loss)
Loss after mini-batch  2000: 2.35297 (of which 0.05843 l2 loss)
Loss after mini-batch  2500: 2.30740 (of which 0.00982 l2 loss)
Loss after mini-batch  3000: 2.30312 (of which 0.00131 l2 loss)
Loss after mini-batch  3500: 2.30289 (of which 0.00020 l2 loss)
Loss after mini-batch  4000: 2.30143 (of which 0.00008 l2 loss)
Loss after mini-batch  4500: 2.30402 (of which 0.00007 l2 loss)
Loss after mini-batch  5000: 2.30044 (of which 0.00007 l2 loss)
Loss after mini-batch  5500: 2.30264 (of which 0.00006 l2 loss)
Loss after mini-batch  6000: 2.30348 (of which 0.00009 l2 loss)
Starting epoch 2
Loss after mini-batch   500: 2.30326 (of which 0.00006 l2 loss)
Loss after mini-batch  1000: 2.30304 (of which 0.00006 l2 loss)
Loss after mini-batch  1500: 2.30264 (of which 0.00004 l2 loss)
Loss a

In [3]:
class MLP(nn.Module):
  '''
    Small fully connected network.

    The input is flattened to 28 * 28 * 1 = 784 features and passed through
    two hidden linear layers (64 and 32 units, each followed by ReLU) and a
    final linear layer with 10 outputs.
  '''
  def __init__(self):
    super().__init__()
    # Layers are created in forward order so that parameter initialisation
    # consumes the random number generator in a fixed sequence.
    stack = [
      nn.Flatten(),
      nn.Linear(28 * 28 * 1, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 10),
    ]
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    '''Feed x through the layer stack and return the raw outputs.'''
    outputs = self.layers(x)
    return outputs

  def compute_l1_loss(self, w):
    '''Return the sum of the absolute values of all entries of w.'''
    return w.abs().sum()

  def compute_l2_loss(self, w):
    '''Return the sum of the squares of all entries of w.'''
    return w.square().sum()


if __name__ == '__main__':

  # Fix the random seed so weight initialisation and shuffling are repeatable
  torch.manual_seed(42)

  # Number of passes over the training data
  num_epochs = 5

  # Strengths of the L1 and L2 penalties. They are constant for the whole
  # run, so they are set once here rather than on every mini-batch.
  # NOTE(review): in the recorded run the share of the loss left after
  # subtracting both penalties settles at about 2.30 (roughly ln 10, the
  # cross-entropy of a uniform guess over 10 classes), which suggests these
  # weights swamp the classification loss. Consider much smaller values --
  # confirm.
  l1_weight = 0.3
  l2_weight = 0.7

  # Prepare the MNIST dataset; download=True fetches it into the current
  # working directory
  dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
  trainloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)

  # Initialize the MLP
  mlp = MLP()

  # Define the loss function and optimizer
  loss_function = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

  # Run the training loop
  for epoch in range(num_epochs):

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Iterate over the DataLoader for training data
    for i, (inputs, targets) in enumerate(trainloader):

      # Zero the gradients
      optimizer.zero_grad()

      # Perform forward pass and compute the classification loss
      outputs = mlp(inputs)
      loss = loss_function(outputs, targets)

      # Flatten every parameter (weights and biases) into one vector once and
      # reuse it for both penalty terms
      flat_parameters = torch.cat([p.view(-1) for p in mlp.parameters()])
      l1 = l1_weight * mlp.compute_l1_loss(flat_parameters)
      l2 = l2_weight * mlp.compute_l2_loss(flat_parameters)

      # Add L1 and L2 loss components
      loss = loss + l1 + l2

      # Perform backward pass
      loss.backward()

      # Perform optimization
      optimizer.step()

      # Print statistics every 500 mini-batches; the tensors are converted to
      # Python floats only when they are actually reported
      if i % 500 == 499:
          print('Loss after mini-batch %5d: %.5f (of which %.5f L1 loss; %0.5f L2 loss)' %
                (i + 1, loss.item(), l1.item(), l2.item()))

  # Process is complete.
  print('Training process has finished.')

Starting epoch 1
Loss after mini-batch   500: 24.79406 (of which 19.77073 L1 loss; 2.74106 L2 loss)
Loss after mini-batch  1000: 7.52531 (of which 4.81137 L1 loss; 0.39591 L2 loss)
Loss after mini-batch  1500: 3.03654 (of which 0.69882 L1 loss; 0.03131 L2 loss)
Loss after mini-batch  2000: 2.48222 (of which 0.17963 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  2500: 2.48142 (of which 0.17883 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  3000: 2.48055 (of which 0.17796 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  3500: 2.47980 (of which 0.17721 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  4000: 2.47945 (of which 0.17687 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  4500: 2.47878 (of which 0.17619 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  5000: 2.47902 (of which 0.17643 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  5500: 2.47924 (of which 0.17665 L1 loss; 0.00001 L2 loss)
Loss after mini-batch  6000: 2.48023 (of which 0.17764 L1 loss; 0.00001 L2 loss)
Starting 