# CENG501 - PROJECT
# FROM LABEL SMOOTHING TO LABEL RELAXATION
Simple 2-Layer architecture
Lienen, J.; Hüllermeier, E. 2021. From Label Smoothing to Label Relaxation. Proceedings of the AAAI Conference on Artificial Intelligence, 35(10), 8583-8591. Retrieved from https://ojs.aaai.org/index.php/AAAI/article/view/17041


## 1 Import the Modules

Here, we import some libraries that we will use throughout the implementation.

In [None]:
import matplotlib.pyplot as plt # For plotting
import numpy as np              # NumPy, for working with arrays/tensors 
import time                     # For measuring time

# PyTorch libraries:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

import tensorflow as tf

import keras
from keras import datasets
from sklearn.model_selection import train_test_split

### 1.1 Enable GPU

From "Edit -> Notebook Settings -> Hardware accelerator" select GPU. With the following we will specify to PyTorch that we want to use the GPU.

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which of the two devices was picked.
print(
    "Cuda (GPU support) is available and enabled!"
    if device.type == "cuda"
    else "Cuda (GPU support) is not available :("
)

Cuda (GPU support) is available and enabled!


## 2 The Dataset

We will use Keras datasets to download the MNIST dataset. 

In [None]:
batch_size = 64

# Download MNIST through Keras: (train images, train labels), (test images, test labels).
(x_train_ini, y_train_ini), (x_test_ini, y_test_ini) = datasets.mnist.load_data()

# Append a trailing axis to the image arrays (they are flattened again below).
x_train_ini = np.expand_dims(x_train_ini, axis=-1)
x_test_ini = np.expand_dims(x_test_ini, axis=-1)

# Fractions used for the two successive splits below.
x_test_ratio = 1/7   # share of the pooled data held out for testing
x_val_ratio = 1/6    # share of the remaining data held out for validation

# Pool the original Keras train/test partitions and re-split them ourselves.
mnist_x = np.concatenate((x_train_ini, x_test_ini))
mnist_y = np.concatenate((y_train_ini, y_test_ini))

# First split (shuffled, fixed seed): train+val versus test.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    mnist_x, mnist_y, test_size=x_test_ratio, random_state=0, shuffle=True)

# Second split (no further shuffling; the data was shuffled above): train versus val.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=x_val_ratio, random_state=0, shuffle=False)

#### preparing data for experiments ####################


def _flatten_and_scale(images):
    """Return `images` as float32 rows of 28*28 pixels scaled by 1/255."""
    return images.astype('float32').reshape(-1, 28*28) / 255.


# Training split: centred with its own per-pixel mean.
x_train = _flatten_and_scale(x_train)
pixel_mean_train = np.mean(x_train, axis=0)
x_train -= pixel_mean_train  # subtracting mean

# Validation split: models validated on it are fitted on x_train, so it must be
# centred with the *training* mean. Using the validation set's own mean would
# feed the model inputs preprocessed differently from what it was trained on.
x_val = _flatten_and_scale(x_val)
pixel_mean_val = pixel_mean_train  # statistics applied to the validation split
x_val -= pixel_mean_val  # subtracting mean

# combining training and validation sets to be used in the final training
x_train_val = _flatten_and_scale(x_train_val)
pixel_mean_trainval = np.mean(x_train_val, axis=0)
x_train_val -= pixel_mean_trainval  # subtracting mean

# Test split: the final models are fitted on x_train_val, so the test inputs are
# centred with the train+val mean rather than with test-set statistics.
x_test = _flatten_and_scale(x_test)
pixel_mean_test = pixel_mean_trainval  # statistics applied to the test split
x_test -= pixel_mean_test  # subtracting mean

## 3 Label Relaxation Loss Definition

In [None]:
class Label_Relaxation(nn.Module):
    """Label relaxation loss (Lienen & Hüllermeier, AAAI 2021).

    For each sample the loss is zero when the predicted probability of the
    target class exceeds ``1 - alpha``. Otherwise it is the KL divergence
    between a "relaxed" target distribution and the prediction, where the
    relaxed target puts ``1 - alpha`` on the true class and spreads ``alpha``
    over the remaining classes proportionally to the predicted probabilities.

    Args:
        alpha: relaxation amount; mass taken away from the true class.
        num_classes: number of classes used for the one-hot encoding.
        toll: threshold applied to the one-hot targets to obtain the boolean
            true-class mask.
        eps: lower bound for the off-target probability mass, guarding the
            normalisation against division by zero.
    """

    def __init__(self, alpha=0.5, num_classes=10, toll=0.05, eps=1e-12):
        super().__init__()

        self.alpha = alpha
        self.num_classes = num_classes
        self.toll = toll
        self.eps = eps

    def forward(self, y_hat, y):  # scores, targets
        """Return the mean label-relaxation loss.

        Args:
            y_hat: raw class scores (no softmax applied), classes on the last axis.
            y: integer class indices (LongTensor) accepted by ``F.one_hot``.

        Returns:
            Scalar tensor: loss averaged over all samples.
        """
        one_hot = F.one_hot(y, self.num_classes)

        # log_softmax keeps the log-probabilities finite even when a class
        # probability underflows to zero (softmax followed by log gives -inf).
        log_probs = F.log_softmax(y_hat, dim=-1)
        probs = log_probs.exp()

        # The relaxed target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            is_target = one_hot > self.toll
            off_target_mass = probs.masked_fill(is_target, 0.0).sum(dim=-1, keepdim=True)
            # Saturated predictions have zero off-target mass; clamp so that the
            # normalisation below yields 0 instead of 0/0 = NaN.
            off_target_mass = off_target_mass.clamp_min(self.eps)
            relaxed_target = torch.where(
                is_target,
                torch.full_like(probs, 1.0 - self.alpha),
                self.alpha * probs / off_target_mass,
            )

        # Kullback-Leibler divergence KL(relaxed_target || prediction), per sample.
        d_kl = F.kl_div(log_probs, relaxed_target, reduction='none', log_target=False).sum(dim=-1)

        # Predicted probability of the true class.
        target_prob = (one_hot * probs).sum(dim=-1)

        # Samples that are already confident enough contribute no loss.
        per_sample = torch.where(target_prob > 1.0 - self.alpha, torch.zeros_like(d_kl), d_kl)

        return per_sample.mean()

## 4 Define Model



### 4.1 Model Definition

Our simple dense model consists of two fully connected layers: one hidden layer with ReLU activation followed by a linear output layer. It is inspired by the PyTorch examples.

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TwoLayerNet(nn.Module):
    """Dense classifier computing ``fc2(ReLU(fc1(x)))``.

    Args:
        D: size of each input vector.
        H: number of units in the hidden layer.
        C: number of output scores (classes).
    """

    def __init__(self, D, H, C):
        super().__init__()
        # Re-seed the global PyTorch RNG right before the layers are created,
        # so that instances built with the same sizes get the same initial weights.
        torch.manual_seed(501)

        self.fc1 = nn.Linear(D, H)
        self.fc2 = nn.Linear(H, C)
        self.Relu = nn.ReLU()

    def forward(self, x):
        """Return the raw class scores for `x` (no softmax applied)."""
        hidden = self.Relu(self.fc1(x))
        return self.fc2(hidden)

### 4.2 Define criterion and the Optimizer



In [None]:

# Problem sizes shared by the model and the loss.
D = 28 * 28  # dimensionality of one flattened input
C = 10       # number of classes
H = 1024     # number of hidden neurons

# Available loss choices (bare string literals; they have no runtime effect):
" for Cross_Entropy: criterion = nn.CrossEntropyLoss() , where alpha=0 "
" for Label_Smoothing: criterion = nn.CrossEntropyLoss(label_smoothing = alpha) , where alpha= 0.0 ~ 1.0 "
" for Label_Relaxation: criterion = Label_Relaxation(alpha = 0.0 ~ 1.0, num_classes = C) "

# Instantiate the network together with the loss and the optimizer that updates it.
model = TwoLayerNet(D, H, C)
criterion = Label_Relaxation(num_classes=C, alpha=0.1)
optimizer = optim.SGD(
    model.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=1e-5,
)

### 4.3 Define Trainer with batch function


In [None]:
def sample_batch(X, y, batch_size):
    """Return a random mini-batch ``(X_batch, y_batch)`` of `batch_size` rows.

    Row indices are drawn with ``np.random.choice`` using its defaults, and the
    same indices are applied to `X` and `y` so samples stay paired with labels.
    """
    picked = np.random.choice(X.shape[0], size=batch_size)
    return X[picked], y[picked]

def train(model, criterion, optimizer, epochs, x, y, batch_size, verbose=True):
    """Train `model` on randomly sampled mini-batches and return the loss history.

    Each epoch consists of ``x.shape[0] // batch_size`` iterations; every
    iteration draws a fresh random batch with ``sample_batch``.

    Args:
        model: network to optimise; batches are moved to the device of its
            parameters (CPU if it has none).
        criterion: callable ``criterion(scores, labels)`` returning a scalar loss.
        optimizer: optimizer already bound to the model's parameters.
        epochs: number of epochs to run.
        x: array of inputs, one sample per row.
        y: array of integer labels aligned with `x`.
        batch_size: number of samples per iteration.
        verbose: when true, print the last iteration's loss after every epoch.

    Returns:
        List with the loss value (Python float) of every iteration.
    """
    # Follow the model's own device instead of relying on a module-level global,
    # so inputs always end up where the parameters live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_history = []
    iters_per_epoch = x.shape[0] // batch_size

    for epoch in range(epochs):
        for _ in range(iters_per_epoch):
            # Get a batch of samples
            inputs, labels = sample_batch(x, y, batch_size)

            inputs = torch.as_tensor(inputs, dtype=torch.float32).to(target_device)
            labels = torch.as_tensor(labels, dtype=torch.long).reshape(batch_size).to(target_device)

            # zero the gradients as PyTorch accumulates them
            optimizer.zero_grad()

            # Obtain the scores and the loss
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backpropagate and update the weights
            loss.backward()
            optimizer.step()

            loss_history.append(loss.item())

        if verbose:
            # With fewer samples than one batch no iteration runs; report NaN
            # rather than failing on an empty history.
            last_loss = loss_history[-1] if loss_history else float("nan")
            print(f'Epoch {epoch} / {epochs}:  loss of last iteration {last_loss}')

    return loss_history

### 5 ECE Loss Definition 'Expected Calibration Error (ECE)'
The Expected Calibration Error (ECE) loss function is copied from https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py


In [None]:
import torch
from torch import nn, optim
from torch.nn import functional as F

class ECELoss(nn.Module):
    """Expected Calibration Error (ECE) of a set of logits.

    Samples are grouped into equal-width bins by the softmax probability of the
    predicted class. Each non-empty bin contributes
    ``|mean confidence - accuracy|`` weighted by the fraction of samples that
    fall into it.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels):
        """Return the ECE as a tensor of shape ``(1,)`` on the device of `logits`."""
        probabilities = F.softmax(logits, dim=1)
        confidences, predictions = probabilities.max(dim=1)
        correct = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            # Bins are half-open intervals (lower, upper].
            in_bin = (confidences > lower) & (confidences <= upper)
            weight = in_bin.float().mean()
            if weight.item() > 0:
                bin_accuracy = correct[in_bin].float().mean()
                bin_confidence = confidences[in_bin].mean()
                ece += (bin_confidence - bin_accuracy).abs() * weight

        return ece


### 5.1 Creating a training instance
For temperature scaling, we first train a model with fixed initial hyperparameters and then search for the optimal temperature T on the validation set.

In [None]:
# Hyperparameters held fixed for this training run.
lr = 0.05
alpha = 0.2
epochs = 25

# Network sizes.
D = 28 * 28  # dimensionality
C = 10       # num of classes
H = 1024     # number of hidden neurons

# Build the network on the selected device, then the loss and the optimizer.
model = TwoLayerNet(D, H, C).to(device)
criterion = Label_Relaxation(num_classes=C, alpha=alpha)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-5,
)

# Fit on the training split only; the validation split is kept for tuning.
loss_history = train(model, criterion, optimizer, epochs, x_train, y_train, batch_size)

### 5.2 Tuning model for Temperature Scaling (temperature calibration)

In [None]:
from sklearn.metrics import accuracy_score # import  sklearn.metrics to calculate the accuracy scores

Temp_Scaling = True

if Temp_Scaling:
    T = [0.25, 0.5, 0.75, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.5, 3.0]
else:
    T = [1.]  # in case of no temperature scaling just fix T to 1.

# Best-so-far bookkeeping for accuracy (higher is better).
results_a = {}
best_val_a = -1
best_lr_a = None
best_t_a = None
best_temperature_a = None  # defined up front so the summary print cannot hit an unset name
best_model_a = None

# Best-so-far bookkeeping for ECE (lower is better).
results_e = {}
best_val_e = 10**6
best_lr_e = None
best_t_e = None
best_temperature_e = None
best_model_e = None

num_val, dim1 = x_val.shape

cal_model = ECELoss(n_bins = 15)

with torch.no_grad():
    inputs = torch.Tensor(x_val).to(device)
    labels = torch.LongTensor(y_val).to(device)
    labels_cpu = labels.cpu()

    # The raw scores do not depend on the temperature, so the network is
    # evaluated once and only the rescaling happens inside the loop.
    logits = model(inputs)

    for temperature in T:
        outputs = logits / temperature

        # Calculate validation ECE error
        ece = cal_model(outputs, labels)
        val_error = float(ece[0])

        # Calculate validation accuracies
        # NOTE(review): dividing the scores by a positive temperature does not
        # change their argmax, so val_accuracy is identical for every T and the
        # strict '<' below keeps the first entry of T as the "best" one.
        _, predicted = torch.max(outputs, 1)
        val_accuracy = accuracy_score(labels_cpu, predicted.cpu())

        print(f"learning rate={lr} and temperature={temperature} provided val_accuracy={val_accuracy:.5f}")
        print(f"learning rate={lr} and temperature={temperature} provided val_error={val_error:.5f}")

        # Save the results for accuracy
        results_a[(lr, temperature)] = (val_accuracy)
        if best_val_a < val_accuracy:
            best_lr_a = lr
            best_t_a = best_temperature_a = temperature
            best_val_a = val_accuracy
            best_model_a = model

        # Save the results for ECE
        results_e[(lr, temperature)] = (val_error)
        if best_val_e > val_error:
            best_lr_e = lr
            best_t_e = best_temperature_e = temperature
            best_val_e = val_error
            best_model_e = model

print(f'\nbest validation accuracy achieved during cross-validation: {best_val_a:.3f} with params temperature= {best_temperature_a} and lr={best_lr_a}')
print(f'\nbest validation error achieved during cross-validation: {best_val_e:.3f} with params temperature= {best_temperature_e} and lr={best_lr_e}')

### 5.3 Tuning other parameters with fixed temperature scale



In [None]:
from sklearn.metrics import accuracy_score

epochs = 25
temperature_a = best_temperature_a # opt. temperature for accuracy
temperature_e = best_temperature_e # opt. temperature for error

# Search grid: four learning rates (each 0.3x the previous) and seven alphas.
lr_range = [0.05, 0.05*(0.3), 0.05*(0.3**2),  0.05*(0.3**3) ]

alpha_range = [0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4]

# model definition
D = 28*28 # dimensionality
C = 10 # num of classes
H = 1024 # number of hidden neurons

# creating storage for results (accuracy: higher is better)
results_a = {}
best_val_a = -1
best_lr_a = None
best_alpha_a = None
best_t_a = None
best_model_a = None

# creating storage for results (ECE: lower is better)
results_e = {}
best_val_e = 10**6
best_lr_e = None
best_alpha_e = None
best_t_e = None
best_model_e = None

# validation set properties
num_val, dim1 = x_val.shape

# The validation tensors and the ECE module do not change between
# configurations, so they are prepared once.
inputs = torch.Tensor(x_val).to(device)
labels = torch.LongTensor(y_val).to(device)
labels_cpu = labels.cpu()
cal_model = ECELoss(n_bins = 15)

for lr in lr_range:

    for alpha in alpha_range:

        # Every configuration starts from a freshly initialised network.
        # Re-using one model across the grid would let each run continue from
        # the weights of the previous one and make the scores incomparable.
        model = TwoLayerNet(D, H, C).to(device)

        #criterion = nn.CrossEntropyLoss()
        criterion = Label_Relaxation(alpha = alpha, num_classes = C)
        optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-5, momentum = 0.9)

        loss_history = train(model, criterion, optimizer, epochs, x_train, y_train, batch_size)

        # Evaluation only: no autograd graph is needed for the validation pass.
        with torch.no_grad():
            outputs = model(inputs)

        outputs_a = outputs/temperature_a
        outputs_e = outputs/temperature_e

        # Calculate validation ECE error
        ece = cal_model(outputs_e, labels)
        val_error = float(ece[0])

        # Calculate validation accuracies
        _, predicted = torch.max(outputs_a, 1)
        val_accuracy = accuracy_score(labels_cpu, predicted.cpu())

        # Save the results for accuracy, keyed by the configuration that was
        # actually varied (lr, alpha).
        results_a[(lr, alpha)] = (val_accuracy)
        if best_val_a < val_accuracy:
            best_lr_a = lr
            best_alpha_a = alpha
            best_val_a = val_accuracy
            best_model_a = model

        # Save the results for ECE
        results_e[(lr, alpha)] = (val_error)
        if best_val_e > val_error:
            best_lr_e = lr
            best_alpha_e = alpha
            best_val_e = val_error
            best_model_e = model

print(f'\nbest validation accuracy achieved during cross-validation: {best_val_a:.3f} with params alpha= {best_alpha_a} and lr={best_lr_a}')
print(f'\nbest validation error achieved during cross-validation: {best_val_e:.3f} with params alpha= {best_alpha_e} and lr={best_lr_e}')

### 6 Final training of the model for accuracy

*Disclaimer: This code piece is taken from PyTorch examples.*

In [None]:

# Final training run with the hyperparameters that gave the best validation accuracy.
# temperature_a / temperature_e are not reassigned in this cell.
lr, alpha = best_lr_a, best_alpha_a
epochs = 25

# Network sizes.
D = 28 * 28  # dimensionality
C = 10       # num of classes
H = 1024     # number of hidden neurons

# Fresh network on the selected device, with its loss and optimizer.
model = TwoLayerNet(D, H, C).to(device)
criterion = Label_Relaxation(num_classes=C, alpha=alpha)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-5,
)

# Fit on the combined training + validation data.
loss_history = train(model, criterion, optimizer, epochs, x_train_val, y_train_val, batch_size)

### 6.1 Calculating test accuracy/error with Accuracy

In [None]:
# Move the held-out test split onto the compute device.
inputs = torch.Tensor(x_test).to(device)
labels = torch.LongTensor(y_test).to(device)

# Evaluation only: disable autograd so no graph is built for the whole test set.
with torch.no_grad():
    outputs = model(inputs)

# NOTE(review): both the accuracy and the ECE scores are scaled with
# temperature_a here -- presumably intentional, since this section evaluates
# the accuracy-selected configuration; confirm.
outputs_a = outputs/temperature_a
outputs_e = outputs/temperature_a

# Calculate test ECE error
cal_model = ECELoss(n_bins = 15)
ece = cal_model(outputs_e, labels)
test_error = float(ece[0])

# Calculate test accuracy
_, predicted = torch.max(outputs_a, 1)
test_accuracy = accuracy_score(labels.cpu(), predicted.cpu())
print(f'\nbest test accuracy achieved using Accuracy hyperparameters : {test_accuracy:.3f} with params alpha= {best_alpha_a} and lr={best_lr_a}')
print(f'\nbest test error achieved using Accuracy hyperparameters : {test_error:.3f} with params alpha= {best_alpha_a} and lr={best_lr_a}')

### 7 Final training of the model for ECE

In [None]:
# Final training run with the hyperparameters that gave the lowest validation ECE.
lr, alpha = best_lr_e, best_alpha_e
epochs = 25

# Network sizes.
D = 28 * 28  # dimensionality
C = 10       # num of classes
H = 1024     # number of hidden neurons

# Fresh network on the selected device, with its loss and optimizer.
model = TwoLayerNet(D, H, C).to(device)
criterion = Label_Relaxation(num_classes=C, alpha=alpha)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-5,
)

# Fit on the combined training + validation data.
loss_history = train(model, criterion, optimizer, epochs, x_train_val, y_train_val, batch_size)

### 7.1 Calculating test accuracy/error with ECE

In [None]:
# Move the held-out test split onto the compute device.
inputs = torch.Tensor(x_test).to(device)
labels = torch.LongTensor(y_test).to(device)

# Evaluation only: disable autograd so no graph is built for the whole test set.
with torch.no_grad():
    outputs = model(inputs)

# NOTE(review): both the accuracy and the ECE scores are scaled with
# temperature_e here -- presumably intentional, since this section evaluates
# the ECE-selected configuration; confirm.
outputs_a = outputs/temperature_e
outputs_e = outputs/temperature_e

# Calculate test ECE error
cal_model = ECELoss(n_bins = 15)
ece = cal_model(outputs_e, labels)
test_error = float(ece[0])

# Calculate test accuracy
_, predicted = torch.max(outputs_a, 1)
test_accuracy = accuracy_score(labels.cpu(), predicted.cpu())
print(f'\nbest test accuracy achieved using ECE hyperparameters : {test_accuracy:.3f} with params alpha= {best_alpha_e} and lr={best_lr_e}')
print(f'\nbest test error achieved using ECE hyperparameters : {test_error:.3f} with params alpha= {best_alpha_e} and lr={best_lr_e}')