# Label Smoothing



## $$ p_k = \frac{e^{x^T w_k} }{ \sum_l ^L e^{x^T w_l}}$$

where $p_k$ is the likelihood the model assigns to the $k$-th class, $w_k$ represents the weights and bias of the last layer, x is the input vector of the last layer.


We minimize the expected value of the cross-entropy between hard targets $y_k$ and $p_k$ as in


## $$H(\mathbf{y},\mathbf{p}) = \sum_k ^K -y_k log(p_k),$$

where $y_k$ is 1 of the correct class and 0 for the rest.



Targets via label smoothing


## $$ y_k = y_k (1- \alpha) + \frac{\alpha}{K}$$

# Multi-Class Classification

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class LabelSmoothingLossCanonical(nn.Module):
    def __init__(self, smoothing=0.0, dim=-1):
        super(LabelSmoothingLossCanonical, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.dim = dim
    def forward(self, pred, target):
        # Log softmax is used for numerical stability
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            true_dist += self.smoothing / pred.size(self.dim)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

# Label Relaxation

: TODO

In [3]:

class LabelRelaxationLoss(nn.Module):
    def __init__(self, alpha=0.1, dim=-1, logits_provided=True, one_hot_encode_trgts=True, num_classes=-1):
        super(LabelRelaxationLoss, self).__init__()
        self.alpha = alpha
        self.dim = dim

        # Greater zero threshold
        self.gz_threshold = 0.1

        self.logits_provided = logits_provided
        self.one_hot_encode_trgts = one_hot_encode_trgts

        self.num_classes = num_classes

    def forward(self, pred, target):
        if self.logits_provided:
            pred = pred.softmax(dim=self.dim)

        # with torch.no_grad():
        # Apply one-hot encoding to targets
        if self.one_hot_encode_trgts:
            target = F.one_hot(target, num_classes=self.num_classes)

        sum_y_hat_prime = torch.sum((torch.ones_like(target) - target) * pred, dim=-1)
        pred_hat = self.alpha * pred / torch.unsqueeze(sum_y_hat_prime, dim=-1)
        target_credal = torch.where(target > self.gz_threshold, torch.ones_like(target) - self.alpha, pred_hat)
        divergence = nn.functional.kl_div(pred.log(), target_credal, log_target=False)

        pred = torch.sum(pred * target, dim=-1)

        result = torch.where(torch.gt(pred, 1. - self.alpha), torch.zeros_like(divergence), divergence)
        return torch.mean(result)

# Dummy Multi-label Classification

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, Y = make_classification()
X
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [5]:
X_train.shape,y_train.shape

((67, 20), (67,))

# Affine Transformation

In [6]:
def eval_model(model,X,y):
    X = torch.FloatTensor(X)
    with torch.no_grad():
        preds=torch.sigmoid(model(X))
        return accuracy_score(torch.argmax(preds,axis=1).numpy(), y)
def train(X_train, X_test, y_train, y_test,label_smoothing=0.0):
    X_train_torch = torch.FloatTensor(X_train)
    y_train_torch = torch.tensor(y_train,dtype=torch.long)
    model = nn.Linear(20, 2) 
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
    for epoch in range(500):
        optimizer.zero_grad()
        output = torch.sigmoid(model(X_train_torch))
        loss = criterion(output, y_train_torch)
        loss.backward()
        optimizer.step()
        if epoch % 100 ==0:
            print('Loss: {:.3f}'.format(loss.item()))
    print(f'Train Accuracy:{eval_model(model,X_test,y_test):.3f}')
    
    


In [7]:
train(X_train, X_test, y_train, y_test)

Loss: 0.714
Loss: 0.478
Loss: 0.436
Loss: 0.415
Loss: 0.402
Train Accuracy:0.667


In [8]:
train(X_train, X_test, y_train, y_test,label_smoothing=.1)

Loss: 0.708
Loss: 0.505
Loss: 0.471
Loss: 0.454
Loss: 0.443
Train Accuracy:0.667


In [9]:
train(X_train, X_test, y_train, y_test,label_smoothing=.2)

Loss: 0.696
Loss: 0.539
Loss: 0.508
Loss: 0.493
Loss: 0.484
Train Accuracy:0.606


In [10]:
train(X_train, X_test, y_train, y_test,label_smoothing=.5)

Loss: 0.716
Loss: 0.631
Loss: 0.612
Loss: 0.603
Loss: 0.599
Train Accuracy:0.667


In [11]:
def train_lr(X_train, X_test, y_train, y_test,alpha=0.0):
    X_train_torch = torch.FloatTensor(X_train)
    y_train_torch = torch.tensor(y_train,dtype=torch.long)
    model = nn.Linear(20, 2) 
    criterion = LabelRelaxationLoss(alpha=alpha)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
    for epoch in range(500):
        optimizer.zero_grad()
        output = torch.sigmoid(model(X_train_torch))
        loss = criterion(output, y_train_torch)
        loss.backward()
        optimizer.step()
        if epoch % 100 ==0:
            print('Loss: {:.3f}'.format(loss.item()))
    print(f'Train Accuracy:{eval_model(model,X_test,y_test):.3f}')
    
    
train_lr(X_train, X_test, y_train, y_test)

Loss: 0.355
Loss: 0.267
Loss: 0.238




Loss: 0.224
Loss: 0.216
Train Accuracy:0.667


In [12]:
train_lr(X_train, X_test, y_train, y_test,alpha=.1)

Loss: 0.195
Loss: 0.123
Loss: 0.103
Loss: 0.095
Loss: 0.089
Train Accuracy:0.606


In [13]:
train_lr(X_train, X_test, y_train, y_test,alpha=.2)

Loss: 0.101
Loss: 0.064
Loss: 0.050
Loss: 0.044
Loss: 0.040
Train Accuracy:0.576


In [14]:
train_lr(X_train, X_test, y_train, y_test,alpha=.5)

Loss: 0.001
Loss: 0.001
Loss: 0.001
Loss: 0.001
Loss: 0.001
Train Accuracy:0.545
