[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# Deep Learning Methods

## Deep Learning - Label Smoothing

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 04/06/2024 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0096DeepLearningLabelSmoothing.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Deep Learning
import torch
import torch.nn            as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torchinfo
from torchmetrics.classification import MulticlassAccuracy
import torchvision
from torchvision.transforms import v2 as TorchVisionTrns

# Miscellaneous
import copy
import os
from platform import python_version
import random

# Typing
from typing import Callable, Dict, Generator, List, Optional, Self, Set, Tuple, Union

# Visualization
import matplotlib.pyplot as plt

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seed NumPy and Python's `random` generators.
# PyTorch's generator is not seeded here (See the commented out lines at the end of the cell).
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

runInGoogleColab = 'google.colab' in str(get_ipython()) #<! `True` when the description of the IPython shell mentions Google Colab

# Improve performance by benchmarking
torch.backends.cudnn.benchmark = True

# Reproducibility (Per PyTorch Version on the same device)
# torch.manual_seed(seedNum)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark     = False #<! Makes things slower

In [None]:
# Constants

# Plot defaults
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Folder names
DATA_FOLDER_NAME         = 'DataSets'
TENSOR_BOARD_FOLDER_NAME = 'TB'
BASE_FOLDER_NAME         = 'FixelCourses'

# The working directory, cut right after the last (Case insensitive) occurrence of `BASE_FOLDER_NAME`.
# Searching the reversed strings yields the distance of the end of that occurrence from the end of the path.
# When the name is not found `find()` returns -1 and the slice keeps the whole path.
BASE_FOLDER_PATH = os.getcwd()[:len(os.getcwd()) - os.getcwd()[::-1].lower().find(BASE_FOLDER_NAME.lower()[::-1])]

# CIFAR 10: Class names ordered by the class index
L_CLASSES_CIFAR_10  = ['Airplane', 'Automobile', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']
D_CLASSES_CIFAR_10  = dict(enumerate(L_CLASSES_CIFAR_10)) #<! Class index -> Class name
T_IMG_SIZE_CIFAR_10 = (32, 32, 3)

# The active data set of the notebook
T_IMG_SIZE = T_IMG_SIZE_CIFAR_10
L_CLASSES  = L_CLASSES_CIFAR_10

In [None]:
# Download Auxiliary Modules for Google Colab
# Fetches the course's auxiliary modules from the course repository into the working directory.
# The `!` prefix is an IPython shell escape, hence this cell is valid only within a notebook.
if runInGoogleColab:
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DataManipulation.py
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DataVisualization.py
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DeepLearningPyTorch.py

In [None]:
# Courses Packages

from DataVisualization import PlotLabelsHistogram, PlotMnistImages
from DeepLearningPyTorch import ResidualBlock, TBLogger, TestDataSet
from DeepLearningPyTorch import InitWeightsKaiNorm, TrainModel, TrainModelSch

In [None]:
# General Auxiliary Functions

def GenResNetModel( trainedModel: bool, numCls: int, resNetDepth: int = 18 ) -> nn.Module:
    """
    Generates a TorchVision ResNet classifier with `numCls` outputs.
    Input:
        trainedModel - If `True`, the model is loaded with the `IMAGENET1K_V1` weights and its `fc` head
                       is replaced by a 2 layers MLP (Hidden width of 128). If `False`, the model is built
                       without pre trained weights with `numCls` outputs.
        numCls       - The number of classes (Outputs of the model).
        resNetDepth  - The depth of the ResNet model: 18 or 34.
    Output:
        oModel       - The generated model.
    Remarks:
      - Raises `ValueError` for an unsupported `resNetDepth`.
      - Read on the API change at: How to Train State of the Art Models Using TorchVision’s Latest Primitives
        https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives
    """

    # Guard: Only the 18 / 34 layers variants are supported
    if resNetDepth not in (18, 34):
        raise ValueError(f'The `resNetDepth`: {resNetDepth} is invalid!')

    if (resNetDepth == 18):
        hModelGen    = torchvision.models.resnet18
        modelWeights = torchvision.models.ResNet18_Weights.IMAGENET1K_V1
    else:
        hModelGen    = torchvision.models.resnet34
        modelWeights = torchvision.models.ResNet34_Weights.IMAGENET1K_V1

    if not trainedModel:
        # No pre trained weights: The head is sized directly by the constructor
        return hModelGen(weights = None, num_classes = numCls)

    # Pre trained: Keep the backbone, replace the head
    oModel        = hModelGen(weights = modelWeights)
    numFeaturesIn = oModel.fc.in_features
    # Assuming numCls << 100
    lHeadLayers = [nn.Linear(numFeaturesIn, 128), nn.ReLU(), nn.Linear(128, numCls)]
    oModel.fc   = nn.Sequential(*lHeadLayers)

    return oModel

## Label Smoothing

The motivation for _Label Smoothing_ is avoiding numerical issues related to the $\log$ function of the _Cross Entropy_ loss.

What's the contribution of Label Smoothing:

 - Makes the model less sensitive to "noisy labeling" (By limiting the loss).  
 - Regularizes the overfitting on correct examples.
 - Regularizes the "confidence" of the model and improves its calibration.

This notebook demonstrates the use of _Label Smoothing_ for image classification.

<br>

* <font color='brown'>(**#**)</font> _Label Smoothing_ is less effective in Binary Classification.  
  As its main contribution is by "clustering" the wrong labels together with equal probability it has little effect for the binary case.
* <font color='brown'>(**#**)</font> See
    * [When Does Label Smoothing Help](https://arxiv.org/abs/1906.02629).
    * [Delving Deep into Label Smoothing](https://arxiv.org/abs/2011.12562).
* <font color='brown'>(**#**)</font> [What is Label Smoothing](https://scribe.rip/108debd7ef06).
* <font color='brown'>(**#**)</font> [Lei Mao - Label Smoothing](https://leimao.github.io/blog/Label-Smoothing/).

In [None]:
# Parameters

# Data
dataFolderPath = os.path.join(BASE_FOLDER_PATH, DATA_FOLDER_NAME) #<! Root folder of the data sets

# Model
dropP = 0.5 #<! Dropout Layer (NOTE(review): Not referenced by the visible cells of the notebook)

# Training
batchSize   = 256 #<! Train batch size (The validation loader uses twice as much)
numWorkers  = 4 #<! Number of workers
numEpochs   = 25 #<! Number of epochs

# Visualization
numImg = 3 #<! Passed to `PlotMnistImages()`

## Generate / Load Data

Load the [CIFAR 10 Data Set](https://en.wikipedia.org/wiki/CIFAR-10).  
It is composed of 60,000 RGB images of size `32x32` with 10 classes uniformly spread.

* <font color='brown'>(**#**)</font> The dataset is retrieved using [Torch Vision](https://pytorch.org/vision/stable/index.html)'s built in datasets.  


In [None]:
# Load Data
# The `ToTensor()` transform set here is replaced later in the notebook, once the normalization statistics are calculated.

dsTrain = torchvision.datasets.CIFAR10(root = dataFolderPath, train = True,  download = True, transform = torchvision.transforms.ToTensor())
dsVal   = torchvision.datasets.CIFAR10(root = dataFolderPath, train = False, download = True, transform = torchvision.transforms.ToTensor()) #<! The official test split, used as the validation set
lClass  = dsTrain.classes #<! presumably the names of the classes - verify against TorchVision's `CIFAR10`

print(f'The training data set data shape: {dsTrain.data.shape}')
print(f'The test data set data shape: {dsVal.data.shape}')
print(f'The unique values of the labels: {np.unique(lClass)}')

* <font color='brown'>(**#**)</font> The dataset is indexable (Subscriptable). It returns a tuple of the features and the label.
* <font color='brown'>(**#**)</font> While data is arranged as `H x W x C` the transformer, when accessing the data, will convert it into `C x H x W`. 

In [None]:
# Element of the Data Set
# Indexing the data set returns a tuple: The (Transformed) image and its label.

mX, valY = dsTrain[0]

print(f'The features shape: {mX.shape}')
print(f'The label value: {valY}')

### Plot the Data

In [None]:
# Extract Data
# The raw data (No transform applied), used for visualization.

tX = dsTrain.data #<! NumPy Tensor (NDarray)
mX = np.reshape(tX, (tX.shape[0], -1)) #<! Each row is a flattened image
vY = dsTrain.targets #<! Labels. NOTE(review): TorchVision's `CIFAR10` seems to keep `targets` as a Python list, not a NumPy vector - confirm

In [None]:
# Plot the Data
# `PlotMnistImages()` is a course helper (`DataVisualization`), it gets the flattened images and the image size to restore them.

hF = PlotMnistImages(mX, vY, numImg, tuImgSize = T_IMG_SIZE)

In [None]:
# Histogram of Labels
# `PlotLabelsHistogram()` is a course helper (`DataVisualization`), the class names are passed for display.

hA = PlotLabelsHistogram(vY, lClass = L_CLASSES);

* <font color='red'>(**?**)</font> If data is converted into _grayscale_, how would it affect the performance of the classifier? Explain.  
  You may assume the conversion is done using the mean value of the RGB pixel.

## Pre Process Data

This section normalizes the data to have zero mean and unit variance per **channel**.  
It is required to calculate:

 * The average pixel value per channel.
 * The standard deviation per channel.

<br>

* <font color='brown'>(**#**)</font> The values are calculated on the train set and applied to both sets.
* <font color='brown'>(**#**)</font> The statistics will be used to pre process the images on loading by the `transformer`.
* <font color='brown'>(**#**)</font> There are packages which specialize in transforms: [`Kornia`](https://github.com/kornia/kornia), [`Albumentations`](https://github.com/albumentations-team/albumentations).  
  They are commonly used for _Data Augmentation_ at scale.

In [None]:
# Calculate the Standardization Parameters
# Per channel statistics of the pixels, scaled into [0, 1], reduced over the samples, rows and columns.
# Both statistics are estimated on the train set only and later applied to the train and validation sets.
vMean = np.mean(dsTrain.data / 255.0, axis = (0, 1, 2))
vStd  = np.std(dsTrain.data / 255.0, axis = (0, 1, 2))

print('µ =', vMean)
print('σ =', vStd)

In [None]:
# Update Transformer
# Train    : Augmentations, conversion into `float32` and normalization by the calculated statistics.
# Inference: Conversion into `float32` and normalization only (No augmentation).

oTrnsTrain = TorchVisionTrns.Compose([
    TorchVisionTrns.ToImage(),
    TorchVisionTrns.RandomHorizontalFlip(), #<! Can be done in UINT8 for faster performance
    TorchVisionTrns.AutoAugment(policy = TorchVisionTrns.AutoAugmentPolicy.CIFAR10), #<! Requires `UINT8`
    TorchVisionTrns.ToDtype(torch.float32, scale = True),
    TorchVisionTrns.Normalize(vMean, vStd)
])

oTrnsInfer = TorchVisionTrns.Compose([
    TorchVisionTrns.ToImage(),
    TorchVisionTrns.ToDtype(torch.float32, scale = True),
    TorchVisionTrns.Normalize(vMean, vStd)
])

# Update the DS transformer
# Replaces the `ToTensor()` transform set when the data sets were created
dsTrain.transform   = oTrnsTrain
dsVal.transform     = oTrnsInfer

In [None]:
# "Normalized" Image
# Showing the images with the transformation applied
# The same sample is drawn for each axes, the displayed images differ only by the random augmentation.

imgIdx = 5

N, H, W, C = dsTrain.data.shape #<! NOTE(review): Not used by the rest of this cell
hF, vHA = plt.subplots(2, 3, figsize = (8, 4))
vHA = vHA.flat
for hA in vHA:
    mX, valY = dsTrain[imgIdx] #<! Random
    mX = torch.permute(mX, (1, 2, 0)) #<! (C x H x W) -> (H x W x C) for display
    mX = torch.clip(mX * torch.tensor(vStd[None, None, :]) + torch.tensor(vMean[None, None, :]), min = 0.0, max = 1.0) #<! Undo the normalization, clip into [0, 1]
    hA.imshow(mX.numpy())
    hA.set_title(f'True label: {L_CLASSES[valY]}')
    
hF.tight_layout()

### Data Loaders

This section defines the data loaders.



In [None]:
# Data Loader
# The number of worker processes is set by the `numWorkers` parameter.
# Persistent workers are valid only with at least a single worker, hence the guard.
# Set `numWorkers = 0` to load the data in the main process (Useful for debugging).
# The train loader drops the last partial batch, the validation loader uses twice the batch size.

dlTrain = torch.utils.data.DataLoader(dsTrain, shuffle = True, batch_size = 1 * batchSize, num_workers = numWorkers, drop_last = True, persistent_workers = (numWorkers > 0))
dlVal   = torch.utils.data.DataLoader(dsVal, shuffle = False, batch_size = 2 * batchSize, num_workers = numWorkers, persistent_workers = (numWorkers > 0))

In [None]:
# Iterate on the Loader
# The first batch.
# Creating the iterator pulls a single (Shuffled) batch of images and labels from the train loader.
tX, vY = next(iter(dlTrain)) #<! PyTorch Tensors

print(f'The batch features dimensions: {tX.shape}')
print(f'The batch labels dimensions: {vY.shape}')

## Load the Model

This section loads the model.  
The number of outputs is adjusted to match the number of classes in the data.

In [None]:
# Loading a Pre Defined Model
# ResNet of the default depth (18), no pre trained weights (`trainedModel = False`), one output per class.
oModel = GenResNetModel(trainedModel = False, numCls = len(L_CLASSES))
# oModel.apply(InitWeightsKaiNorm)

In [None]:
# Model Information - Pre Defined
# Pay attention to the layers name.
# The input size is the batch size followed by the reversed image size tuple: (32, 32, 3) -> (3, 32, 32).
torchinfo.summary(oModel, (batchSize, *(T_IMG_SIZE[::-1])), col_names = ['kernel_size', 'output_size', 'num_params'], device = 'cpu', row_settings = ['depth', 'var_names'])

* <font color='red'>(**?**)</font> Does the last (_Head_) dense layer include a bias? Explain.

In [None]:
# Model
# Defining a sequential model.

# numChannels = 128

# def BuildModel( nC: int ) -> nn.Module:

#     oModel = nn.Sequential(
#         nn.Identity(),
#         nn.Conv2d(3, nC, 3, padding = 1, bias = False),  nn.BatchNorm2d(nC), nn.ReLU(),                  nn.Dropout2d(0.2),
#         nn.Conv2d(nC, nC, 3, padding = 1, bias = False), nn.BatchNorm2d(nC), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout2d(0.2),
        
#         ResidualBlock(nC), nn.Dropout2d(0.2),
#         ResidualBlock(nC), nn.Dropout2d(0.2),
#         ResidualBlock(nC), nn.Dropout2d(0.2),
#         ResidualBlock(nC), nn.Dropout2d(0.2),
#         ResidualBlock(nC), nn.Dropout2d(0.2),
        
#         nn.AdaptiveAvgPool2d(1),
#         nn.Flatten(),
#         nn.Linear(nC, 10)
#     )

#     oModel.apply(InitWeightsKaiNorm)

#     return oModel

# oModel = BuildModel(numChannels)

# torchinfo.summary(oModel, (batchSize, 3, 32, 32), col_names = ['kernel_size', 'output_size', 'num_params'], device = 'cpu')

## Train the Model

This section trains the model.  
It compares results with and without _Label Smoothing_.

* <font color='brown'>(**#**)</font> The objective is to show how to apply _Label Smoothing_.  

### Label Smoothing

* Cross Entropy Loss
$$
\ell_{\mathrm{CE}}\left(\boldsymbol{y}_{i},\hat{\boldsymbol{y}}_{i}\right)=-\left\langle \boldsymbol{y}_{i},\log\left(\hat{\boldsymbol{y}}_{i}\right)\right\rangle =-\left\langle \left[\begin{matrix}0\\
1\\
0\\
0
\end{matrix}\right],\log\left(\left[\begin{matrix}0.1\\
0.75\\
0.05\\
0.1
\end{matrix}\right]\right)\right\rangle 
$$
* Label Smoothing Loss
$$
\ell_{\mathrm{LS}}\left(\boldsymbol{y}_{i},\hat{\boldsymbol{y}}_{i}\right)=-\left\langle \left[\begin{matrix}\frac{\epsilon}{3}\\
1-\epsilon\\
\frac{\epsilon}{3}\\
\frac{\epsilon}{3}
\end{matrix}\right],\log\left(\left[\begin{matrix}0.1\\
0.75\\
0.05\\
0.1
\end{matrix}\right]\right)\right\rangle 
$$

* <font color='brown'>(**#**)</font> The value of $\epsilon$ is a hyper parameter.
* <font color='brown'>(**#**)</font> PyTorch's class `CrossEntropyLoss` implements _Label Smoothing_ in its `label_smoothing` parameter.  
  See [[PyTorch][Feature Request] Label Smoothing for CrossEntropyLoss](https://github.com/pytorch/pytorch/issues/7455).
* <font color='brown'>(**#**)</font> The Label Smoothing loss can be written: $\ell_{\mathrm{LS}}\left({\color{cyan}\boldsymbol{y}_{i}},{\color{orange}\hat{\boldsymbol{y}}_{i}}\right)=-\left\langle {\color{magenta}\frac{\epsilon}{{\color{yellow}C}-1}}\boldsymbol{1}+\left(1-{\color{magenta}\frac{{\color{yellow}C}}{{\color{yellow}C}-1}\epsilon}\right){\color{cyan}\boldsymbol{y}_{i}},\log\left({\color{orange}\hat{\boldsymbol{y}}_{i}}\right)\right\rangle$, where ${\color{yellow}C}$ is the number of classes.  
  This can be calculated, by linearity of the _Inner Product_, as 2 CE calculations.


In [None]:
# Label Smoothing by Code
# Builds the target distributions of a batch: Each row is a sample and each column is a class.
# Displays the one hot targets next to the smoothed targets.

N    = 10                         #<! Number of samples
C    = 4                          #<! Number of classes (Labels)
vIdx = torch.randint(0, C, (N,))  #<! Reference labels

#<! mY1 (One Hot): Zeros with 1 at the reference class of each row
mY1 = torch.zeros(N, C)
mY1[torch.arange(N), vIdx] = 1.0

#<! mY2 (Smooth): ϵ is spread evenly over the other (C - 1) classes, the reference class keeps (1 - ϵ)
ϵ   = 0.3
mY2 = torch.full((N, C), ϵ / (C - 1))
mY2[torch.arange(N), vIdx] = 1 - ϵ

hF, hA = plt.subplots(1, 2, figsize = (4, 3))
for hAx, mY, titleStr in zip(hA, (mY1, mY2), ('$Y_1$', '$Y_2$')):
    hImg = hAx.matshow(mY, vmin = 0, vmax = 1) #<! Shared color range for a fair comparison
    hAx.set_title(titleStr)
hF.colorbar(hImg);

In [None]:
# Run Device
# Falls back to the CPU when CUDA is not available.

runDevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #<! The 1st CUDA device

In [None]:
# Loss and Score Function
lϵ = [0.0, 0.1] #<! Label smoothing values to compare (0.0 -> No label smoothing)

hS = MulticlassAccuracy(num_classes = len(lClass), average = 'micro') #<! Score: Accuracy

hS = hS.to(runDevice) #<! The metric is moved to the run device

In [None]:
# Training Loop
# Trains a copy of the model per label smoothing value and keeps the training history of each run.

dModelHist = {} #<! Model name -> (Train loss, Train score, Validation loss, Validation score, Learning rate)

for ii, ϵ in enumerate(lϵ): #<! `ii` is not used by the loop body
    modelName = f'ϵ={ϵ:3.2f}'
    print(f'Training model: {modelName}')
    hL = nn.CrossEntropyLoss(label_smoothing = ϵ) #<! Loss with the label smoothing of the current run
    oRunModel = copy.deepcopy(oModel) #<! A copy per run, the runs share the same initial weights
    oRunModel = oRunModel.to(runDevice) #<! Transfer model to device
    oOpt = torch.optim.AdamW(oRunModel.parameters(), lr = 1e-3, betas = (0.9, 0.99), weight_decay = 1e-2) #<! Define optimizer
    oSch = torch.optim.lr_scheduler.OneCycleLR(oOpt, max_lr = 5e-2, total_steps = numEpochs) #<! A step per epoch. Presumably `TrainModel()` steps the scheduler once per epoch - verify
    _, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate = TrainModel(oRunModel, dlTrain, dlVal, oOpt, numEpochs, hL, hS, oSch = oSch)

    # Alternative: A scheduler step per iteration (Batch)
    # oSch = torch.optim.lr_scheduler.OneCycleLR(oOpt, max_lr = 5e-2, total_steps = numEpochs * len(dlTrain))
    # _, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate = TrainModelSch(oRunModel, dlTrain, dlVal, oOpt, oSch, numEpochs, hL, hS)
    dModelHist[modelName] = lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate

* <font color='brown'>(**#**)</font> One way to streamline the run over a grid is using `ParameterGrid` from SciKit Learn.

In [None]:
# Plot Training Phase
# Per trained model: The loss (Train / Validation), the score (Train / Validation) and the learning rate.
# Each curve is taken from the history of its own model as stored in `dModelHist`.

hF, vHa = plt.subplots(nrows = 1, ncols = 3, figsize = (18, 5))
vHa = np.ravel(vHa)

for modelKey, (lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate) in dModelHist.items():
    # Loss
    vHa[0].plot(lTrainLoss, lw = 2, label = f'Train {modelKey}')
    vHa[0].plot(lValLoss, lw = 2, label = f'Validation {modelKey}')

    # Score
    vHa[1].plot(lTrainScore, lw = 2, label = f'Train {modelKey}')
    vHa[1].plot(lValScore, lw = 2, label = f'Validation {modelKey}')

    # Learning rate of this model (Not the one left over by the last training run)
    vHa[2].plot(lLearnRate, lw = 2, label = f'{modelKey}')

# Axes decoration, applied once after all curves are drawn
lAxesText = [
    ('Cross Entropy Loss', 'Epoch', 'Loss'),
    ('Accuracy Score', 'Epoch', 'Score'),
    ('Learn Rate Scheduler', 'Iteration', 'Learn Rate'),
]
for hA, (titleStr, xLabelStr, yLabelStr) in zip(vHa, lAxesText):
    hA.set_title(titleStr)
    hA.set_xlabel(xLabelStr)
    hA.set_ylabel(yLabelStr)
    hA.legend()

* <font color='red'>(**?**)</font> Is the loss landscape comparable between the 2 training phases?