[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# AI Program

## Deep Learning - Convolutional Neural Network - MNIST Stroke

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 30/08/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0002PointLine.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Scientific Python

# Image Processing & Computer Vision
import skimage as ski

# Machine Learning
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import ConfusionMatrixDisplay

# Deep Learning
import torch
import torch.nn            as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import torchinfo
from torchmetrics.classification import MulticlassAccuracy
import torchvista

# Python Library
from enum import auto, Enum, unique
import json
import math
import os
import pickle
from platform import python_version
import random
import time

# Miscellaneous
import onedrivedownloader

# Typing 
from typing import Callable, Dict, List, Literal, Optional, Self, Tuple
from numpy.typing import NDArray

# Visualization
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

 ```python
 # You need to start writing
 ?????
 ```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# warnings.filterwarnings('ignore')

# Seed every random number generator the notebook uses, for reproducibility
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)
torch.manual_seed(seedNum) #<! PyTorch RNG: Weights initialization and `DataLoader` shuffling

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme
# sns.set_palette('tab10')

runInGoogleColab = 'google.colab' in str(get_ipython()) #<! `True` if the shell object's string contains `google.colab`

In [None]:
# Constants

# Visualization defaults
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# MNIST: Image size, class indices and the display name per class
TU_MNIST_IMG_SIZE = (28, 28)

L_CLASSES_MNIST = list(range(10))
D_CLASSES_MNIST = {clsIdx: str(clsIdx) for clsIdx in L_CLASSES_MNIST}

π = math.pi

BASE_NAME   = 'FixelCourses'
DATA_FOLDER = 'DataSets'

# The base path is the working directory trimmed right after the last
# (Case insensitive) occurrence of `BASE_NAME`.
# If `BASE_NAME` is not part of the path, the working directory is used as is.
_currDir  = os.getcwd()
_baseIdx  = _currDir.lower().rfind(BASE_NAME.lower())
BASE_PATH = _currDir if (_baseIdx < 0) else _currDir[:(_baseIdx + len(BASE_NAME))]
DATA_PATH = os.path.join(BASE_PATH, DATA_FOLDER)

# See https://docs.python.org/3/library/enum.html
@unique
class NNMode(Enum):
    """
    Operation mode of a Neural Network run.
    The values are the ones `auto()` assigns: Consecutive integers starting at 1.
    """
    TRAIN     = 1 #<! The parameters of the model are updated
    INFERENCE = 2 #<! The model is only evaluated

In [None]:
# Course Packages


In [None]:
# Auxiliary Functions

def ParseMnistStrokeSample( sampleIdx: int, /, *, dataPath: str = '.', dataSet: Literal['Train', 'Test'] = 'Train' ) -> Tuple[List[NDArray], int]:
    """
    Parses a single MNIST Stroke sample from its JSON file.  
    Input:
        sampleIdx   - Index of the sample. The file name is `<dataSet>_<sampleIdx as 5 digits>.json`.
        dataPath    - Folder which holds the JSON files.
        dataSet     - The data set name used as the file name prefix.
    Output:
        lXy         - List with an array of shape `(numPts, 2)` per stroke. Columns are the `x` and `y` fields.
        labelIdx    - The value of the `label` field of the file.
    Remarks:
      - The file should have a `strokes` field (List of strokes, each a list of
        points with `x` and `y` fields) and a `label` field.
    """

    filePath = os.path.join(dataPath, f'{dataSet}_{(sampleIdx):05d}.json') #<! Filenames are 0-59_999

    with open(filePath, 'r') as hFile:
        dSample = json.load(hFile)

    # One array per stroke, a row per point
    lXy = [
        np.array([(dPt['x'], dPt['y']) for dPt in lStroke], dtype = np.float64).reshape(-1, 2)
        for lStroke in dSample['strokes']
    ]

    return lXy, dSample['label']

def PlotStroke( lXy: List[NDArray], /, *, labelIdx: Optional[int] = None, hA: Optional[plt.Axes] = None, tFigSize: Tuple[float, float] = (6.4, 4.8) ) -> plt.Axes:
    """
    Scatter plot of the points of each stroke on a single axes.  
    Input:
        lXy         - List of arrays, each of shape `(numPts, 2)` with columns `x` and `y`.
        labelIdx    - Optional label to display as the title.
        hA          - Optional axes to draw on. If `None`, a new figure is created.
        tFigSize    - Size of the figure created when `hA` is `None`.
    Output:
        hA          - The axes drawn on.
    Remarks:
      - The axes limits are fixed to `[0, 27]` with the `y` axis pointing down.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = tFigSize)

    hA.set_aspect('equal')

    # Each stroke is drawn as its own series so it gets its own legend entry
    for strokeIdx, mXy in enumerate(lXy, start = 1):
        hA.scatter(mXy[:, 0], mXy[:, 1], label = f'Stroke: {strokeIdx:02d}')

    yBottom, yTop = hA.get_ylim()
    if yTop > yBottom:
        hA.invert_yaxis()

    hA.set_xlim((0, 27))
    hA.set_ylim((27, 0))

    if labelIdx is not None:
        hA.set_title(f'Label: {labelIdx}')

    return hA

def TransformStroke( lXy: List[NDArray], numGridPts: int, /, *, interpCls: Callable = sp.interpolate.make_smoothing_spline ) -> NDArray:
    """
    Resamples the strokes of a sample into a fixed number of points.  
    The strokes are concatenated into a single curve which is parameterized
    uniformly over `[0, 1]` and evaluated on a uniform grid of `numGridPts`.  
    Input:
        lXy         - List of arrays, each of shape `(numPts, 2)`.
        numGridPts  - Number of points of the output curve.
        interpCls   - Callable, `interpCls(vT, mXY)` should return a callable interpolator.
    Output:
        mXY         - The values of the interpolator on the uniform grid.
    Remarks:
      - Uniform parameterization. Other options are Cord Length (Chordal)
        parametrization, Centripetal parametrization, etc...
    """

    mPts     = np.concatenate(lXy, axis = 0) #<! All strokes as a single "stroke"
    vSrcGrid = np.linspace(0, 1, mPts.shape[0]) #<! Parameter of the given points
    vDstGrid = np.linspace(0, 1, numGridPts) #<! Parameter of the output points

    return interpCls(vSrcGrid, mPts)(vDstGrid)

def PlotMnistImages( mX: NDArray, vY: NDArray, numRows: int, numCols: Optional[int] = None, tuImgSize: Tuple = (28, 28), randomChoice: bool = True, lClasses: Optional[List] = None, hF: Optional[plt.Figure] = None ) -> plt.Figure:
    """
    Plots a grid of images, each titled by its index and label.  
    Input:
        mX           - Array where each row, `mX[idx, :]`, is a flattened image.
        vY           - The labels, indexed as the rows of `mX`.
        numRows      - Number of rows of the grid.
        numCols      - Number of columns of the grid. If `None`, equals `numRows`.
        tuImgSize    - The shape each row is reshaped into.  
                       2 elements: Displayed with a gray color map. 3 elements: Displayed as is.
        randomChoice - If `True`, each image is drawn at random (With replacement).  
                       Otherwise the first `numRows * numCols` rows are displayed.
        lClasses     - Optional names of the classes, the title shows `lClasses[vY[idx]]`.
        hF           - Optional figure to draw on. If `None`, a new figure is created.
    Output:
        hF           - The figure drawn on.
    Remarks:
      - A given `hF` should hold at least `numRows * numCols` axes.
      - Raises `ValueError` if `tuImgSize` has neither 2 nor 3 elements.
    """

    numSamples = mX.shape[0]

    if numCols is None:
        numCols = numRows

    tFigSize = (numCols * 3, numRows * 3)

    if hF is None:
        hF, hA = plt.subplots(numRows, numCols, figsize = tFigSize)
    else:
        hA = hF.axes
    
    hA = np.atleast_1d(hA) #<! To support numImg = 1
    hA = hA.flat
    
    for kk in range(numRows * numCols):
        idx = np.random.choice(numSamples) if randomChoice else kk
        mI  = np.reshape(mX[idx, :], tuImgSize)
    
        if len(tuImgSize) == 2:
            hA[kk].imshow(mI, cmap = 'gray')
        elif len(tuImgSize) == 3:
            hA[kk].imshow(mI)
        else:
            raise ValueError(f'The length of the image size tuple is {len(tuImgSize)} which is not supported')
        # Hide all ticks and tick labels
        hA[kk].tick_params(axis = 'both', left = False, top = False, right = False, bottom = False, 
                           labelleft = False, labeltop = False, labelright = False, labelbottom = False)
        labelStr = vY[idx] if (lClasses is None) else lClasses[vY[idx]]
        hA[kk].set_title(f'Index = {idx}, Label = {labelStr}')
    
    return hF

def PlotLabelsHistogram( vY: NDArray, hA: Optional[plt.Axes] = None, lClass: Optional[List] = None, xLabelRot: Optional[int] = None ) -> plt.Axes:
    """
    Plots the number of occurrences of each unique label as a bar plot.  
    Input:
        vY          - The labels.
        hA          - Optional axes to draw on. If `None`, a new figure is created.
        lClass      - Optional tick labels to use instead of the label values.
        xLabelRot   - Optional rotation of the tick labels of the `x` axis.
    Output:
        hA          - The axes drawn on.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = (8, 6))

    vUniqueLabel, vLabelCount = np.unique(vY, return_counts = True)
    lTickLabel = [f'{labelVal}' for labelVal in vUniqueLabel]

    hA.bar(vUniqueLabel, vLabelCount, width = 0.9, align = 'center')
    hA.set_title('Histogram of Classes / Labels')
    hA.set_xlabel('Class')
    hA.set_xticks(vUniqueLabel, lTickLabel)
    hA.set_ylabel('Count')

    if lClass is not None:
        hA.set_xticklabels(lClass)

    if xLabelRot is not None:
        for hTickLabel in hA.get_xticklabels():
            hTickLabel.set_rotation(xLabelRot)

    return hA

def PlotConfusionMatrix(vY: np.ndarray, vYPred: np.ndarray, normMethod: str = None, hA: Optional[plt.Axes] = None, 
                        lLabels: Optional[List] = None, dScore: Optional[Dict] = None, titleStr: str = 'Confusion Matrix', 
                        xLabelRot: Optional[int] = None, valFormat: Optional[str] = None) -> Tuple[plt.Axes, np.ndarray]:
    """
    Calculates and plots the confusion matrix.  
    Input:
        vY          - The reference labels.
        vYPred      - The predicted labels.
        normMethod  - Passed as `normalize` to `confusion_matrix()`.
        hA          - Optional axes to draw on.
        lLabels     - Optional display labels.
        dScore      - Optional dictionary of `{scoreName: scoreVal}` appended to the title.
        titleStr    - The title of the plot.
        xLabelRot   - Optional rotation of the tick labels of the `x` axis.
        valFormat   - Passed as `values_format` to the display.
    Output:
        hA          - The axes drawn on.
        mConfMat    - The confusion matrix.
    """

    mConfMat = confusion_matrix(vY, vYPred, normalize = normMethod)
    oDisp    = ConfusionMatrixDisplay(mConfMat, display_labels = lLabels).plot(ax = hA, values_format = valFormat)
    hA       = oDisp.ax_

    # An empty dictionary leaves the title as is
    if dScore:
        titleStr = titleStr + ':' + ','.join(f' {scoreName} = {scoreVal:0.2}' for scoreName, scoreVal in dScore.items())

    hA.set_title(titleStr)
    hA.grid(False)

    if xLabelRot is not None:
        for hTickLabel in hA.get_xticklabels():
            hTickLabel.set_rotation(xLabelRot)

    return hA, mConfMat

## 1D Convolutional Neural Network (CNN)


* <font color='brown'>(**#**)</font> _Data Leakage_ is a common mistake during the feature engineering phase.

### Features for 1D Signal Classification

One way to classify different features of 1D signals would be:

 - Statistical Features  
   Treat the data as a set of values.    
   Summarize data using descriptive statistics.  
   Features which are insensitive to the ordering of the observations are included in this set.  
   <font color='magenta'>Example</font>: Mean, Variance, Skewness, Kurtosis, Percentiles, Entropy.
 - Temporal Features  
   Features analyze the changes and patterns in data over time.  
   Sensitive to the order of the samples.  
   Captures temporal correlations, trends and rate of changes.  
   <font color='magenta'>Example</font>: Auto Correlation, Zero Crossing Rate, Slope (Trend), Peak to Peak Intervals.
 - Spectral Features
 - Structural Features


In [None]:
# Parameters

# Data
fileUrl     = 'https://technionmail-my.sharepoint.com/:u:/g/personal/royia_technion_ac_il/EUXCDJ40oItKofZ9E5tmSfMB_QZlZ3-N_-uc7WYGafQf8Q?e=rxEgx0' #<! OneDrive
dataSetName = 'MNISTStroke' #<! Name of the data set folder and of the downloaded archive (`.zip`)

numSamplesTrain = 10_000 #<! NOTE(review): Not referenced in the visible part of the notebook
numSamplesTest  = 1_000 #<! NOTE(review): Not referenced in the visible part of the notebook

numImg = 3 #<! NOTE(review): Not referenced in the visible part of the notebook

# Features
numGridPts   = 32 #<! Number of points each sample is resampled into
interpModel  = sp.interpolate.PchipInterpolator #<! Interpolator used for the resampling
flatFeatures = False #<! `True`: Single channel of `2 * numGridPts` values, `False`: 2 channels of `numGridPts` values

# Training
batchSize   = 256
numWork     = 0 #<! Number of workers
nEpochs     = 20 #<! Number of epochs, also the number of steps of the scheduler

# Visualization
exportFig = False #<! NOTE(review): Not referenced in the visible part of the notebook

## Generate Data


### The MNIST Stroke Dataset

The MNIST Data Set is the "Hello World" dataset of Machine Learning.


* <font color='red'>(**?**)</font> Will the solution of the Squared Euclidean Distance be the same as the Euclidean Distance?

In [None]:
# Generate / Load Data 
# Loads the MNIST images from OpenML (`mnist_784`).
# In this notebook they are only used to display the image next to its strokes.

mX, vY = fetch_openml('mnist_784', version = 1, return_X_y = True, as_frame = False, parser = 'auto')
vY = vY.astype(np.int_) #<! The labels are strings, convert to integer

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')
print(f'The unique values of the labels: {np.unique(vY)}')

In [None]:
# Generate / Load Data 
# Downloads and extracts the MNIST Stroke data set, unless its folder is already on disk.

dataSetPath = os.path.join(DATA_PATH, dataSetName)
if not os.path.isdir(dataSetPath):
    # The data set folder is taken relative to the path returned by the downloader
    dataSetPath = os.path.join(
        onedrivedownloader.download(fileUrl, filename = os.path.join(DATA_PATH, dataSetName + '.zip'), unzip = True, unzip_path = DATA_PATH),
        dataSetName,
    )

In [None]:
# Parse Single File
# Draws a random sample index and parses its strokes and label.

sampleIdx = random.randrange(60_000) #<! The file names of the train set are 0-59_999
lXy, labelIdx = ParseMnistStrokeSample(sampleIdx, dataPath = dataSetPath) #<! `dataSet` defaults to `'Train'`

In [None]:
# Plot the Sample
# Left: The MNIST image at index `sampleIdx`. Right: The strokes parsed for the same index.
# NOTE(review): Assumes the stroke files are ordered as the samples of OpenML's `mnist_784` - confirm.
hF, vHa = plt.subplots(nrows = 1, ncols = 2, figsize = (12.8, 4.8))
vHa = vHa.flat

hA = vHa[0]
hA.imshow(np.reshape(mX[sampleIdx], TU_MNIST_IMG_SIZE), cmap = 'gray', vmin = 0, vmax = 255)

hA = vHa[1]
hA = PlotStroke(lXy, hA = hA)
hA.legend();

In reality the order of the points matters (First to last).

Idea:
 - Data Level:
    - Padding.
    - Interpolation.
    - Clustering.
 - Model
    - Model for _various length_ sequence (RNN, Transformers).

In [None]:
# Interpolation
# Resamples the strokes of the sample into `numGridPts` points and displays:
# The image, the raw strokes and the resampled curve.

lXXYY = [TransformStroke(lXy, numGridPts, interpCls = interpModel)] #<! Wrapped by a list to match the input of `PlotStroke()`

hF, vHa = plt.subplots(nrows = 1, ncols = 3, figsize = (19.2, 4.8))
vHa = vHa.flat

hA = vHa[0]
hA.imshow(np.reshape(mX[sampleIdx], TU_MNIST_IMG_SIZE), cmap = 'gray', vmin = 0, vmax = 255)

hA = vHa[1]
hA = PlotStroke(lXy, hA = hA)
hA.legend();

hA = vHa[2]
hA = PlotStroke(lXXYY, hA = hA)
hA.legend();

In [None]:
# Interpolator Effect
# Displays the curve generated by several interpolators for the same sample.

# Pairs of (Display name, Callable which builds the interpolator)
tuInterpModel = (
    ('Cubic Spline', sp.interpolate.CubicSpline),
    ('Akima', sp.interpolate.Akima1DInterpolator),
    ('PChip', sp.interpolate.PchipInterpolator),
    ('BSpline', sp.interpolate.make_interp_spline),
    ('Piece Wise Linear', lambda x, y: sp.interpolate.make_interp_spline(x, y, k = 1)),
    ('Smooth Spline', sp.interpolate.make_smoothing_spline),
)

numModels = len(tuInterpModel)
hF, vHa   = plt.subplots(nrows = 1, ncols = 1 + numModels, figsize = (18, 4))
vHa       = vHa.flat

# The 1st axes shows the raw strokes
hA = PlotStroke(lXy, hA = vHa[0])
hA.set_title('Original Strokes')
hA.legend();

# An axes per interpolator
for ii, (interpModelName, oInterpModel) in enumerate(tuInterpModel, start = 1):
    hA    = vHa[ii]
    lXXYY = [TransformStroke(lXy, numGridPts, interpCls = oInterpModel)]
    PlotStroke(lXXYY, hA = hA)
    hA.set_title(interpModelName);

In [None]:
# PyTorch Data Loader
class MNISTStrokeDataset(Dataset):
    oDefInt = sp.interpolate.PchipInterpolator
    def __init__( self, dataPath: str, dataSet: Literal['Test', 'Train'], /, *, numGridPts: int = 32, interpModel: Callable = oDefInt, flatFeatures: bool = False ) -> None:
        TEST_FILE_NAME  = 'TEST.pkl'
        TRAIN_FILE_NAME = 'TRAIN.pkl'

        if dataSet not in ['Test', 'Train']:
            raise ValueError(f'The value of `"dataSet"` = {dataSet} must be either `"dataSet"` or `"Test"`')
        
        match dataSet:
            case 'Test':
                dataFileName = TEST_FILE_NAME
            case 'Train':
                dataFileName = TRAIN_FILE_NAME
        
        dataFilePath = os.path.join(dataPath, dataFileName)
        if os.path.isfile(dataFilePath):
            # Load RAW data
            with open(dataFilePath, 'rb') as hFile:
                dData = pickle.load(hFile)
                lS = dData['lStrokes']
                lY = dData['lY']                    
        else:
            # Generate RAW data and save
            lFiles = os.listdir(dataPath)
            lFiles = [f for f in lFiles if dataSet in f]
            lFiles.sort()

            lS = [] #<! Strokes per Image
            lY = [] #<! Labels

            for ii in range(len(lFiles)):
                lXy, labelIdx = ParseMnistStrokeSample(ii, dataPath = dataPath, dataSet = dataSet)
                lS.append(lXy)
                lY.append(labelIdx)
            
            dData = {'lStrokes': lS, 'lY': lY}
            # Save RAW data
            with open(dataFilePath, 'wb') as hFile:
                pickle.dump(dData, hFile)
        
        lX = [] #<! Features
        for ii in range(len(lS)):
            lXy = lS[ii]
            mXY = TransformStroke(lXy, numGridPts, interpCls = interpModel)
            lX.append(mXY)
        
        self.dataPath     = dataPath
        self.dataSet      = dataSet
        self.numGridPts   = numGridPts
        self.interpModel  = interpModel
        self.flatFeatures = flatFeatures
        
        self.lS = lS
        self.lX = lX
        self.lY = lY
        self.numSamples = len(lX)

    def __len__( self: Self ) -> int:
        
        return self.numSamples

    def __getitem__( self: Self, idx: int ) -> Tuple[NDArray, int]:
        
        mX   = self.lX[idx] #<! Features (numGridPts, 2)
        valY = self.lY[idx] #<! Label

        mX = mX.astype(np.float32) #<! PyTorch default float on GPU's

        # Set the channels
        # Signal should be (numChannels, numSamples)
        if self.flatFeatures:
            # Return a flat vector of features
            mX = np.reshape(np.ravel(mX), (1, -1)) #<! Set channel to 1: (1, 2 * numGridPts)
        else:
            mX = np.transpose(mX, (1, 0)) #<! Set the channels (2, numGridPts)
        
        # PyTorch's Dataloader collates into tensors only NumPy's elements
        valY = np.int64(valY)

        return mX, valY

In [None]:
# Data Sets
# Define PyTorch Dataset
# If the cache file of a data set is missing from `dataSetPath`, the JSON files are parsed and the cache file is written.
dsTrain = MNISTStrokeDataset(dataSetPath, 'Train', numGridPts = numGridPts, interpModel = interpModel, flatFeatures = flatFeatures)
dsTest  = MNISTStrokeDataset(dataSetPath, 'Test', numGridPts = numGridPts, interpModel = interpModel, flatFeatures = flatFeatures)

In [None]:
# DataSet as Iterator
# Draws a random sample from the train data set and displays its resampled curve.

sampleIdx = random.randrange(len(dsTrain))
mX, valY = dsTrain[sampleIdx] #<! Overrides `mX` (The MNIST images)

hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = (6.4, 4.8))

# The transpose matches `flatFeatures = False`: (2, numGridPts) -> (numGridPts, 2)
hA = PlotStroke([mX.T], hA = hA)
hA.set_title(f'Label: {valY}');

In [None]:
# Data Loaders

# Data is small, no real need for workers
# Train: Shuffled, the last partial batch is dropped. Test: In order, twice the batch size.
dlTrain = DataLoader(dsTrain, shuffle = True, batch_size = 1 * batchSize, num_workers = numWork, drop_last = True, persistent_workers = False)
dlTest  = DataLoader(dsTest, shuffle = False, batch_size = 2 * batchSize, num_workers = numWork, persistent_workers = False)

In [None]:
# Iterate on the Loader
# The first batch.
tX, vY = next(iter(dlTrain)) #<! PyTorch Tensors, overrides `vY` (The MNIST labels)

print(f'The batch features dimensions: {tX.shape}')
print(f'The batch labels dimensions: {vY.shape}')

In [None]:
# Model
# Defining a sequential model.

# NOTE(review): When the cells run in order `mX` is a single sample of the data set, hence this is its 2nd dimension.
# Not referenced in the visible part of the notebook.
numFeatures = mX.shape[1]

def GetModel( numChannels: int, numCls: int ) -> nn.Module:
    """
    Builds a 1D convolutional classifier as a sequential model.  
    Input:
        numChannels - Number of channels of the input signal.
        numCls      - Number of classes, the number of outputs (Logits) of the model.
    Output:
        oModel      - The model.
    Remarks:
      - Assumes the input size is >= 32 samples.
      - Assumes the output shape (`numCls`) smaller than 256.
    """
    # Per convolution block: (Number of output channels, Apply max pooling)
    tuConvBlock = ((32, True), (64, False), (128, True), (256, False))

    lLayers = [nn.Identity()]

    numIn = numChannels
    for numOut, applyPool in tuConvBlock:
        lLayers.append(nn.Conv1d(in_channels = numIn, out_channels = numOut, kernel_size = 3))
        if applyPool:
            lLayers.append(nn.MaxPool1d(kernel_size = 2))
        lLayers.append(nn.ReLU())
        numIn = numOut

    lLayers.append(nn.AdaptiveAvgPool1d(output_size = 1)) #<! A trick to support arbitrary input size before the "Linear Layer" section
    lLayers.append(nn.Flatten())
    lLayers.append(nn.Linear(in_features = numIn, out_features = 2 * numCls))
    lLayers.append(nn.Linear(in_features = 2 * numCls, out_features = numCls))

    return nn.Sequential(*lLayers)

* <font color='brown'>(**#**)</font> The _SoftMax_ layer is better off being part of the Cross Entropy Loss function, hence the model outputs the logits. See [PyTorch's `CrossEntropyLoss`](https://docs.pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html).

In [None]:
# Model Summary

numChannels = 1 if flatFeatures else 2 #<! Matches the samples of `MNISTStrokeDataset`
numCls      = len(L_CLASSES_MNIST)

oModel = GetModel(numChannels, numCls)
# The summary is calculated for the shape of the batch of the train loader
torchinfo.summary(oModel, tX.shape, col_names = ['kernel_size', 'input_size', 'output_size', 'num_params'], device = 'cpu')

In [None]:
# Model visualization using a random input shaped as a batch of the train loader
torchvista.trace_model(oModel, torch.randn(tX.shape), height = 600)

In [None]:
# Run Model
# Apply a test run.
# The model is applied to the batch of the train loader with no computational graph.

with torch.inference_mode():
    vYHat = oModel(tX)

print(f'The input dimensions : {tX.shape}')
print(f'The output dimensions: {vYHat.shape}')

## Training Loop

In [None]:
# Check GPU Availability
# Uses the 1st CUDA device if available, otherwise the CPU.

runDevice   = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #<! The 1st CUDA device
oModel      = oModel.to(runDevice) #<! Transfer model to device

* <font color='brown'>(**#**)</font> See https://datascience.stackexchange.com/a/100198 for the difference between the _Macro_ and _Micro_ averaging policies.

In [None]:
# Loss and Score Function

hL = nn.CrossEntropyLoss() #<! Applied to the output of the model (No SoftMax layer in the model)
# Macro: Calculate per class, average over classes
# Micro: Calculate over all data samples
hS = MulticlassAccuracy(num_classes = numCls, average = 'micro')
hL = hL.to(runDevice) #<! Not required!
hS = hS.to(runDevice)

In [None]:
def RunEpoch( oModel: nn.Module, dlData: DataLoader, hL: Callable, hS: Callable, oOpt: Optional[Optimizer] = None, opMode: NNMode = NNMode.TRAIN ) -> Tuple[float, float]:
    """
    Runs a single Epoch (Train / Inference) of a model.  
    Input:
        oModel      - PyTorch `nn.Module` object.
        dlData      - PyTorch `Dataloader` object.
        hL          - Callable for the Loss function.
        hS          - Callable for the Score function.
        oOpt        - PyTorch `Optimizer` object.
        opMode      - An `NNMode` to set the mode of operation.
    Output:
        valLoss     - Scalar of the loss (Average over the samples of the epoch).
        valScore    - Scalar of the score (Average over the samples of the epoch).
    Remarks:
      - The `dlData` object yields a Tuple of (mX, vY) per batch.
      - The `hL` function is called as `hL(mZ, vY)` where `mZ` is the output of the model
        and `vY` is the reference target.  
        It should return a scalar tensor of the loss which supports `backward()`.
      - The `hS` function is called as `hS(mZ, vY)`.  
        It should return a scalar tensor of the score.
      - The optimizer is required for training mode, `ValueError` is raised if it is missing.
      - Raises `ValueError` if `dlData` yields no samples or `opMode` is not supported.
      - On exit the model is in the mode matching `opMode`.
    """
    
    epochLoss  = 0.0
    epochScore = 0.0
    numSamples = 0
    numBatches = len(dlData)

    runDevice = next(oModel.parameters()).device #<! CPU \ GPU

    if opMode == NNMode.TRAIN:
        if oOpt is None:
            raise ValueError('The optimizer `oOpt` is required for the training mode')
        oModel.train(True) #<! Equivalent of `oModel.train()`
        trainMode = True
    elif opMode == NNMode.INFERENCE:
        oModel.eval() #<! Equivalent of `oModel.train(False)`
        trainMode = False
    else:
        raise ValueError(f'The `opMode` value {opMode} is not supported!')
    
    for ii, (mX, vY) in enumerate(dlData):
        # Move Data to Model's device
        mX = mX.to(runDevice) #<! Lazy
        vY = vY.to(runDevice) #<! Lazy

        batchSize = mX.shape[0]
        
        if trainMode:
            # Forward
            mZ      = oModel(mX) #<! Model output
            valLoss = hL(mZ, vY) #<! Loss
            
            # Backward
            oOpt.zero_grad()   #<! Set gradients to zeros
            valLoss.backward() #<! Backward
            oOpt.step()        #<! Update parameters
        else: #<! Value of `opMode` was already validated
            with torch.inference_mode(): #<! The `torch.inference_mode()` scope is more optimized than `torch.no_grad()` 
                # No computational graph
                mZ      = oModel(mX) #<! Model output
                valLoss = hL(mZ, vY) #<! Loss

        with torch.inference_mode():
            # Score
            oModel.eval() #<! Ensure Evaluation Mode (Dropout / Normalization layers)
            valScore = hS(mZ, vY)
            # Weigh by the batch size so each sample has the same weight
            epochLoss  += batchSize * valLoss.item()
            epochScore += batchSize * valScore.item()
            numSamples += batchSize
            oModel.train(trainMode) #<! Restore original mode

        print(f'\r{"Train" if trainMode else "Val"} - Iteration: {(ii + 1):3d} / {numBatches}, Loss: {valLoss:.6f}', end = '')
    
    print('', end = '\r')

    if numSamples == 0:
        # For instance, `drop_last = True` with a data set smaller than the batch size
        raise ValueError('The data loader `dlData` yielded no samples')
            
    return epochLoss / numSamples, epochScore / numSamples

In [None]:
def TrainModel( oModel: nn.Module, dlTrain: DataLoader, dlVal: DataLoader, oOpt: Optimizer, numEpoch: int, hL: Callable, hS: Callable, *, oSch: Optional[LRScheduler] = None ) -> Tuple[nn.Module, List, List, List, List, List]:
    """
    Trains a model given train and validation data loaders.  
    Input:
        oModel      - PyTorch `nn.Module` object.
        dlTrain     - PyTorch `Dataloader` object (Training).
        dlVal       - PyTorch `Dataloader` object (Validation).
        oOpt        - PyTorch `Optimizer` object.
        numEpoch    - Number of epochs to run.
        hL          - Callable for the Loss function.
        hS          - Callable for the Score function.
        oSch        - PyTorch `Scheduler` (`LRScheduler`) object.
    Output:
        oModel      - The model with the parameters of the best validation score.
        lTrainLoss  - List of the train loss per epoch.
        lTrainScore - List of the train score per epoch.
        lValLoss    - List of the validation loss per epoch.
        lValScore   - List of the validation score per epoch.
        lLearnRate  - List of the learning rate per epoch.
    Remarks:
      - The loaders, `hL` and `hS` are used as defined by `RunEpoch()`.
      - A higher score is assumed to be better.
      - The scheduler, if given, is stepped once per epoch.
      - The best model (By the validation score) is saved to `BestModel.pt` in the
        working directory and loaded at the end of the training.  
        If no checkpoint was saved during the run, the model is returned as is.
    """
    
    CHECKPOINT_FILE_NAME = 'BestModel.pt'

    lTrainLoss  = []
    lTrainScore = []
    lValLoss    = []
    lValScore   = []
    lLearnRate  = []

    # Support R2
    bestScore       = -1e9 #<! Assuming higher is better
    checkPointSaved = False #<! Whether this run wrote a checkpoint

    learnRate = oOpt.param_groups[0]['lr']

    for ii in range(numEpoch):
        startTime           = time.time()
        trainLoss, trainScr = RunEpoch(oModel, dlTrain, hL, hS, oOpt, opMode = NNMode.TRAIN) #<! Train
        valLoss,   valScr   = RunEpoch(oModel, dlVal, hL, hS, None, opMode = NNMode.INFERENCE) #<! Score Validation
        if oSch is not None:
            # Adjusting the scheduler on Epoch level
            learnRate = oSch.get_last_lr()[0] #<! The learning rate used in this epoch
            oSch.step()
        epochTime           = time.time() - startTime

        # Aggregate Results
        lTrainLoss.append(trainLoss)
        lTrainScore.append(trainScr)
        lValLoss.append(valLoss)
        lValScore.append(valScr)
        lLearnRate.append(learnRate)
        
        # Display (Babysitting)
        print('Epoch '              f'{(ii + 1):4d} / ' f'{numEpoch}', end = '')
        print(' | Train Loss: '     f'{trainLoss          :6.3f}', end = '')
        print(' | Val Loss: '       f'{valLoss            :6.3f}', end = '')
        print(' | Train Score: '    f'{trainScr           :6.3f}', end = '')
        print(' | Val Score: '      f'{valScr             :6.3f}', end = '')
        print(' | Epoch Time: '     f'{epochTime          :5.2f}', end = '')

        # Save best model ("Early Stopping")
        if valScr > bestScore:
            bestScore = valScr
            try:
                dCheckPoint = {'Model': oModel.state_dict(), 'Optimizer': oOpt.state_dict()}
                if oSch is not None:
                    dCheckPoint['Scheduler'] = oSch.state_dict()
                torch.save(dCheckPoint, CHECKPOINT_FILE_NAME)
                checkPointSaved = True
                print(' | <-- Checkpoint!', end = '')
            except Exception: #<! Best effort: The training continues though the checkpoint was not saved
                print(' | <-- Failed!', end = '')
        print(' |')
    
    # Load best model ("Early Stopping")
    # Only if this run wrote a checkpoint, to avoid loading a stale / missing file
    if checkPointSaved:
        dCheckPoint = torch.load(CHECKPOINT_FILE_NAME)
        oModel.load_state_dict(dCheckPoint['Model'])

    return oModel, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate

In [None]:
# Define Optimizer
# Built after the model was moved to `runDevice`, so it holds the parameters on the device.

oOpt = torch.optim.AdamW(oModel.parameters(), lr = 1e-3, betas = (0.9, 0.99), weight_decay = 1e-3) #<! Define optimizer

In [None]:
# Define Scheduler
# `TrainModel()` steps the scheduler once per epoch, hence the total number of steps is the number of epochs.

oSch = torch.optim.lr_scheduler.OneCycleLR(oOpt, max_lr = 5e-3, total_steps = nEpochs)

In [None]:
# Train Model
# NOTE(review): The test loader is used as the validation set, so the best model is selected by the test score.
# Hence the test scores reported below are optimistically biased.

oModel, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate = TrainModel(oModel, dlTrain, dlTest, oOpt, nEpochs, hL, hS, oSch = oSch)

In [None]:
# Plot Training Phase
# Per epoch: The loss, the score and the learning rate.

hF, vHa = plt.subplots(nrows = 1, ncols = 3, figsize = (12, 5))
vHa = np.ravel(vHa)

hA = vHa[0]
hA.plot(lTrainLoss, lw = 2, label = 'Train')
hA.plot(lValLoss, lw = 2, label = 'Validation')
hA.set_title('Cross Entropy Loss') #<! The loss is the multi class `nn.CrossEntropyLoss`
hA.set_xlabel('Epoch')
hA.set_ylabel('Loss')
hA.legend()

hA = vHa[1]
hA.plot(lTrainScore, lw = 2, label = 'Train')
hA.plot(lValScore, lw = 2, label = 'Validation')
hA.set_title('Accuracy Score')
hA.set_xlabel('Epoch')
hA.set_ylabel('Score')
hA.legend()

hA = vHa[2]
hA.plot(lLearnRate, lw = 2)
hA.set_title('Learn Rate Scheduler')
hA.set_xlabel('Epoch')
hA.set_ylabel('Learn Rate');

In [None]:
# Analysis

# Aggregate results for Train Set
# The train loader is shuffled and drops the last partial batch.
# Hence the order is random and some samples might be left out.

lYPred = []
lY     = []

for tX, vY in dlTrain:
    # Move Data to Model's device
    tX = tX.to(runDevice) #<! Lazy
    vY = vY.to(runDevice) #<! Lazy
        
    with torch.inference_mode():
        mZ = oModel(tX) #<! Model output
        vYPred = torch.argmax(mZ, dim = 1) #<! The class with the largest output
    
    lYPred.append(vYPred.detach().cpu().numpy())
    lY.append(vY.detach().cpu().numpy())

# `np.concatenate()` is available in all NumPy versions (`np.concat()` requires NumPy 2.0)
vYPredTrain  = np.concatenate(lYPred, axis = 0)
vYTruthTrain = np.concatenate(lY, axis = 0)

In [None]:
# Analysis

# Aggregate results for Test Set

lYPred = []
lY     = []

for tX, vY in dlTest:
    # Move Data to Model's device
    tX = tX.to(runDevice) #<! Lazy
    vY = vY.to(runDevice) #<! Lazy
        
    with torch.inference_mode():
        mZ = oModel(tX) #<! Model output
        vYPred = torch.argmax(mZ, dim = 1) #<! The class with the largest output
    
    lYPred.append(vYPred.detach().cpu().numpy())
    lY.append(vY.detach().cpu().numpy())

# `np.concatenate()` is available in all NumPy versions (`np.concat()` requires NumPy 2.0)
vYPredTest  = np.concatenate(lYPred, axis = 0)
vYTruthTest = np.concatenate(lY, axis = 0)

In [None]:
# Analysis

# Scoring Report - Micro and Macro Averaging Policies
# The accuracy does not depend on the averaging policy, it is displayed in the 1st report only.
valAccuracy = accuracy_score(vYTruthTest, vYPredTest)

for avgPolicy in ('micro', 'macro'):
    valPrecision, valRecall, valF1, _ = precision_recall_fscore_support(vYTruthTest, vYPredTest, average = avgPolicy, labels = L_CLASSES_MNIST)

    # Print Report
    print(f'Scoring Report ({avgPolicy.capitalize()} Averaging Policy)')
    if avgPolicy == 'micro':
        print(f'Accuracy : {valAccuracy:.4f}')
    print(f'Precision: {valPrecision:.4f}')
    print(f'Recall   : {valRecall:.4f}')
    print(f'F1 Score : {valF1:.4f}')

* <font color='red'>(**?**)</font> How come all scores are similar?

In [None]:
# Analysis

# Scoring Report
# The scores per class on the test set.
print('Classification Report')
lClassLabel = [D_CLASSES_MNIST[k] for k in L_CLASSES_MNIST] #<! The display name of each class
print(classification_report(vYTruthTest, vYPredTest, labels = L_CLASSES_MNIST, target_names = lClassLabel)) #<! Parameter `target_names` requires strings

In [None]:
# Analysis
# Confusion Matrix
# Left: Train data. Right: Test data. The title of each shows the accuracy.

hF, vHa = plt.subplots(nrows = 1, ncols = 2, figsize = (14, 6))

hA, _ = PlotConfusionMatrix(vYTruthTrain, vYPredTrain, hA = vHa[0])
hA.set_title(f'Train Data, Accuracy {np.mean(vYTruthTrain == vYPredTrain): 0.2%}') #<! Overrides the default title

hA, _ = PlotConfusionMatrix(vYTruthTest, vYPredTest, hA = vHa[1])
hA.set_title(f'Test Data, Accuracy {np.mean(vYTruthTest == vYPredTest): 0.2%}'); #<! Overrides the default title