[![Fixel Algorithms](https://i.imgur.com/AqKHVZ0.png)](https://fixelalgorithms.gitlab.io)

# AI Program

## Machine Learning - Deep Learning - MNIST 1D with Hyper Parameter Optimization

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 18/05/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0089DeepLearningPyTorchSchedulers.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning

# Deep Learning
import torch
import torch.nn            as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchinfo
from torchmetrics.classification import MulticlassAccuracy
import torchvision

# ML Ops
import wandb

# Miscellaneous
import math
import os
from platform import python_version
import pickle
import random
from urllib.request import urlopen

# Typing
from typing import Callable, Dict, Generator, List, Literal, Optional, Self, Set, Tuple, Union

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seeds the NumPy and Python generators only.
# PyTorch is not seeded unless `torch.manual_seed()` below is enabled.
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

# `True` when the IPython shell description mentions `google.colab` (Running in Google Colab)
runInGoogleColab = 'google.colab' in str(get_ipython())

# Improve performance by benchmarking
torch.backends.cudnn.benchmark = True

# Reproducibility (Per PyTorch Version on the same device)
# Enabling the lines below overrides the benchmark mode set above.
# torch.manual_seed(seedNum)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark     = False #<! Makes things slower

In [None]:
# Constants

FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

D_CLASSES_MNIST  = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9'}
L_CLASSES_MNIST  = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
T_IMG_SIZE_MNIST = (28, 28, 1)

DATA_FOLDER_PATH  = 'Data'
TENSOR_BOARD_BASE = 'TB'
WANDB_API_KEY     = 'WANDB_API_KEY'

In [None]:
# Download Auxiliary Modules for Google Colab
if runInGoogleColab:
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DataManipulation.py
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DataVisualization.py
    !wget https://raw.githubusercontent.com/FixelAlgorithmsTeam/FixelCourses/master/AIProgram/2024_02/DeepLearningPyTorch.py

In [None]:
# Courses Packages

from DataManipulation import ParseEnvFile
from DataVisualization import PlotLabelsHistogram
from DeepLearningPyTorch import NNMode
from DeepLearningPyTorch import TrainModel

In [None]:
# General Auxiliary Functions

def PlotMnist1D( mX: np.ndarray, vT: np.ndarray, vY: np.ndarray, numRows: int, 
                numCols: Optional[int] = None, randomChoice: bool = True, lClasses: Optional[List] = None, 
                ratioFactor: float = 1.33, zoomLevel: float = 1.0, darkMode: bool = False, hF: Optional[plt.Figure] = None ) -> plt.Figure:
    """
    Plots a grid of samples of the MNIST 1D data set.  
    Each sample `mX[idx]` is drawn on the horizontal axis against `vT` on the (Inverted) vertical axis.
    Input:
      - mX           - The samples, indexed by the sample index on the first axis.
      - vT           - The vertical axis values shared by all samples.
      - vY           - The labels, indexed by the sample index.
      - numRows      - Number of rows of the grid.
      - numCols      - Number of columns of the grid. If `None`, equals `numRows`.
      - randomChoice - If `True` each cell shows a randomly drawn sample, otherwise cell `kk` shows sample `kk`.
      - lClasses     - Optional list mapping a label to its display name.
      - ratioFactor  - Scales the height of a new figure.
      - zoomLevel    - Both axes are limited to `[-zoomLevel, zoomLevel]`.
      - darkMode     - If `True` draws white markers on a black background.
      - hF           - Optional existing figure. Its axes are reused.
    Output:
      - hF           - The figure.
    Remarks:
      - The number of plots is limited to the number of available axes.
      - In sequential mode (`randomChoice = False`) it is also limited to the number of samples.  
        Remaining axes are left untouched.
    """

    numSamples = mX.shape[0]

    if numCols is None:
        numCols = numRows

    if hF is None:
        tFigSize = (numCols * 2.2, numRows * 2.0 * ratioFactor)
        hF, hA = plt.subplots(numRows, numCols, figsize = tFigSize, dpi = 60)
    else:
        hA = hF.axes
    
    lAxes = list(np.atleast_1d(hA).flat) #<! To support numImg = 1

    # Never index beyond the available axes / samples
    numPlots = min(numRows * numCols, len(lAxes))
    if not randomChoice:
        numPlots = min(numPlots, numSamples)
    
    for kk in range(numPlots):
        idx     = np.random.choice(numSamples) if randomChoice else kk
        hAxes   = lAxes[kk]

        if darkMode:
            hAxes.plot(mX[idx], vT, 'wo', lw = 6)
            hAxes.set_facecolor('k')
        else:
            hAxes.plot(mX[idx], vT, 'k-', lw = 2)
        # Hide all ticks and tick labels
        hAxes.tick_params(axis = 'both', left = False, top = False, right = False, bottom = False, 
                          labelleft = False, labeltop = False, labelright = False, labelbottom = False)
        hAxes.set_xlim(-zoomLevel, zoomLevel)
        hAxes.set_ylim(-zoomLevel, zoomLevel)
        hAxes.invert_yaxis()
        if lClasses is None:
            hAxes.set_title(f'Index: {idx}, Label: {vY[idx]}')
        else:
            hAxes.set_title(f'Index: {idx}, Label: {lClasses[vY[idx]]}')
    
    return hF

## The MNIST 1D Data Set

The MNIST dataset of hand written digits is the _"Hello World"_ of 2D CNNs.  
It’s also famously easy: a simple model can achieve close to 100% accuracy within a few minutes (Or even seconds) of training.

In 2020 Sam Greydanus, Dmitry Kobak generated the MNIST 1D _synthetic dataset_.  
Each sample is composed of 40 values of the coordinates of 1D chain.  
Although the dimensionality of MNIST-1D is only 40 and its default training set size only 4000, MNIST-1D can be pretty challenging to solve.  
Which makes it a great data set for fast prototyping, educational use cases and cutting edge research on a low budget.

<!-- ![](https://i.imgur.com/KGbnhbX.png) -->
<!-- ![](https://i.postimg.cc/mZwXcHjb/KGbnhbX.png) -->
![](https://i.imgur.com/6BKZ19s.png)
<!-- ![](https://i.postimg.cc/cJx6LPL1/image.png) -->

This notebook executes a search for optimization of _hyper parameters_ for best accuracy of the classification results.

The notebook presents:

 * Several concepts of hyper parameter optimization.
 * The use of [Weights and Biases](https://wandb.ai) for Hyper Parameter optimization.


</br>

* MNIST is Trivial: [78% MNIST Accuracy Using GZIP in Under 10 Lines of Code](https://jakobs.dev/solving-mnist-with-gzip) ([Hacker News Discussion](https://news.ycombinator.com/item?id=37583593)), [MNIST by ZIP](https://www.blackhc.net/blog/2019/mnist-by-zip).
* <font color='brown'>(**#**)</font> [The MNIST 1D Data Set Repository](https://github.com/greydanus/mnist1d).
* <font color='brown'>(**#**)</font> [The MNIST 1D Data Set Paper - Scaling Down Deep Learning with MNIST-1D](https://arxiv.org/abs/2011.14439).
* <font color='brown'>(**#**)</font> [Sam Greydanus - Natural Intelligence Blog - Scaling down Deep Learning](https://greydanus.github.io/2020/12/01/scaling-down).
* <font color='brown'>(**#**)</font> [Prediction MNIST-1D dataset with Mamba](https://tk42.jp/notes/Image-Classification-MNIST-1D-dataset-with-Mamba-for-beginners).

In [None]:
# Parameters

# Data
dataUrl = r'https://github.com/greydanus/mnist1d/raw/master/mnist1d_data.pkl'

# Model
dropP = 0.5 #<! Dropout Layer

# Training
batchSize   = 256
numWork     = 2 #<! Number of workers
nEpochs     = 10

# Weights and Biases Optimizer
projName  = 'Test'
numSweeps = 50

# Visualization
numImg = 3

## Generate / Load Data

Load the [MNIST 1D Data Set](https://github.com/greydanus/mnist1d).  
It is composed of 1D signals of 40 values each with 10 classes; the default training set holds 4,000 signals.

* <font color='brown'>(**#**)</font> The dataset is downloaded as a pickled dictionary from the dataset's repository (See `dataUrl`).  


In [None]:
# Generate / Load Data

# NOTE: `pickle.load()` may execute arbitrary code. Use only with a trusted `dataUrl`.
with urlopen(dataUrl) as hFile: #<! Closes the connection once the data is read
    dData = pickle.load(hFile)

In [None]:
# Data

dRefData = dData['templates']

# The vertical axis
vT = dData['t']

mXTrain = dData['x'].astype(np.float32)      #<! NumPy is Float64 by default
vYTrain = dData['y']
mXVal   = dData['x_test'].astype(np.float32) #<! NumPy is Float64 by default
vYVal   = dData['y_test']

numSignalsTrain = mXTrain.shape[0]
numSignalsVal   = mXVal.shape[0]

In [None]:
# PyTorch DataSet

dsTrain  = torch.utils.data.TensorDataset(torch.tensor(np.reshape(mXTrain, (numSignalsTrain, 1, -1))), torch.tensor(vYTrain)) #<! -1 -> Infer
dsVal    = torch.utils.data.TensorDataset(torch.tensor(np.reshape(mXVal, (numSignalsVal, 1, -1))), torch.tensor(vYVal))

print(f'The training data set data shape: {dsTrain.tensors[0].shape}')
print(f'The test data set data shape    : {dsVal.tensors[0].shape}')
print(f'The unique values of the labels : {np.unique(dsTrain.tensors[1])}')

* <font color='brown'>(**#**)</font> The dataset is indexable (Subscriptable). It returns a tuple of the features and the label.

In [None]:
# Element of the Data Set

mX, valY = dsTrain[0]

# Show the element as served by the data set (Not the raw NumPy arrays)
print(f'The features shape: {mX.shape}')
print(f'The label value: {valY.item()}')

### Plot the Data

In [None]:
# Reference Data

hF = PlotMnist1D(dRefData['x'], dRefData['t'], dRefData['y'], numRows = 1, numCols = 10, randomChoice = False, darkMode = True)

In [None]:
# Train Data

hF = PlotMnist1D(mXTrain, vT, vYTrain, numRows = 2, numCols = 10, randomChoice = True, zoomLevel = 4, darkMode = False)

In [None]:
# Histogram of Labels

hA = PlotLabelsHistogram(vYTrain, lClass = L_CLASSES_MNIST)
plt.show()

## Weights and Biases

1. Create account at https://wandb.ai.
2. Login to account.
3. Go to https://wandb.ai/authorize.
4. Copy the API key into the `key` parameter.
5. Save it as `WANDB_API_KEY=<APIKey>` in a `.env` file.

### Working with `.env` File

When working at scale, one way to share "secrets" is using an `.env` file.  
Those are used to configure global variables to be used.

* <font color='brown'>(**#**)</font> A known package to work with `.env` files is given by [`python-dotenv`](https://github.com/theskumar/python-dotenv).

In [None]:
# Parse the Environment File

dEnv        = ParseEnvFile('.env')
wandbApiKey = dEnv[WANDB_API_KEY] #<! Extract the API Key

In [None]:
wandb.login(key = wandbApiKey, verify = True) #<! Do once per computer

## Pre Process Data

This section normalizes the data to have zero mean and unit variance per **channel**.  
It is required to calculate:

 * The average pixel value per channel.
 * The standard deviation per channel.

</br>

* <font color='brown'>(**#**)</font> The values are calculated on the train set and applied to both sets.
* <font color='brown'>(**#**)</font> The data will be used to pre process the image on loading by the `transformer`.
* <font color='brown'>(**#**)</font> There are packages which specialize in transforms: [`Kornia`](https://github.com/kornia/kornia), [`Albumentations`](https://github.com/albumentations-team/albumentations).  
  They are commonly used for _Data Augmentation_ at scale.

* <font color='red'>(**?**)</font> What do you expect the mean value to be?
* <font color='red'>(**?**)</font> What do you expect the standard deviation value to be?

In [None]:
# Hyper Parameters

# Weights and Biases Dictionary
# The search space of the sweep. Each key is read from the run configuration in `WandbSweep()`.
dParams = {
    # Forwarded to `GenModel()` as `poolLayer`
    'poolLayer': {
        'values': ['maxPool', 'l2Pool'],
    },
    # Forwarded to `GenModel()` as `activationLayer`
    'activationLayer': {
        'values': ['ReLU', 'SELU', 'Sigmoid'],
    },
    # Used as the `weight_decay` of the optimizer in `WandbSweep()`
    # NOTE(review): Per the W&B documentation `log_uniform` takes `min` / `max` as exponents of e,
    # namely the value is in [exp(-7), exp(-4)] - confirm against the installed version.
    'paramLambda': { #<! Wandb seems to have issues with UTF characters ('λ')
        'distribution': 'log_uniform',
        'min': -7,
        'max': -4,
    },
}

In [None]:
# Sweep Configuration

dSweep =  {
    'method': 'random',
    'metric': {
        'name': 'Score',    #<! The name of the metric to optimize
        'goal': 'maximize',
    },
    'parameters': dParams,
}

### Setting Sweep

The framework allows distributed work on the sweeps by the `sweepId` and `projName` parameters.  
The `sweepId` and `projName` define a sweep over a set of _hyper parameters_ defined in `dSweep`.  
Any node (Computer) running the sweep agent will be served a specific set of _hyper parameters_ according to the sweep configuration.  
The choice of the hyper parameters is orchestrated by the _Weights and Biases_ service.  
This makes it easy to have many nodes running the same sweep.

The recipe:

1. Define a sweep by its _Project Name_ (`projName`) and _Configuration_ (`dSweep`).  
   The configuration should set a score (The `name` field in the `metric` dictionary in `dSweep`).
2. Get the _Sweep ID_ (`sweepId`).  
   Make it available to any node (Computer) which should run experiments using the sweep configuration.
3. Define a function which given a specific set of _Hyper Parameters_ defines an experiment which evaluates the algorithm and logs the score.  
   The function should get its configuration from the `oRun` object.  
   The function should log the target score using the same label (`name`) defined in the configuration.  
4. Each node which uses the Sweep ID will be configured using the same configuration.  
   The Weights and Biases service will orchestrate the allocation of _hyper parameters_ to each experiment according to the defined policy (`method`). 

In [None]:
# Register the Sweep
# The `sweepId` is a unique identifier for the sweep, which can be used to monitor and control the sweep.
# The `sweepId` should be distributed to any node which should run an experiment in the sweep.

# sweepId = wandb.sweep(dSweep, project = 'MNIST1D - Model Hyper Parameters') #<! Returns the Sweep ID (String)
sweepId = wandb.sweep(dSweep, project = projName) #<! Returns the Sweep ID (String) `f96s71zs`

In [None]:
# Path

mX, valY = dsTrain[5]

hF, hA = plt.subplots(figsize = (2, 3), dpi = 90)
# hA.scatter(mX.numpy()[0], vT)
hA.plot(mX.numpy()[0], vT, 'k-', lw = 2)
hA.invert_yaxis()
hA.set_xlim(-3, 3)
hA.set_title(f'Index: {5}, Label: {valY}');

### Data Loaders


In [None]:
# Data Loader

# `persistent_workers = True` is valid only with worker processes (`num_workers > 0`)
persistWorkers = numWork > 0

dlTrain = torch.utils.data.DataLoader(dsTrain, shuffle = True, batch_size = 1 * batchSize, num_workers = numWork, persistent_workers = persistWorkers)
dlTest  = torch.utils.data.DataLoader(dsVal, shuffle = False, batch_size = 2 * batchSize, num_workers = numWork, persistent_workers = persistWorkers)

* <font color='red'>(**?**)</font> Why is the size of the batch twice as big for the test dataset?

In [None]:
# Iterate on the Loader
# The first batch.
tX, vY = next(iter(dlTrain)) #<! PyTorch Tensors

print(f'The batch features dimensions: {tX.shape}')
print(f'The batch labels dimensions: {vY.shape}')

## Define the Model

The model is defined as a sequential model.

In [None]:
# The L2 Pool Layer

class L2Pool1D(nn.LPPool1d):
    """
    1D power average pooling with the norm type fixed to 2 (L2 pooling).  
    A thin wrapper of `nn.LPPool1d` exposing only the kernel size and the stride.
    Input:
      - kernelSize - The size of the pooling window.
      - stride     - The stride of the window. Forwarded as is (Including `None`) to `nn.LPPool1d`.
    """
    def __init__(self, kernelSize: int, stride: Optional[int] = None):
        # Positional order of `nn.LPPool1d`: norm type, kernel size, stride
        super(L2Pool1D, self).__init__(2, kernelSize, stride)

In [None]:
# Model
# Defining a sequential model.

def GenModel( numOut: int, poolLayer: Literal['maxPool', 'l2Pool'], activationLayer: Literal['Sigmoid', 'Tanh', 'ReLU', 'ReLU6', 'SELU', 'CELU', 'GELU', 'Mish', 'Softplus'] ) -> nn.Module:
    """
    Generates a 1D convolutional classifier as a sequential model.  
    The model is built of 3 blocks of `Conv1d -> BatchNorm1d -> Pooling -> Activation` (16, 32 and 64 channels),
    followed by a global max pooling and a linear layer with `numOut` outputs.
    Input:
      - numOut          - The number of outputs of the last (Linear) layer.
      - poolLayer       - The pooling layer of each block: `'maxPool'` (`nn.MaxPool1d`) or `'l2Pool'` (`L2Pool1D`).
      - activationLayer - The name of the activation layer of each block.
    Output:
      - oModel          - The model (`nn.Sequential`).
    Remarks:
      - Raises `ValueError` for an unknown pooling layer or activation layer.
      - The input has a single channel (The first convolution is `Conv1d(1, 16, 3)`).
    """

    # Supported activation layers by name
    dActivationLayer = {
        'Sigmoid'  : nn.Sigmoid,
        'Tanh'     : nn.Tanh,
        'ReLU'     : nn.ReLU,
        'ReLU6'    : nn.ReLU6,
        'SELU'     : nn.SELU,
        'CELU'     : nn.CELU,
        'GELU'     : nn.GELU,
        'Mish'     : nn.Mish,
        'Softplus' : nn.Softplus,
    }

    # The pooling layer is validated first (Then the activation layer)
    if poolLayer == 'maxPool':
        oPoolLayer = nn.MaxPool1d
    elif poolLayer == 'l2Pool':
        oPoolLayer = L2Pool1D
    else:
        raise ValueError(f'Unknown pooling layer: {poolLayer}')

    if activationLayer not in dActivationLayer:
        raise ValueError(f'Unknown activation layer: {activationLayer}')
    oActivationLayer = dActivationLayer[activationLayer]

    oModel = nn.Sequential(
        nn.Identity(),

        # No bias in the convolution layers, each is followed by a batch normalization layer
        nn.Conv1d( 1,  16, 3, bias = False), nn.BatchNorm1d(16), oPoolLayer(2), oActivationLayer(),
        nn.Conv1d(16,  32, 3, bias = False), nn.BatchNorm1d(32), oPoolLayer(2), oActivationLayer(),
        nn.Conv1d(32,  64, 3, bias = False), nn.BatchNorm1d(64), oPoolLayer(2), oActivationLayer(),

        nn.AdaptiveMaxPool1d(1), #<! Global pooling: a single value per channel
        nn.Flatten(),
        nn.Linear(64, numOut),
    )

    return oModel

oModel = GenModel(len(L_CLASSES_MNIST), 'maxPool', 'ReLU')
torchinfo.summary(oModel, tX.shape, col_names = ['kernel_size', 'output_size', 'num_params'], device = 'cpu')

* <font color='red'>(**?**)</font> Why is `bias = False` used above?
* <font color='brown'>(**#**)</font> Using a number of channels which is a multiple of 8 accelerates the run time (In most cases).
* <font color='brown'>(**#**)</font> Pay attention to the model size and the RAM of the GPU. Rule of thumb, up to ~40%.

## Train the Model

This section trains the model using different schedulers:

 - Updates the training function.
 - Updates the _epoch_ function to log information at mini batch level.
 - Create a class for a logger of TensorBoard.

In [None]:
# Run Device

runDevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #<! The 1st CUDA device

In [None]:
# Loss and Score Function

hL = nn.CrossEntropyLoss()
hS = MulticlassAccuracy(num_classes = len(L_CLASSES_MNIST), average = 'micro')
hL = hL.to(runDevice) #<! Not required!
hS = hS.to(runDevice)

In [None]:
# The Sweep Function

def WandbSweep( projName: str, numCls: int, hL, hS, dlTrain, dlTest, numEpochs: int, runDevice: torch.DeviceObjType ) -> None:
    """
    The WandB Sweep Function.  
    Runs a single experiment in the WandB Sweep: 
    Builds a model from the hyper parameters of the run, trains it and logs the results per epoch.
    Input:
      - projName  - The name of the WandB project of the run.
      - numCls    - The number of classes (Outputs of the model).
      - hL        - The loss function, forwarded to `TrainModel()`.
      - hS        - The score function, forwarded to `TrainModel()`.
      - dlTrain   - The data loader of the training set.
      - dlTest    - The data loader of the validation set.
      - numEpochs - The number of epochs to train and log.
      - runDevice - The device the model is moved to.
    Remarks:
      - The run configuration must hold the keys: `poolLayer`, `activationLayer` and `paramLambda`.
      - The logged `Score` is the validation score. The summary `Score` is its maximum over the epochs.
    """
    # Initialize the WandB Run
    with wandb.init(project = projName) as oRun:

        # Extract the configuration
        dConfig = oRun.config
    
        # Generate the model
        oModel = GenModel(numCls, dConfig['poolLayer'], dConfig['activationLayer'])
    
        # Move the model to the run device
        oModel.to(runDevice)
    
        # Optimizer
        oOptimizer = torch.optim.AdamW(oModel.parameters(), lr = 3e-4, weight_decay = dConfig['paramLambda'])
        # The length of the schedule follows the `numEpochs` argument (Not a global variable).
        # NOTE(review): Assumes `TrainModel()` steps the scheduler once per epoch - confirm.
        oScheduler = torch.optim.lr_scheduler.OneCycleLR(oOptimizer, max_lr = 5e-3, total_steps = numEpochs)
    
        # Train the model
        oModel, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate = TrainModel(oModel, dlTrain, dlTest, oOptimizer, numEpochs, hL, hS, oSch = oScheduler)

        # Set summary to represent the aggregation of values
        # For some reason, WandB does not support the `step_metric` for the summary.
        # oRun.define_metric('Epoch')
        # oRun.define_metric('Train/Loss', step_metric = 'Epoch', summary = 'last')
        # oRun.define_metric('Train/Score', step_metric = 'Epoch', summary = 'last')
        # oRun.define_metric('Validation/Loss', step_metric = 'Epoch', summary = 'last')
        # oRun.define_metric('Validation/Score', step_metric = 'Epoch', summary = 'last')
        # oRun.define_metric('Score', summary = 'max')
    
        # Log the per epoch results (The lists are indexed by the epoch)
        for epoch in range(numEpochs):
            oRun.log({
                'Epoch'           : epoch,
                'Train/Loss'      : lTrainLoss[epoch],
                'Train/Score'     : lTrainScore[epoch],
                'Validation/Loss' : lValLoss[epoch],
                'Validation/Score': lValScore[epoch],
                'Learning Rate'   : lLearnRate[epoch],
                'Score'           : lValScore[epoch], #<! Target (Should match the `metric` name in the Sweep configuration)
            })
        
        # Set the representative value of the aggregation of the values
        oRun.summary['Score'] = max(lValScore) #<! The best score

In [None]:
# Auxiliary Function

# The function for the `agent()` must have no positional arguments
def hWandbSweep() -> None:
    """
    Runs a single sweep experiment with no arguments.  
    The arguments of `WandbSweep()` are read from the notebook's variables at call time.
    """
    return WandbSweep(projName, len(L_CLASSES_MNIST), hL, hS, dlTrain, dlTest, nEpochs, runDevice)

In [None]:
# Weights and Biases Sweep

# The function should have no arguments
# The project is required for distributed use case
wandb.agent(sweepId, project = projName, function = hWandbSweep, count = numSweeps)