[![Fixel Algorithms](https://i.imgur.com/AqKHVZ0.png)](https://fixelalgorithms.gitlab.io)

# AI Program

## Machine Learning - UnSupervised Learning - Manifold Learning - UMAP

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 25/09/2025 | Royi Avital | Added PCA example                                                  |
| 1.0.000 | 13/09/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0067ManifoldLearningIsoMap.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from umap import UMAP

# Miscellaneous
import math
from platform import python_version
import random

# Typing
from typing import Callable, Dict, List, Optional, Self, Set, Tuple, Union
from numpy.typing import NDArray

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt

# Jupyter
from IPython import get_ipython
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seed NumPy's and Python's global random number generators with the same value
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

# True when the string form of the active IPython shell mentions `google.colab`
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

FIG_SIZE_DEF    = (8, 8) #<! Default `figSize` of `PlotScatterData()`
ELM_SIZE_DEF    = 50 #<! Default `markerSize` of `PlotScatterData()`
CLASS_COLOR     = ('b', 'r') #<! NOTE(review): Not referenced by the visible cells, presumably colors of a binary problem
EDGE_COLOR      = 'k' #<! Default `edgeColor` of `PlotScatterData()`
MARKER_SIZE_DEF = 10 #<! NOTE(review): Not referenced by the visible cells
LINE_WIDTH_DEF  = 2 #<! NOTE(review): Not referenced by the visible cells

In [None]:
# Courses Packages


In [None]:
# General Auxiliary Functions

def PlotMnistImages( mX: NDArray, vY: NDArray, numRows: int, /, *, numCols: Optional[int] = None, tuImgSize: Tuple = (28, 28), randomChoice: bool = True, lClasses: Optional[List] = None, hF: Optional[plt.Figure] = None ) -> plt.Figure:
    """
    Shows a grid of images, each taken from a row of `mX`, titled by its index and label.

    Input:
      - mX           - Data matrix, each row is reshaped into `tuImgSize`.
      - vY           - Labels, indexed by the row index of the sample.
      - numRows      - Number of rows of the grid.
      - numCols      - Number of columns of the grid. Defaults to `numRows`.
      - tuImgSize    - Shape of a single image. Length 2 is shown with a gray
                       color map, length 3 is shown as is.
      - randomChoice - If `True` each cell shows a row drawn by `np.random.choice()`,
                       otherwise cell `kk` shows row `kk`.
      - lClasses     - Optional names of the classes, indexed by the label value.
      - hF           - Optional existing figure. Its axes (`hF.axes`) are used in
                       order, hence it must hold at least `numRows * numCols` axes.
    Output:
      - hF           - The figure which holds the images.
    Raises:
      - ValueError   - If the length of `tuImgSize` is not 2 or 3.
    """

    numImgDims = len(tuImgSize)
    # Validate before any figure is created so an invalid call does not leave an empty figure behind
    if numImgDims not in (2, 3):
        raise ValueError(f'The length of the image size tuple is {numImgDims} which is not supported')

    numSamples = mX.shape[0]

    if numCols is None:
        numCols = numRows

    if hF is None:
        hF, vHa = plt.subplots(numRows, numCols, figsize = (numCols * 3, numRows * 3))
    else:
        vHa = hF.axes

    vHa = np.atleast_1d(vHa).flat #<! Uniform 1D access, supports the case of a single axes

    for kk in range(numRows * numCols):
        idx = np.random.choice(numSamples) if randomChoice else kk
        mI  = np.reshape(mX[idx, :], tuImgSize)
        hA  = vHa[kk]

        if numImgDims == 2:
            hA.imshow(mI, cmap = 'gray')
        else:
            hA.imshow(mI)

        # Images need no ticks / tick labels
        hA.tick_params(axis = 'both', left = False, top = False, right = False, bottom = False,
                       labelleft = False, labeltop = False, labelright = False, labelbottom = False)

        labelVal = vY[idx] if lClasses is None else lClasses[vY[idx]]
        hA.set_title(f'Index = {idx}, Label = {labelVal}')

    return hF

def PlotLabelsHistogram( vY: NDArray, hA: Optional[plt.Axes] = None, lClass: Optional[List] = None, xLabelRot: Optional[int] = None ) -> plt.Axes:
    """
    Draws a bar plot of the number of occurrences of each unique value in `vY`.

    Input:
      - vY        - Labels vector.
      - hA        - Optional axes to draw on. A new (8, 6) figure is created if `None`.
      - lClass    - Optional tick labels which replace the label values on the x axis.
      - xLabelRot - Optional rotation applied to each x tick label.
    Output:
      - hA        - The axes which holds the bar plot.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = (8, 6))

    vUniqueVal, vNumOccur = np.unique(vY, return_counts = True)
    lTickText = [f'{uniqueVal}' for uniqueVal in vUniqueVal]

    hA.bar(vUniqueVal, vNumOccur, width = 0.9, align = 'center')
    hA.set_title('Histogram of Classes / Labels')
    hA.set_xlabel('Class')
    hA.set_xticks(vUniqueVal, lTickText)
    hA.set_ylabel('Count')

    # User supplied names override the numeric tick text
    if lClass is not None:
        hA.set_xticklabels(lClass)

    if xLabelRot is not None:
        for hTickLabel in hA.get_xticklabels():
            hTickLabel.set_rotation(xLabelRot)

    return hA

def PlotScatterData( mX: NDArray, vL: Optional[NDArray] = None, lLabel: Optional[List[str]] = None, hA: Optional[plt.Axes] = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = ELM_SIZE_DEF, edgeColor: str = EDGE_COLOR ) -> plt.Axes:
    """
    Scatter plot of the first 2 columns of `mX`, one scatter group per unique value of `vL`.

    Input:
      - mX         - Data matrix, columns 0 and 1 are the plotted coordinates.
      - vL         - Optional label per sample. All samples share a single label if `None`.
      - lLabel     - Optional legend names, entry `ii` names the `ii`-th unique
                     value of `vL` (Sorted order of `np.unique()`).
                     If `None`, each group is named by its own label value
                     (Cast to an integer).
      - hA         - Optional axes to draw on. A new figure is created if `None`.
      - figSize    - Size of the figure created when `hA` is `None`.
      - markerSize - Marker size passed to `scatter()`.
      - edgeColor  - Marker edge color passed to `scatter()`.
    Output:
      - hA         - The axes which holds the scatter plot.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    if vL is None:
        vL = np.zeros(mX.shape[0])

    vL = np.asarray(vL) #<! Guarantees element wise comparison below
    vU = np.unique(vL)

    if lLabel is None:
        # Name each group by the value it holds.
        # Building names over `range(max + 1)` and indexing by position mislabels
        # non contiguous labels (e.g. {0, 2}) and breaks on negative labels.
        lLabel = [str(int(valU)) for valU in vU]

    for ii, valU in enumerate(vU):
        vIdx = vL == valU
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = markerSize, edgecolor = edgeColor, label = lLabel[ii])

    hA.set_xlabel('${{x}}_{{1}}$')
    hA.set_ylabel('${{x}}_{{2}}$')
    hA.grid()
    hA.legend()

    return hA

## Dimensionality Reduction

Given a data set $\mathcal{X} = \{ \boldsymbol{x}_{i} \in \mathbb{R}^{D} \}_{i = 1}^{N}$, _Dimensionality Reduction_ seeks $\mathcal{Z} = \{ \boldsymbol{z}_{i} \in \mathbb{R}^{d} \}_{i = 1}^{N}$ where $d \leq D$ and some properties of the data are preserved.  
The _Dimensionality Reduction_ function maps each sample from the high dimensional space to a sample in the low dimensional space.  

Motivations for Dimensionality Reduction
 * **Feature Engineering**  
   Dimensionality reduction may help algorithms "focus" on the important relations between samples.  
   Specifically, for High Dimensional Data it may reduce issues caused by "Curse of Dimensionality" and make the distance useful.  
   <font color='magenta'>Example</font>: A common synergy is _Dimensionality Reduction_ followed by _Clustering_.  
   <font color='magenta'>Example</font>: Using PCA to enhance the data SNR.
 * **Visualization**  
   High Dimensional data cannot be visualized geometrically.  
   Yet such analysis is important in the _EDA_ and _Results Analysis_.  
   _Dimensionality Reduction_ to a 2D / 3D, with the assumption the important relations are well kept in the process, may assist with the analysis.
 * **Run Time Optimization**  
   Most algorithms have a run time complexity dependency on the dimension of the data.  
   Dimensionality reduction, by nature, will accelerate such cases.  
   <font color='magenta'>Example</font>: Using PCA to reduce 50% of the dimensions while preserving 95% of the energy.

Broadly, _Dimensionality Reduction_ is split into 2 approaches:

 * Linear  
   The low dimension data is a linear / affine function of the high dimensional data: $\boldsymbol{z}_{i} = \boldsymbol{W} \boldsymbol{x}_{i} + \boldsymbol{b}$.  
   The matrix and bias vector are the solution of the algorithm.  
   Methods such as [Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (PCA), [Independent Component Analysis](https://en.wikipedia.org/wiki/Independent_component_analysis) (ICA), [Canonical Correlation Analysis](https://en.wikipedia.org/wiki/Canonical_correlation) (CCA).
 * Non Linear  
   The low dimension sample is a non linear function of the high dimension sample.  
   Methods such as [Kernel PCA](https://en.wikipedia.org/wiki/Kernel_principal_component_analysis), [MultiDimensional Scaling](https://en.wikipedia.org/wiki/Multidimensional_scaling).  
   Some Non Linear Methods are called _Manifold Learning_ which try to learn the local metric of the data: Laplacian Eigenmaps, [T-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) and UMAP.

<br>

* <font color='brown'>(**#**)</font> One may apply Dimensionality Reduction on labeled data.  
There are specialized algorithms for such case, the most known is the [Linear Discriminant Analysis](https://en.wikipedia.org/wiki/Linear_discriminant_analysis) (LDA) algorithm.
* <font color='brown'>(**#**)</font> _Dimensionality Reduction_ is potentially the _Optimal Feature Engineering_.  
* <font color='brown'>(**#**)</font> _Feature Selection_ is also a type of _Dimensionality Reduction_.
* <font color='brown'>(**#**)</font> One may use _Auto Encoder_ like models to apply Deep Learning based _Dimensionality Reduction_.

### Comparison of Methods

|                     	| PCA                          	| t-SNE (SciKit Learn) 	| UMAP                             	| Remarks                                                              	|
|---------------------	|------------------------------	|----------------------	|----------------------------------	|----------------------------------------------------------------------	|
| Type                	| Linear                       	| Non Linear           	| Non Linear                       	| `t-SNE` and `UMAP` are Manifold Learning based on Neighborhood Graph 	|
| Objective           	| Maximize Energy Preservation 	| Random Walk Graph    	| Uniform Distance on the Manifold 	|                                                                      	|
| Preserves           	| Global                       	| Local                	| Local + Global (Some)            	| `UMAP` global preservation is mainly due to better initialization     |
| Speed               	| ✈ Fast                       	| 🐢 Slow               	| 🚗 Medium                         	| `OpenTSNE` is faster than `UMAP`                        	|
| Use in ML Pipeline  	| ✅ Yes                        	| ❌ No                 	| ✅ Yes                            	| `OpenTSNE` adds _Out of Sample_ support (`transform()` method)       	|
| Clustering Friendly 	| ❌ No                         	| ✅ Yes                	| ✅ Yes                            	| Clustering for the EDA phase                                         	|
| Interpretability    	| ✅ High                       	| ❌ Low                	| ❌ Low                            	|                                                                      	|
| Inverse Transform   	| ✅ Yes                        	| ❌ No                 	| ❌ No                             	|                                                                      	|


* <font color='brown'>(**#**)</font> One may train / fit a _Parametric Model_ on the results of non parametric models (`t-SNE` / `UMAP`) in order to have _Out of Sample Support_.  
  See [Laurens van der Maaten - Learning a Parametric Embedding by Preserving Local Structure](https://proceedings.mlr.press/v5/maaten09a.html) which is an extension to `t-SNE` by the `t-SNE` author.

In [None]:
# Parameters

# Dimensionality Reduction
dataDim = 2 #<! Number of components of the low dimensional data (`n_components` of PCA / UMAP)
numOutSampleSamples = 20 #<! Number of samples held out as Out of Sample data (`test_size` of the split)

# Visualization
numImg = 5 #<! NOTE: Reassigned to 3 in the cell which plots the data

## Generate / Load Data

The notebook uses [SciKit Learn's Digits Dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html).  
It is similar to the MNIST dataset where the images are smaller (8 x 8) and the number of samples is 1797.

In [None]:
# Generate / Load Data

mX, vY = load_digits(return_X_y = True) #<! Features matrix and labels vector
numSamples = mX.shape[0] #<! Number of samples (Rows of `mX`)
numCls = len(np.unique(vY)) #<! Number of distinct label values

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')
print(f'The unique values of the labels: {np.unique(vY)}')

In [None]:
# Plot the Data
numImg = 3 #<! Overrides the value set in the parameters cell

# Create the `numImg x numImg` grid here so the figure is (4, 4) instead of the size `PlotMnistImages()` would pick
hF, _ = plt.subplots(numImg, numImg, figsize = (4, 4))
# `numCols` is not given hence the grid is `numImg x numImg`, each cell shows a randomly chosen (8 x 8) sample
hF = PlotMnistImages(mX, vY, numImg, tuImgSize = (8, 8), randomChoice = True, hF = hF)

In [None]:
# Plot the Labels Histogram
# Number of samples per label value, drawn on an explicitly created (8, 6) figure
hF, hA = plt.subplots(figsize = (8, 6))
hA = PlotLabelsHistogram(vY, hA = hA);

## Dimensionality Reduction (Manifold Learning) by PCA

The [`PCA()`](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) class is part of SciKit's [`decomposition` module](https://scikit-learn.org/stable/api/sklearn.decomposition.html).  

The PCA algorithm finds the directions which preserve the most energy (Explain the most variance) of the data.  
Then it projects the data on those vectors.

The main parameters:

 - `n_components` - Sets the number of dimensions of the low dimension data.  
   If `n_components` is set to a number in the $\left( 0, 1 \right)$ range, the number of components will be automatically calculated to keep at least an `n_components` fraction of the variance.
 - `whiten` - Ensures the transformed data is white and has unit variance in each direction. Usually not used as it will lose scaling information (Ratio of values between data).

In [None]:
# Dimensionality Reduction with PCA

oDimRed = PCA(n_components = dataDim) #<! Keeps `dataDim` components
mZ      = oDimRed.fit_transform(mX) #<! Fits on all samples and returns their low dimensional representation

# Per component ratio and its sum (Formatted as percentage)
print(f'The explained variance ratio by the first {dataDim} components: {oDimRed.explained_variance_ratio_}')
print(f'The sum of the explained variance ratio by the first {dataDim} components: {np.sum(oDimRed.explained_variance_ratio_):0.2%}')

In [None]:
# Plot the PCA Result
# Scatter of the 2 PCA components, one group per digit label
hF, hA = plt.subplots(figsize = (6, 6))
PlotScatterData(mZ, vY, hA = hA)

# The title reports the total explained variance ratio of the kept components
hA.set_title(f'PCA: Explained Variance: {np.sum(oDimRed.explained_variance_ratio_):0.2%}');

In [None]:
# Split Data
# Splits the sample indices (Not the data itself): `numOutSampleSamples` indices are held out, stratified by the labels
vDataIdx, vOutOfSampleIdx = train_test_split(range(numSamples),test_size = numOutSampleSamples, random_state = seedNum, stratify = vY)

In [None]:
# Fit a Model
# The model is refit on the In Sample rows only, the held out rows are only transformed
mZData = oDimRed.fit_transform(mX[vDataIdx, :]) #<! In Sample Data
mZOut = oDimRed.transform(mX[vOutOfSampleIdx, :]) #<! Out of Sample Data

In [None]:
# Out of Sample Data Results
hF, hA = plt.subplots(figsize = (6, 6))
PlotScatterData(mZData, vY[vDataIdx], hA = hA)
# Fade the scatter collections drawn so far (In Sample) so the Out of Sample points, drawn next, stand out
for hChild in hA.get_children():
    if isinstance(hChild, mpl.collections.PathCollection):
        hChild.set_alpha(0.075)
PlotScatterData(mZOut, vY[vOutOfSampleIdx], hA = hA)
hA.set_title(f'PCA: In Sample and Out of Sample Data');
hA.legend().set_visible(False) #<! Hides the legend

In [None]:
# Explained Variance

# `1 - math.ulp(1.0)` is a float just below 1, namely a fractional `n_components` which asks for (Practically) all the variance
oDimRed = PCA(n_components = 1 - math.ulp(1.0)) #<! Number must be less than 1
mZ      = oDimRed.fit_transform(mX)

print(f'The number of components to explain 100% variance: {oDimRed.n_components_}')

* <font color='red'>(**?**)</font> Explain the number of components. What limits the number of components?

In [None]:
# Plot the Variance per Component

# One bar per component (Indices start at 1), its height is the component's explained variance ratio
hF, hA = plt.subplots(figsize = (8, 6))
hA.bar(range(1, len(oDimRed.explained_variance_ratio_) + 1), oDimRed.explained_variance_ratio_, width = 0.9, align = 'center')
hA.set_xlabel('Component Index')
hA.set_ylabel('Variance Explained')
hA.set_title('Variance Explained by Each Component');

In [None]:
# Plot the Variance per Component and the Cumulative Variance

vCompIdx = range(1, len(oDimRed.explained_variance_ratio_) + 1) #<! Component indices start at 1

hF, hA = plt.subplots(figsize = (8, 6))
hA.bar(vCompIdx, oDimRed.explained_variance_ratio_, width = 0.9, align = 'center')
hA.plot(vCompIdx, np.cumsum(oDimRed.explained_variance_ratio_), 'r-o', label = 'Cumulative Variance')
hA.set_xlabel('Component Index')
hA.set_ylabel('Variance Explained')
hA.set_title('Variance Explained by Each Component')
hA.legend(); #<! Without it the label of the cumulative line is never displayed

## Dimensionality Reduction (Manifold Learning) by UMAP

![](https://i.imgur.com/VXW337r.png)
<!-- ![](https://i.postimg.cc/P5X5kgfZ/Diagrams-UMAP002.png) -->

The UMAP algorithm is available through the [`UMAP`](https://github.com/lmcinnes/umap) package.  
It implements the known `fit()`, `transform()` API of SciKit Learn.

The main parameters:

 - `n_components` - Sets the number of dimensions of the low dimension data.
 - `n_neighbors` - Balances between local structure (Small number) and global structure (Large number).
 - `min_dist` - Sets the density of the points in low dimension.
 - `metric` - Defines the metric to use to evaluate the distance between samples.

In [None]:
# Plot Dimensionality Reduction Results

def PlotDimRedResults( mX: NDArray, vL: NDArray, /, *, numNeighbors: int = 15, minDist: float = 0.1, metricType: str = 'euclidean' ) -> None:
    """
    Fits UMAP on `mX` and shows a scatter plot of the embedding grouped by `vL`.

    The embedding dimension is taken from the notebook level `dataDim`.

    Input:
      - mX           - Data matrix to embed.
      - vL           - Label per sample, sets the scatter groups.
      - numNeighbors - Passed to UMAP as `n_neighbors`.
      - minDist      - Passed to UMAP as `min_dist`.
      - metricType   - Passed to UMAP as `metric`.
    """

    oDimRed = UMAP(n_components = dataDim, n_neighbors = numNeighbors, min_dist = minDist, metric = metricType)
    mZ = oDimRed.fit_transform(mX)

    hF, hA = plt.subplots(figsize = (6, 6))
    PlotScatterData(mZ, vL, hA = hA) #<! Use the labels given by the caller, not the global `vY`

    hA.set_title(f'UMAP: numNeighbors = {numNeighbors}, minDist = {minDist}, metric = {metricType}')

In [None]:
# Auxiliary Function

# Binds the data (`mX`, `vY`) so the caller only has to supply the 3 UMAP parameters
hPlotDimRedResults = lambda numNeighbors, minDist, metricType: PlotDimRedResults(mX, vY, numNeighbors = numNeighbors, minDist = minDist, metricType = metricType)

In [None]:
# Interactive Plot
# `continuous_update = False`: The sliders trigger a (Costly) UMAP fit only once the value is released
numNeighborsSlider = IntSlider(min = 2, max = 21, step = 1, value = 15, continuous_update = False, layout = Layout(width = '30%'))
minDistSlider = FloatSlider(min = 0.05, max = 1.0, step = 0.05, value = 0.1, continuous_update = False, layout = Layout(width = '30%'))
# `Dropdown` has no `continuous_update` trait, passing it only raises a deprecation warning about an unrecognized argument
metricDropdown = Dropdown(options = ['euclidean', 'manhattan', 'cosine'], value = 'euclidean', layout = Layout(width = '30%'))
interact(hPlotDimRedResults, numNeighbors = numNeighborsSlider, minDist = minDistSlider, metricType = metricDropdown);

### Out of Sample Support  

Many of the advanced Manifold Learning methods are _Non Parametric_ (t-SNE, UMAP).  
Hence, their ability to support _Out of Sample_ data is tricky.  

One way to overcome it is to build a _Regression Model_ to learn a parametric model of the transformation.  
Yet _UMAP_ offers a native support of Out of Sample data (So does `OpenTSNE`) with its `transform()` method.

In [None]:
# Split Data
# Splits the sample indices (Not the data itself): `numOutSampleSamples` indices are held out, stratified by the labels
vDataIdx, vOutOfSampleIdx = train_test_split(range(numSamples),test_size = numOutSampleSamples, random_state = seedNum, stratify = vY)

In [None]:
# Fit a Model
# UMAP is fit on the In Sample rows only, the held out rows are embedded by `transform()`
oDimRed = UMAP(n_components = dataDim, n_neighbors = 10, min_dist = 0.25, metric = 'euclidean')
mZData = oDimRed.fit_transform(mX[vDataIdx, :]) #<! In Sample Data
mZOut = oDimRed.transform(mX[vOutOfSampleIdx, :]) #<! Out of Sample Data

In [None]:
# Out of Sample Data Results
hF, hA = plt.subplots(figsize = (6, 6))
PlotScatterData(mZData, vY[vDataIdx], hA = hA)
# Fade the scatter collections drawn so far (In Sample) so the Out of Sample points, drawn next, stand out
for hChild in hA.get_children():
    if isinstance(hChild, mpl.collections.PathCollection):
        hChild.set_alpha(0.025)
PlotScatterData(mZOut, vY[vOutOfSampleIdx], hA = hA)
hA.set_title(f'UMAP: In Sample and Out of Sample Data');
hA.legend().set_visible(False) #<! Hides the legend

In [None]:
# Image for Slides

# import seaborn as sns
# sns.set_theme(style = 'ticks', context = 'talk')
# plt.style.use('dark_background') 

# hF, hA = plt.subplots(figsize = (6, 6))
# lLegLab = ['_Ignore'] * numCls
# PlotScatterData(mZData, vY[vDataIdx], lLabel = lLegLab, hA = hA)
# for hChild in hA.get_children():
#     if isinstance(hChild, mpl.collections.PathCollection):
#         hChild.set_alpha(0.035)
# PlotScatterData(mZOut, vY[vOutOfSampleIdx], hA = hA)
# # hA.set_title(f'UMAP: In Sample and Out of Sample Data');
# hA.legend(loc = 'upper center', ncol = numCls, fontsize = '6')
# hA.set_xticks([])
# hA.set_yticks([])
# hF.set_tight_layout(True)

# hF.savefig('TMP.svg', transparent = True)