[![Fixel Algorithms](https://i.imgur.com/AqKHVZ0.png)](https://fixelalgorithms.gitlab.io)

# AI Program

## Machine Learning - UnSupervised Learning - Clustering - K-Means for MNIST 1D Features

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 06/09/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0058ClusteringKMeans.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import ConfusionMatrixDisplay

from lightgbm import LGBMClassifier

# Miscellaneous
import math
import os
from platform import python_version
import random

# Typing
from typing import Callable, Dict, List, Optional, Self, Set, Tuple, Union
from numpy.typing import ArrayLike, NDArray

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Suppress Multi Threaded Warnings in SciKit Learn K-Means & GMM
# The `message` argument is a regular expression, hence the wildcard is `.*` (Any character, any number of times)
import warnings
warnings.filterwarnings('ignore', message = '.*Windows with MKL.*')
warnings.filterwarnings('ignore', message = '.*does not have valid feature names.*')

# Seed both NumPy's and Python's random generators for reproducible runs
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

# True when the string representation of the active IPython shell mentions Google Colab
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

# Default styling of figures and plot elements
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Size used to reshape a flattened MNIST image
TU_MNIST_IMG_SIZE = (28, 28)

# The 10 digit classes: As a list of labels and as a map from label to its display string
L_CLASSES_MNIST = list(range(10))
D_CLASSES_MNIST = {digitVal: str(digitVal) for digitVal in range(10)}

π = math.pi

In [None]:
# Courses Packages


In [None]:
# General Auxiliary Functions

def PlotMnistImages( mX: NDArray, vY: NDArray, numRows: int, numCols: Optional[int] = None, tuImgSize: Tuple = (28, 28), randomChoice: bool = True, lClasses: Optional[List] = None, hF: Optional[plt.Figure] = None ) -> plt.Figure:
    """
    Plots a grid of images taken from the rows of `mX`, each titled by its index and label.
    Input:
      - mX           - Data matrix. The row `mX[idx, :]` is reshaped into `tuImgSize` to form an image.
      - vY           - Labels, indexed by the same index as the rows of `mX`.
      - numRows      - Number of rows of the grid.
      - numCols      - Number of columns of the grid. When `None`, it is set to `numRows`.
      - tuImgSize    - Shape of a single image. 2 elements are displayed with the `gray` color map,
                       3 elements are displayed with the default settings of `imshow()`.
      - randomChoice - If `True`, each cell shows a sample drawn by `np.random.choice()` (Repetitions are possible).
                       If `False`, cell `kk` shows sample `kk`. Cells beyond the number of samples are left empty.
      - lClasses     - Optional names of the classes. If given, the title shows `lClasses[vY[idx]]`.
      - hF           - Optional existing figure. Its axes are used instead of creating a new figure.
    Output:
      - hF           - The figure which holds the images.
    Raises:
      - ValueError   - If the length of `tuImgSize` is neither 2 nor 3.
    """

    numSamples = mX.shape[0]

    if numCols is None:
        numCols = numRows

    # Validate the image size once, before any figure is created
    if len(tuImgSize) not in (2, 3):
        raise ValueError(f'The length of the image size tuple is {len(tuImgSize)} which is not supported')
    cmapName = 'gray' if len(tuImgSize) == 2 else None #<! `None` keeps the default of `imshow()`

    tFigSize = (numCols * 3, numRows * 3)

    if hF is None:
        hF, hA = plt.subplots(numRows, numCols, figsize = tFigSize)
    else:
        hA = hF.axes

    hA = np.atleast_1d(hA) #<! To support numImg = 1
    hA = hA.flat

    numCells = numRows * numCols
    # Sequential display can not show more images than the available samples
    numImg = numCells if randomChoice else min(numCells, numSamples)

    for kk in range(numImg):
        idx = np.random.choice(numSamples) if randomChoice else kk
        mI  = np.reshape(mX[idx, :], tuImgSize)

        hA[kk].imshow(mI, cmap = cmapName)
        hA[kk].tick_params(axis = 'both', left = False, top = False, right = False, bottom = False, 
                           labelleft = False, labeltop = False, labelright = False, labelbottom = False)
        if lClasses is None:
            hA[kk].set_title(f'Index = {idx}, Label = {vY[idx]}')
        else:
            hA[kk].set_title(f'Index = {idx}, Label = {lClasses[vY[idx]]}')

    for kk in range(numImg, numCells):
        hA[kk].set_axis_off() #<! Hide the cells which have no image to show

    return hF

def PlotLabelsHistogram( vY: NDArray, hA: Optional[plt.Axes] = None, lClass: Optional[List] = None, xLabelRot: Optional[int] = None ) -> plt.Axes:
    """
    Draws a bar plot of the number of occurrences of each unique value in `vY`.
    Input:
      - vY        - Labels vector.
      - hA        - Optional axes to draw on. When `None`, a new (8, 6) figure is created.
      - lClass    - Optional tick labels which replace the default ones (The unique values themselves).
      - xLabelRot - Optional rotation applied to the tick labels of the x axis.
    Output:
      - hA        - The axes holding the bar plot.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = (8, 6))

    vUniqueVal, vValCount = np.unique(vY, return_counts = True)
    lTickText = [f'{labelVal}' for labelVal in vUniqueVal] #<! Default tick text: The label value

    hA.bar(vUniqueVal, vValCount, width = 0.9, align = 'center')
    hA.set_title('Histogram of Classes / Labels')
    hA.set_xlabel('Class')
    hA.set_xticks(vUniqueVal, lTickText)
    hA.set_ylabel('Count')
    if lClass is not None:
        hA.set_xticklabels(lClass) #<! Override the default tick text

    if xLabelRot is None:
        return hA

    for hTickLabel in hA.get_xticklabels():
        hTickLabel.set_rotation(xLabelRot)

    return hA

def PlotConfusionMatrix(vY: NDArray, vYPred: NDArray, normMethod: Optional[str] = None, hA: Optional[plt.Axes] = None, 
                        lLabels: Optional[List] = None, dScore: Optional[Dict] = None, titleStr: str = 'Confusion Matrix', 
                        xLabelRot: Optional[int] = None, valFormat: Optional[str] = None) -> Tuple[plt.Axes, NDArray]:
    """
    Calculates the confusion matrix of `vY` vs. `vYPred` and displays it.
    Input:
      - vY         - The reference labels.
      - vYPred     - The predicted labels.
      - normMethod - Forwarded as the `normalize` argument of `confusion_matrix()`.
      - hA         - Optional axes, forwarded as the `ax` argument of `ConfusionMatrixDisplay.plot()`.
      - lLabels    - Optional display labels of the classes.
      - dScore     - Optional dictionary of `{scoreName: scoreVal}`. Each pair is appended to the title.
      - titleStr   - The base title of the plot.
      - xLabelRot  - Optional rotation applied to the tick labels of the x axis.
      - valFormat  - Forwarded as the `values_format` argument of `ConfusionMatrixDisplay.plot()`.
    Output:
      - hA         - The axes used by the display.
      - mConfMat   - The calculated confusion matrix.
    """

    # Calculation of Confusion Matrix
    mConfMat = confusion_matrix(vY, vYPred, normalize = normMethod)
    oConfMat = ConfusionMatrixDisplay(mConfMat, display_labels = lLabels)
    oConfMat = oConfMat.plot(ax = hA, values_format = valFormat)
    hA = oConfMat.ax_
    if dScore: #<! An empty dictionary leaves the title as is
        lScoreStr = [f' {scoreName} = {scoreVal:0.2}' for scoreName, scoreVal in dScore.items()]
        titleStr  = f'{titleStr}:' + ','.join(lScoreStr)
    hA.set_title(titleStr)
    hA.grid(False)
    if xLabelRot is not None:
        for xLabel in hA.get_xticklabels():
            xLabel.set_rotation(xLabelRot)

    return hA, mConfMat

## Clustering by K-Means for Features

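This notebook demonstrates how K-Means can be used for _Feature Engineering_: The non zero pixels of each binarized MNIST image are clustered, and the coordinates of the resulting centroids are used as the features of a classifier.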

In [None]:
# Parameters

# Data
numSamples = 5_000 #<! Number of samples kept in the stratified sub set of the data

# Model
numClusters = 8 #<! Number of K-Means clusters per image (Each centroid adds 2 features: x, y)

# Visualization
numImg = 3 #<! Number of rows of the images grid (The number of columns defaults to the same value)

## Generate / Load Data

In [None]:
# Load Data

# Fetch the `mnist_784` data set from OpenML as arrays (`as_frame = False`): Features `mX`, labels `vY`
mX, vY = fetch_openml('mnist_784', version = 1, return_X_y = True, as_frame = False, parser = 'auto')
vY = vY.astype(np.int_) #<! The labels are strings, convert to integer

# Used only to get a stratified sub set of the data
# Only the train part (`numSamples` samples) is kept, the test part is discarded
mX, _, vY, _ = train_test_split(mX, vY, train_size = numSamples, random_state = seedNum, stratify = vY)

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')
print(f'The unique values of the labels: {np.unique(vY)}')

### Plot Data

In [None]:
# Plot the Data

# Grid of `numImg` x `numImg` randomly chosen samples (Defaults of `PlotMnistImages()`)
hF = PlotMnistImages(mX, vY, numImg)

## Pre Processing

Applying a threshold to generate a binary image.

In [None]:
# Non Zero Pixels

# Boolean mask indexing: A 1D vector of all strictly positive pixel values over all images
vP = mX[mX > 0]

In [None]:
# Histogram of the Non Zero Pixel Values

hF, hA = plt.subplots(figsize = (8, 6))
# Assuming integer pixel values in {1, ..., 255}: The edges 1, 2, ..., 256 generate a unit width bin per value.
# With the last edge at 255 the values 254 and 255 share the (closed) last bin.
# `align = 'left'` centers each bar on the value itself.
hA.hist(vP, bins = np.arange(1, 257), density = True, align = 'left')
hA.set_title('Histogram of Non Zero Pixel Values')
hA.set_xlabel('Pixel Value')
hA.set_ylabel('Density'); #<! `density = True` -> The bars are normalized, not counts

* <font color='brown'>(**#**)</font> One could generate a threshold per class.

In [None]:
# Binary Data

# Find the maximal threshold such that each image has at least `numClusters` non zero pixels
# The search goes from the strictest threshold down to 0 (Keeps all non zero pixels)
for valThr in range(254, -1, -1):
    mB = mX > valThr #<! Binary images for the current threshold
    vNumPx = np.sum(mB, axis = 1) #<! Number of active pixels per image
    if np.min(vNumPx) >= numClusters:
        break
else:
    # No threshold works: K-Means can not fit `numClusters` clusters to fewer points
    raise ValueError(f'No threshold yields at least {numClusters} non zero pixels in each image')

print(f'The threshold value is {valThr}')

# At this point `mB` holds the binary images of the selected threshold

* <font color='brown'>(**#**)</font> The _Threshold_ is a _Hyper Parameter_ which should be optimized using Cross Validation.

## Feature Engineering

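Extract the coordinates of the non zero pixels of each binary image, cluster them using K-Means and use the coordinates of the centroids as the features of the image.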

In [None]:
# Extract the Coordinates of Image

def ExtractCoordinates(vB: NDArray, /, *,  tuImgSize: Tuple = TU_MNIST_IMG_SIZE) -> NDArray:
    """
    Extracts the coordinates of the non zero elements of a flattened image.
    Input:
      - vB        - Flattened image, reshaped into `tuImgSize`.
      - tuImgSize - The shape of the image (2D).
    Output:
      - A matrix with a row per non zero element. The columns are (x, y) = (Column index, Row index).
    """

    vRowIdx, vColIdx = np.nonzero(np.reshape(vB, tuImgSize))

    # Feature Matrix: (x, y)
    return np.column_stack((vColIdx, vRowIdx))

In [None]:
# Cluster Model

# K-Means with `numClusters` clusters, seeded for reproducible initialization
oKMeans = KMeans(n_clusters = numClusters, random_state = seedNum)

In [None]:
# Cluster Image

# Pick a random binary image (Python's `random`, seeded in the configuration)
rdnIdx = random.randrange(numSamples)
vB = mB[rdnIdx]

# Cluster the (x, y) coordinates of the non zero pixels of the image
mF = ExtractCoordinates(vB)
oKMeans = oKMeans.fit(mF)
vL = oKMeans.labels_ #<! Cluster index per non zero pixel
mC = oKMeans.cluster_centers_ #<! Centroids, a row per cluster: (x, y)

In [None]:
# Plot Clustered Image

# The binary image with its non zero pixels colored by their cluster index
mI = np.reshape(vB, TU_MNIST_IMG_SIZE)
hF, hA = plt.subplots(figsize = (4, 4))
hA.imshow(mI, cmap = 'gray')
# Column 0 of `mF` is x (Image column), column 1 is y (Image row)
hA.scatter(mF[:, 0], mF[:, 1], c = vL, s = MARKER_SIZE_DEF, cmap = 'tab10', edgecolor = 'k')
hA.set_title(f'Clustered Image (Index = {rdnIdx})')
hA.axis('off');

In [None]:
# Plot Centroids Image

# The binary image with the K-Means centroids overlaid
mI = np.reshape(vB, TU_MNIST_IMG_SIZE)
hF, hA = plt.subplots(figsize = (4, 4))
hA.imshow(mI, cmap = 'gray')
# Column 0 of `mC` is x (Image column), column 1 is y (Image row)
hA.scatter(mC[:, 0], mC[:, 1], c = 'yellow', s = MARKER_SIZE_DEF * 10, marker = 'X', edgecolor = 'k')
hA.set_title(f'Centroids Image (Index = {rdnIdx})') #<! Distinguish from the clustered pixels plot
hA.axis('off');

In [None]:
# Extract Features Matrix for All Images

def ExtractFeaturesMatrix(mB: NDArray, numClusters: int, /, *, tuImgSize: Tuple = TU_MNIST_IMG_SIZE, seedNum: int = seedNum) -> NDArray:
    """
    Builds the features matrix: Each image is represented by the centroids of a K-Means
    clustering of the coordinates of its non zero pixels.
    Input:
      - mB          - Binary images, a flattened image per row.
      - numClusters - Number of clusters (Centroids) per image.
      - tuImgSize   - The shape of a single image.
      - seedNum     - The `random_state` of the K-Means model.
    Output:
      - mF          - Features matrix of shape `(numSamples, 2 * numClusters)`.
                      Each row is the flattened centroids: `[x_1, y_1, x_2, y_2, ...]`.
    Remarks:
      - The order of the centroids within a row is the order K-Means returns them.
        It is not aligned between images.
      - Each image is assumed to have at least `numClusters` non zero pixels.
        Otherwise the K-Means fit is expected to fail (Verify against the SciKit Learn documentation).
    """

    numSamples = mB.shape[0]
    mF         = np.zeros((numSamples, numClusters * 2)) #<! Chain coordinates

    # A single model object, refitted per image
    oKMeans = KMeans(n_clusters = numClusters, random_state = seedNum)

    for ii, vB in enumerate(mB):
        mFi    = ExtractCoordinates(vB, tuImgSize = tuImgSize) #<! (x, y) per non zero pixel
        mCi    = oKMeans.fit(mFi).cluster_centers_ #<! (numClusters, 2)
        mF[ii] = mCi.flatten()

    return mF

* <font color='brown'>(**#**)</font> In practice there are $K!$ permutations for the order of the _Centroids_.  
One should create a stable way to choose a specific permutation: Minimum length, minimum reconstruction error, etc...

In [None]:
# Generate the Feature Matrix

# A row per image: The flattened (x, y) coordinates of its `numClusters` centroids
mF = ExtractFeaturesMatrix(mB, numClusters)

In [None]:
# Classifier
# LightGBM classifier with its default hyper parameters, trained on the centroids features
oCls = LGBMClassifier()
oCls = oCls.fit(mF, vY)

# The score is evaluated on the same data used for the fit, hence it is an optimistic estimate
clsScore = oCls.score(mF, vY)
print(f'The classifier score on the training data is {clsScore:0.2%}')

In [None]:
# Confusion Matrix

# Predictions on the training data (The same data used for the fit)
vYPred = oCls.predict(mF)

hF, hA = plt.subplots(figsize = (7, 7))
# `normMethod` is forwarded as the `normalize` argument of `confusion_matrix()`
PlotConfusionMatrix(vY, vYPred, normMethod = 'true', hA = hA, lLabels = L_CLASSES_MNIST, 
                    dScore = {'Accuracy': accuracy_score(vY, vYPred)}, titleStr = 'Confusion Matrix', xLabelRot = 45, valFormat = '.2f');

* <font color='green'>(**@**)</font> Use Cross Validation to optimize the _Hyper Parameters_ of the model and prevent _Overfit_.