[![Fixel Algorithms](https://i.imgur.com/AqKHVZ0.png)](https://fixelalgorithms.gitlab.io)

# AI Program

## Machine Learning - Supervised Learning - Classification - Feature Engineering by K-NN

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.2.000 | 09/12/2025 | Royi Avital | Optimized the loop of the distance calculation for the features    |
| 0.1.000 | 20/11/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0058ClusteringKMeans.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Miscellaneous
import math
import os
from platform import python_version
import random

# Typing
from typing import Callable, Dict, List, Optional, Self, Set, Tuple, Union
from numpy.typing import ArrayLike, NDArray

# Visualization
import matplotlib.pyplot as plt

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

```python
valToFill = ???
```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seed NumPy's global RNG and Python's `random` module with the same value for reproducibility.
# `seedNum` is also passed as `random_state` to `train_test_split()` in later cells.
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

# True when the string representation of the active IPython shell contains `google.colab`
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

# Visualization defaults
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

π = math.pi

BASE_NAME   = 'FixelCourses'
DATA_FOLDER = 'DataSets'

# Keep the working directory up to, and including, the last case insensitive occurrence of `BASE_NAME`.
# The search is done on the reversed strings, hence the first match there is the last match in the path.
# If `BASE_NAME` is not part of the path, `find()` returns -1 and the slice keeps the whole working directory.
BASE_PATH = os.getcwd()[:(len(os.getcwd()) - (os.getcwd()[::-1].lower().find(BASE_NAME.lower()[::-1])))]
DATA_PATH = os.path.join(BASE_PATH, DATA_FOLDER)

In [None]:
# Courses Packages

from DataVisualization import PlotDecisionBoundaryClosure, PlotScatterData

In [None]:
# General Auxiliary Functions

class KnnFeatExtractor(TransformerMixin, BaseEstimator):
    """
    KNN Feature Extractor
    Extracts K-NN based features from the input data.
    Generates a matrix of shape (N, K * C) where N is the number of samples,
    K is the number of neighbors (`paramK`) and C is the number of classes seen in `fit()`.
    For each sample and each class:
    1. Find the K nearest training samples which belong to the class.
    2. Compute the distances to these neighbors.
    3. For k = 1, 2, ..., K the feature is the sum of the distances to the k nearest neighbors.

    The features are ordered class major: Column `ii * K + kk` holds the sum of the distances
    to the `kk + 1` nearest neighbors of the `ii` -th class (Classes ordered as in `np.unique()`).

    Remarks:
     - When the transformed samples are the ones used in `fit()`, each sample is
       expected to be its own nearest neighbor within its class (Zero distance).
    """
    def __init__(self, paramK: int = 5) -> None:
        """
        Parameters
        ----------
        paramK : int
            Number of nearest neighbors, per class, used to build the features.
        """

        self.paramK = paramK
        self.lModel = [] #<! Built in `fit()`, one `NearestNeighbors` model per class
        self.numCls = -1 #<! Set in `fit()`
        self.vCls   = None #<! Set in `fit()`

    def fit(self, mX: NDArray, vY: NDArray) -> Self:
        """
        Fits a nearest neighbors model per class.

        Parameters
        ----------
        mX : Array of shape (N, D)
            The training samples.
        vY : Array of shape (N,)
            The labels of the training samples.

        Returns
        -------
        self : The fitted extractor.
        """

        vY = np.asarray(vY) #<! Guarantees an element wise comparison below

        self.vCls   = np.unique(vY)
        self.numCls = len(self.vCls) #<! Assuming all classes are seen during training

        # The list is rebuilt on each call.
        # Appending to the existing list would accumulate the models of a previous `fit()`.
        self.lModel = [NearestNeighbors().fit(mX[vY == valCls]) for valCls in self.vCls]

        return self

    def transform(self, mX: NDArray, vY: Optional[NDArray] = None) -> NDArray:
        """
        Generates the features of the samples.

        Parameters
        ----------
        mX : Array of shape (N, D)
            The samples to transform.
        vY : Ignored
            Exists for API compatibility.

        Returns
        -------
        mF : Array of shape (N, paramK * numCls)
            The features, see the class documentation for the order of the columns.

        Raises
        ------
        ValueError
            If called before `fit()`.
        """

        if not self.lModel:
            raise ValueError('The extractor is not fitted, call `fit()` before `transform()`')

        numFeatures = self.paramK * self.numCls
        numSamples  = np.size(mX, 0) #<! Equivalent to `mX.shape[0]`

        mF = np.zeros((numSamples, numFeatures))

        for ii, oModel in enumerate(self.lModel): #<! Per class
            # Single query for all `paramK` neighbors, the distances are sorted in ascending order
            mD, _ = oModel.kneighbors(mX, n_neighbors = self.paramK, return_distance = True)
            # The cumulative sum along the neighbors generates the sums for k = 1, 2, ..., K at once
            mF[:, (ii * self.paramK):((ii + 1) * self.paramK)] = np.cumsum(mD, axis = 1)

        return mF

def GenSpiralData(numCls: int, numSamples: int, σ: float = 0.05, numRot: float = 1.75, seedNum: Optional[int] = None) -> tuple[NDArray, NDArray]:
    """
    Generates noisy 2D spiral arms, one arm per class.

    Each arm starts at the origin and its radius grows linearly towards 1.
    The arms are evenly spread over the circle by an angular offset of `2π * cc / numCls`.
    Along an arm the angle advances by a total of `numRot * π` radians.

    Parameters
    ----------
    numCls : int
        Number of spiral arms (classes).
    numSamples : int
        Number of samples per class.
    σ : float, optional
        Standard deviation of the Gaussian noise added to each coordinate.
    numRot : float, optional
        Scales the angular sweep of each arm (`numRot * π` radians in total).
    seedNum : int or None, optional
        Seed of the random number generator (For reproducibility).

    Returns
    -------
    mX : Array of shape (numCls * numSamples, 2)
         The 2D coordinates of the samples, stacked class after class.
    vY : Array of shape (numCls * numSamples,)
         The labels of the samples in {0, ..., numCls - 1}.
    """
    oRng = np.random.default_rng(seedNum)

    numPts = numCls * numSamples
    mX     = np.zeros((numPts, 2))
    vY     = np.zeros(numPts, dtype = np.int64)

    for clsIdx in range(numCls):
        # Uniform parameter in [0, 1) along the arm, used for both the angle and the radius
        vParam  = np.linspace(0.0, 1.0, numSamples, endpoint = False)
        vRadius = vParam

        # Per class angular offset + the sweep along the arm
        clsOffset = 2.0 * np.pi * clsIdx / numCls
        vAngle    = clsOffset + (numRot * np.pi * vParam)

        # Isotropic Gaussian noise, drawn for the 1st coordinate and then for the 2nd one
        vNoise1 = oRng.normal(0.0, σ, size = numSamples)
        vNoise2 = oRng.normal(0.0, σ, size = numSamples)

        # The rows of the current class
        vRows = slice(clsIdx * numSamples, (clsIdx + 1) * numSamples)

        mX[vRows, 0] = (vRadius * np.cos(vAngle)) + vNoise1
        mX[vRows, 1] = (vRadius * np.sin(vAngle)) + vNoise2
        vY[vRows]    = clsIdx

    return mX, vY

## Extracting Features by K-NN

### The Distance Weighted K-NN Classifier

The Probabilistic K-NN classifier is given by:

$$
P(x_i \in y_j) = \frac{ \sum_{k = 1}^{K} \left( \frac{1}{ {d}_{ik} } \cdot( {n}_{ik} \in {y}_{j}) \right)}{\sum_{k = 1}^{K} \frac{1}{d_{ik}} }
$$

where ${x}_{i}$ is the $i$ -th sample, ${y}_{j}$ is the $j$ -th class label, ${n}_{ik}$ is the $k$ -th nearest neighbor of ${x}_{i}$ and ${d}_{ik}$ is the distance between ${x}_{i}$ and ${n}_{ik}$.  
This estimator can be thought of as a weighted voting rule, where the neighbors which are closer to ${x}_{i}$ have more influence on the prediction of the label of ${x}_{i}$.

In general, the weighted estimator provides (More) calibrated probabilities when compared with the traditional estimator based on the label proportions of the nearest neighbors.  
The decision function reduces logarithmic loss (_Log Loss_).

Yet, one may use the sum of distances to $k$ neighbors of each class as a feature.

### The Features

One can create features, based on the training data and a parameter $K$ as following:

 - The $j$ -th feature of each sample is the sum of the distances from the sample to its $K$ nearest samples in the train data which are labeled with the $j$ -th label.

One can create such features for various $k$ values.  

The class `KnnFeatExtractor` implements such approach.

In [None]:
# Parameters

# Data
csvFileName = 'SpiralsMulti.csv'
csvFileName = 'Spirals.csv' #<! Overrides the assignment above, this is the file read by the `Load Data` cell

trainSamplesRatio = 0.8 #<! Passed as `train_size` to `train_test_split()`

# Model
numClusters = 8 #<! NOTE(review): Not referenced anywhere else in the visible notebook, confirm before removing

# Visualization
numGridPts = 501 #<! Passed as the 1st argument of `PlotDecisionBoundaryClosure()`

## Generate / Load Data

In [None]:
# Load Data
# Reads the CSV file `csvFileName` from the data folder into a data frame.

dataFolderPath = os.path.join(BASE_PATH, DATA_FOLDER)
dfData         = pd.read_csv(os.path.join(dataFolderPath, csvFileName))

mX = dfData.iloc[:, :-1].to_numpy() #<! All columns but the last one are the features
vY = dfData.iloc[:, -1].to_numpy() - 1  #<! Make labels start from 0 (Assumes the labels in the file start from 1)

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')
print(f'The unique values of the labels: {np.unique(vY)}')

### Plot Data

In [None]:
# Plot the Data
# Scatter plot of the whole data set (Before the split).
# `PlotScatterData()` is a course helper (`DataVisualization`), presumably it colors the samples by their labels.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mX, vY, hA)
hA.set_title('Data Scatter Plot');

### Split Train / Validation

In [None]:
# Train / Validation Split
# Stratified by the labels, `random_state` makes the split reproducible.

mXTrain, mXVal, vYTrain, vYVal = train_test_split(mX, vY, train_size = trainSamplesRatio, random_state = seedNum, stratify = vY)

In [None]:
# Plot the Data
# Scatter plot of the training set.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mXTrain, vYTrain, hA)
hA.set_title('Training Data Scatter Plot');

In [None]:
# Plot the Data
# Scatter plot of the validation set.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mXVal, vYVal, hA)
hA.set_title('Validation Data Scatter Plot');

## Feature Engineering

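This section builds features from the distances of each sample to its nearest neighbors, per class, in the training data.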

### KNN based Feature Generation

In [None]:
# Train the Knn Feature Extractor
# The extractor is a SciKit Learn Transformer which uses SciKit Learn Nearest Neighbors functionality
# With `paramK = 1` there is a single feature per class: The distance to the nearest training sample of the class.
oKnnFeatExt = KnnFeatExtractor(paramK = 1)
oKnnFeatExt = oKnnFeatExt.fit(mXTrain, vYTrain)

In [None]:
# Extract KNN Features (Train)
# NOTE(review): The transformed samples are the ones the extractor was fitted on.
# Presumably each sample is its own nearest neighbor within its class (Zero distance) - TODO confirm.

mF = oKnnFeatExt.transform(mXTrain)

In [None]:
# Display KNN Features (Train)
# The 2 columns of `mF` (One per class, as `paramK = 1`) are the axes of the plot.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mF, vYTrain, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Distance to 1st Nearest Neighbor from Class 0');
hA.set_ylabel(r'${f}_{2}$: Distance to 1st Nearest Neighbor from Class 1');

In [None]:
# Extract KNN Features (Validation)
# Overwrites `mF` which held the features of the training set.

mF = oKnnFeatExt.transform(mXVal)

In [None]:
# Display KNN Features (Validation)
# Same plot as for the training set, `mF` holds the features of the validation set at this point.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mF, vYVal, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Distance to 1st Nearest Neighbor from Class 0');
hA.set_ylabel(r'${f}_{2}$: Distance to 1st Nearest Neighbor from Class 1');

### Multi Class Case

In [None]:
# Multi Class Case
# Uses `dataFolderPath` defined by the `Load Data` cell.
# Overwrites `dfData`, `mX` and `vY` which held the previously loaded data set.
dfData         = pd.read_csv(os.path.join(dataFolderPath, 'SpiralsMulti.csv'))

mX = dfData.iloc[:, :-1].to_numpy() #<! All columns but the last one are the features
vY = dfData.iloc[:, -1].to_numpy() - 1  #<! Make labels start from 0 (Assumes the labels in the file start from 1)

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')
print(f'The unique values of the labels: {np.unique(vY)}')

In [None]:
# Plot the Data
# Scatter plot of the whole multi class data set (Before the split).

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mX, vY, hA)
hA.set_title('Data Scatter Plot');

In [None]:
# Train / Validation Split
# Stratified by the labels, `random_state` makes the split reproducible.
# Overwrites the train / validation sets of the previously loaded data set.

mXTrain, mXVal, vYTrain, vYVal = train_test_split(mX, vY, train_size = trainSamplesRatio, random_state = seedNum, stratify = vY)

In [None]:
# Train the Knn Feature Extractor
# With `paramK = 2` there are 2 features per class:
# The distance to the nearest neighbor and the sum of the distances to the 2 nearest neighbors.
paramK = 2

oKnnFeatExt = KnnFeatExtractor(paramK = paramK)
oKnnFeatExt = oKnnFeatExt.fit(mXTrain, vYTrain)

In [None]:
# Extract KNN Features
# Each output has `paramK` columns per class (See `KnnFeatExtractor.transform()`).

# Train
mFTrain = oKnnFeatExt.transform(mXTrain)
# Validation
mFVal = oKnnFeatExt.transform(mXVal)

In [None]:
# Display KNN Features
# Distance to the closest neighbor of each class
# The columns of `mFTrain` are class major (`ii * paramK + kk`).
# With `paramK = 2` the slice `0:4:2` selects columns 0 and 2: The 1st neighbor feature of class 0 and class 1.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mFTrain[:, 0:4:2], vYTrain, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Distance to 1st Nearest Neighbor of Class 0');
hA.set_ylabel(r'${f}_{2}$: Distance to 1st Nearest Neighbor of Class 1');

In [None]:
# Display KNN Features
# Distance to the closest neighbor of each class
# The columns of `mFTrain` are class major (`ii * paramK + kk`).
# With `paramK = 2` the slice `4:8:2` selects columns 4 and 6: The 1st neighbor feature of class 2 and class 3.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mFTrain[:, 4:8:2], vYTrain, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Distance to 1st Nearest Neighbor of Class 2');
hA.set_ylabel(r'${f}_{2}$: Distance to 1st Nearest Neighbor of Class 3');

In [None]:
# Display KNN Features
# Sum of distances to the 2 closest neighbors of each class
# The columns of `mFTrain` are class major (`ii * paramK + kk`).
# With `paramK = 2` the slice `1:4:2` selects columns 1 and 3: The 2 neighbors sum feature of class 0 and class 1.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mFTrain[:, 1:4:2], vYTrain, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Sum distances to 1st and 2nd Nearest Neighbors from Class 0');
hA.set_ylabel(r'${f}_{2}$: Sum distances to 1st and 2nd Nearest Neighbors from Class 1');

In [None]:
# Display KNN Features
# Sum of distances to the 2 closest neighbors of each class
# The columns of `mFTrain` are class major (`ii * paramK + kk`).
# With `paramK = 2` the slice `5:8:2` selects columns 5 and 7: The 2 neighbors sum feature of class 2 and class 3.

hF, hA = plt.subplots(figsize = (6, 6))
hA = PlotScatterData(mFTrain[:, 5:8:2], vYTrain, hA)
hA.set_title('KNN Features Scatter Plot');
hA.set_xlabel(r'${f}_{1}$: Sum distances to 1st and 2nd Nearest Neighbors from Class 2');
hA.set_ylabel(r'${f}_{2}$: Sum distances to 1st and 2nd Nearest Neighbors from Class 3');

## Linear Model

In [None]:
# Feature Transformer
# A new, not fitted, extractor instance. It is fitted as part of the pipeline.

paramK = 2
oKnnFeatExt = KnnFeatExtractor(paramK = paramK)

In [None]:
# Linear Model
# The classifier applied on top of the K-NN based features.

oCls = LogisticRegression(solver = 'lbfgs')

In [None]:
# Pipeline
# Chains the feature extractor and the classifier.
# Fitting the pipeline fits the extractor on the training data, transforms it and fits the classifier on the features.

oModelPipe = Pipeline([('Transformer', oKnnFeatExt), ('Classifier', oCls)])
oModelPipe = oModelPipe.fit(mXTrain, vYTrain)

In [None]:
# Display Decision Boundaries
# The decision regions of the pipeline with the training samples on top.

# Decision Boundary Plotter
# Course helper (`DataVisualization`). Presumably the arguments are the grid resolution and the axes limits - TODO confirm.
# The class labels and colors are hard coded for 4 classes.
PlotDecisionBoundary = PlotDecisionBoundaryClosure(numGridPts, -1.5, 1.5, -1.5, 1.5, lClsLabels = [0, 1, 2, 3], clsColors = lMatPltLibclr[:4])

hF, hA = plt.subplots(figsize = FIG_SIZE_DEF)
hA = PlotDecisionBoundary(oModelPipe.predict, hA = hA)
hA = PlotScatterData(mXTrain, vYTrain, hA);

In [None]:
# Validation
# The same decision regions with the validation samples on top.
# Uses `PlotDecisionBoundary` created by the `Display Decision Boundaries` cell.

hF, hA = plt.subplots(figsize = FIG_SIZE_DEF)
hA = PlotDecisionBoundary(oModelPipe.predict, hA = hA)
hA = PlotScatterData(mXVal, vYVal, hA);

* <font color='green'>(**@**)</font> Evaluate the results using a score and a confusion matrix.
* <font color='green'>(**@**)</font> Update the _Feature Extractor_ to use the inverse of the sum of distances. 