[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io/)

# Machine Learning Methods

## Supervised Learning - Classification - Train a Linear Classifier (Gradient Descent)

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.1.000 | 17/09/2022 | Royi Avital | First version                                                      |
|         |            |             |                                                                    |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethods/2023_01/0005ClassifierLinearTrain.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.datasets import load_breast_cancer, make_circles, make_moons

# Misc
import datetime
import os
from platform import python_version
import random
import warnings
import yaml

# Typing
from typing import Tuple

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image, display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

In [None]:
# Configuration
# Notebook magic: selects the inline Matplotlib backend (valid only under IPython / Jupyter)
%matplotlib inline

# Installs an "ignore" filter for all warnings
warnings.filterwarnings("ignore")

# The same fixed seed is applied to NumPy's global RNG and to Python's `random` module
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# sns.set_theme() #>! Apply SeaBorn theme

# True when the string representation of the IPython shell object contains `google.colab`
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants
# Defaults of the plotting functions defined in the `Auxiliary Functions` cell

FIG_SIZE_DEF = (8, 8) #<! Default `figsize` passed to `plt.subplots()` by `PlotBinaryClassData()`
ELM_SIZE_DEF = 50 #<! Default scatter marker size (`s`) used by `PlotBinaryClassData()`
CLASS_COLOR = ('b', 'r') #<! Colors of the 1st / 2nd class, also the decision region colors in `PlotLinearClassifier()`


In [None]:
# Fixel Algorithms Packages


In [None]:
# Parameters

# Data Generation
numSamples = 500 #<! Passed as `n_samples` to `make_moons()`, also the length of the stacked constant column
noiseLevel = 0.1 #<! Passed as `noise` to `make_moons()`

# Data Visualization
# NOTE(review): The 4 values below are not referenced anywhere else in the visible notebook.
# The plotting functions default to `FIG_SIZE_DEF`, `ELM_SIZE_DEF` and `CLASS_COLOR` instead.
figSize     = (8, 8)
elmSize     = 50
classColor0 = 'b'
classColor1 = 'r'

numGridPts = 250 #<! Number of grid points per axis of the decision region grid

In [None]:
# Auxiliary Functions

def PlotBinaryClassData( mX: np.ndarray, vY: np.ndarray, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, elmSize: int = ELM_SIZE_DEF, classColor: Tuple[str, str] = CLASS_COLOR, axisTitle: str = None ) -> plt.Axes:
    """
    Draws a scatter plot of a 2D data set with binary labels, one color per class.

    Input:
        mX          - Samples matrix, the first 2 columns are used as the plot coordinates.
        vY          - Labels vector, must hold exactly 2 unique values.
        hA          - Axes to draw on. If `None`, a new figure is created using `figSize`.
        figSize     - Size of the figure created when `hA` is `None`.
        elmSize     - Marker size of the scatter plot.
        classColor  - Colors of the classes, ordered as the sorted unique labels.
        axisTitle   - Title of the axes. If `None`, no title is set.
    Output:
        hA          - The axes the data was drawn on.
    Raises:
        ValueError  - If the number of unique values in `vY` is not 2.
    """

    # Validate before creating a figure so invalid data does not leave an empty figure behind
    vC       = np.unique(vY)
    numClass = len(vC)
    if numClass != 2:
        raise ValueError(f'The input data is not binary, the number of classes is: {numClass}')

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    # `np.unique()` returns sorted values, hence `classColor[0]` matches the smaller label
    for ii, valClass in enumerate(vC):
        vIdx = vY == valClass
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = elmSize, color = classColor[ii], edgecolor = 'k', label = f'$C_\u007b {valClass} \u007d$')

    # Mark the axes through the origin
    hA.axvline(x = 0, color = 'k')
    hA.axhline(y = 0, color = 'k')
    hA.axis('equal')
    if axisTitle is not None:
        hA.set_title(axisTitle)
    hA.legend()

    return hA


def PlotLinearClassifier(mX: np.ndarray, vW: np.ndarray, vY: np.ndarray, mX1, mX2, hA):
    """
    Draws the data, the decision regions and the decision boundary of the linear
    classifier `sign(w^T x - b)` on the given axes.

    Input:
        mX          - Samples matrix with 2 features per sample (No constant column).
        vW          - Parameters vector: `vW[0]` is the bias `b`, `vW[1:]` are the weights `w`.
        vY          - Labels vector, compared to the output of `np.sign()` for the accuracy.
        mX1, mX2    - Coordinates matrices of the grid the decision regions are evaluated on.
        hA          - Axes to draw on.
    Remarks:
        The axes limits are fixed to [-2, 2] on both axes.
        The accuracy of the classifier on (`mX`, `vY`) is displayed in the title.
    """

    paramB = vW[0]
    vWf    = vW[1:]

    # Classify each grid point to draw the decision regions
    mGrid = np.column_stack([mX1.flatten(), mX2.flatten()])
    mZ    = ((mGrid @ vWf - paramB) > 0).reshape(mX1.shape)

    vHatY    = np.sign(mX @ vWf - paramB)
    accuracy = np.mean(vY == vHatY)

    axisTitle = r'$f_{{w},b} \left( {x} \right) = {sign} \left( {w}^{T} {x} - b \right)$' '\n' f'Accuracy = {accuracy:.2%}'

    PlotBinaryClassData(mX, vY, hA = hA, axisTitle = axisTitle)
    hA.grid(True)

    # Decision boundary: w_1 x_1 + w_2 x_2 - b = 0
    vX1 = np.array([-2, 2])
    if vWf[1] != 0:
        hA.plot(vX1, -(vWf[0] / vWf[1]) * vX1 + (paramB / vWf[1]), color = 'k', lw = 3)
    elif vWf[0] != 0:
        # Vertical boundary: the slope form divides by zero, which previously produced NaN and drew nothing
        hA.axvline(x = paramB / vWf[0], color = 'k', lw = 3)
    # For w = 0 there is no boundary to draw

    # The weights vector (Normal to the boundary), drawn from the origin
    hA.arrow(0, 0, vWf[0], vWf[1], color = 'orange', width = 0.05)
    hA.axvline(x = 0, color = 'k', lw = 1)
    hA.axhline(y = 0, color = 'k', lw = 1)
    # The levels split the boolean map: [-0.5, 0.5] holds `False`, [0.5, 1.5] holds `True`
    hA.contourf(mX1, mX2, mZ, colors = CLASS_COLOR, alpha = 0.2, levels = [-0.5, 0.5, 1.5], zorder = 0)

    hA.set_xlim([-2, 2])
    hA.set_ylim([-2, 2])
    hA.set_xlabel('$x_1$')
    hA.set_ylabel('$x_2$')

## Generate / Load Data

We'll use the classic _moons_ data set.  
By default it labels the data ${y}_{i} \in \left\{ 0, 1 \right\}$.  
We'll transform it into ${y}_{i} \in \left\{ -1, 1 \right\}$.


In [None]:
# Generate Data
# No `random_state` is passed; presumably the samples follow NumPy's global RNG seeded in the configuration cell (Verify against the scikit-learn documentation)
mX, vY = make_moons(n_samples = numSamples, noise = noiseLevel)

print(f'The features data shape: {mX.shape}')
print(f'The labels data shape: {vY.shape}')

In [None]:
# The labels of the data
# Lists the distinct label values as generated (Before any remapping)
print(f'The unique values of the labels: {np.unique(vY)}')

* <font color='red'>(**?**)</font> Do the labels fit our model? What should we do?

In [None]:
# Transforming the Labels into {-1, 1}
# In place masked assignment: entries equal to 0 become -1, all other entries are left untouched
vY[vY == 0] = -1

In [None]:
# The updated labels
# Sanity check of the remapping: no 0 label should remain
print(f'The unique values of the labels: {np.unique(vY)}')

### Plot Data

In [None]:
# Display the Data

# No axes are passed, hence `PlotBinaryClassData()` creates a new figure; the returned axes are kept in `hA`
hA = PlotBinaryClassData(mX, vY, axisTitle = 'Training Set')

## Linear Classifier Training

$$ {f}_{\left( \boldsymbol{w} \right)} \left( \boldsymbol{x} \right) = \mathrm{sign} \left( \boldsymbol{w}^{T} \boldsymbol{x} \right) $$



### Training Optimization Problem

In an ideal world, we'd like to optimize:

$$ \hat{ \boldsymbol{w} } = \arg \min_{\boldsymbol{w}} {\left\| \operatorname{sign} \left( X \boldsymbol{w} \right) - \boldsymbol{y} \right\|}_{2}^{2} $$

Where

$$
X = \begin{bmatrix} -1 & - & x_{1} & -\\
-1 & - & x_{2} & -\\
 & \vdots \\
-1 & - & x_{N} & -
\end{bmatrix} \in \mathbb{R}^{N \times 3}
$$

In [None]:
# Stack the constant column into `mX`
# A column of -1 is prepended so that, with `vW[0]` acting as the bias `b`, `mX @ vW` equals `w^T x - b`
# NOTE: The cell is not idempotent, each run prepends another constant column to `mX`
mX = np.column_stack((-np.ones(numSamples), mX))

* <font color='red'>(**?**)</font> What are the dimensions of `mX`?

In [None]:
# The updated dimensions
# One column more than before, as the constant column was stacked into `mX`
print(f'The features data shape: {mX.shape}')

Yet, since $\operatorname{sign} \left( \cdot \right)$ is neither smooth nor continuous, we need to approximate it.  
The classic candidate is the [Sigmoid Function](https://en.wikipedia.org/wiki/Sigmoid_function) (Member of the _S Shaped_ function family):

$$ \sigma \left( x \right) = 2 \frac{ \exp \left( x \right) }{ 1 + \exp \left( x \right) } - 1 = 2 \frac{ 1 }{ 1 + \exp \left( -x \right) } - 1 $$

See [`scipy.special.expit()`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.expit.html) for $\frac{ 1 }{ 1 + \exp \left( -x \right) }$.

<font color='brown'>(**#**)</font> In practice, such a function requires a numerically stable implementation. Use professionally made implementations if available.

The Sigmoid Function derivative is given by:

$$ \frac{\mathrm{d} \sigma \left( x \right) }{\mathrm{d} x} = 2 \frac{ \exp \left( x \right)}{\left( 1 + \exp \left( x \right) \right)^{2}} = 2 \left( \frac{ 1 }{ 1 + \exp \left( -x \right) } \right) \left( 1 - \frac{ 1 }{ 1 + \exp \left( -x \right) } \right) $$

<font color='brown'>(**#**)</font> For derivation of the last step, see https://math.stackexchange.com/questions/78575.

### The Loss Function

Then, using the Sigmoid approximation, the loss function becomes (with the mean taken over all $N$ data samples):

$$ \hat{ \boldsymbol{w} } = \arg \min_{\boldsymbol{w}} J \left( \boldsymbol{w} \right) = \arg \min_{\boldsymbol{w}} \frac{1}{4 N} {\left\| \sigma \left( \boldsymbol{X} \boldsymbol{w} \right) - \boldsymbol{y} \right\|}_{2}^{2} $$

The gradient becomes:

$$\nabla_{\boldsymbol{w}} J \left( \boldsymbol{w} \right) = \frac{1}{2N} \boldsymbol{X}^{T} \operatorname{Diag} \left( \sigma' \left( \boldsymbol{X} \boldsymbol{w} \right) \right) \left( \sigma \left( \boldsymbol{X} \boldsymbol{w}\right) - \boldsymbol{y} \right) $$

In [None]:
# Defining the Functions

def SigmoidFun( vX: np.ndarray ):
    """
    Evaluates the scaled and shifted logistic function `2 * expit(x) - 1` element wise.
    The output values are in the range [-1, 1], with 0 mapped to 0.
    """

    vLogistic = sp.special.expit(vX) #<! 1 / (1 + exp(-x))

    return (2 * vLogistic) - 1

def GradSigmoidFun(vX: np.ndarray):
    """
    Evaluates, element wise, the derivative of `2 * expit(x) - 1`,
    which is `2 * expit(x) * (1 - expit(x))`.
    """

    vS           = sp.special.expit(vX) #<! 1 / (1 + exp(-x))
    vComplementS = 1 - vS

    return 2 * vS * vComplementS

def LossFun(mX: np.ndarray, vW: np.ndarray, vY: np.ndarray):
    """
    Evaluates the smooth loss: the sum of squares of `SigmoidFun(mX @ vW) - vY`
    divided by 4 times the number of rows of `mX`.
    """

    vResidual = SigmoidFun(mX @ vW) - vY
    sumSqr    = np.sum(np.square(vResidual))

    return sumSqr / (4 * mX.shape[0])

def GradLossFun(mX: np.ndarray, vW: np.ndarray, vY: np.ndarray):
    """
    Evaluates the gradient of `LossFun()` with respect to `vW`:
    `mX.T @ Diag(GradSigmoidFun(mX @ vW)) @ (SigmoidFun(mX @ vW) - vY)`
    divided by 2 times the number of rows of `mX`.
    """

    vZ = mX @ vW #<! The linear response, shared by both terms

    mScaledXt = mX.T * GradSigmoidFun(vZ).T #<! Broadcasting scales the columns of `mX.T`, same as multiplying by the diagonal matrix
    vResidual = SigmoidFun(vZ) - vY

    return mScaledXt @ vResidual / (2 * mX.shape[0])

### The Gradient Descent

$$ \boldsymbol{w}_{k + 1} = \boldsymbol{w}_{k} - \mu \nabla_{\boldsymbol{w}} J \left( \boldsymbol{w}_{k} \right) $$



In [None]:
# Gradient Descent
# Runs `K - 1` gradient descent steps on `LossFun()` and records, per iteration
# (Index 0 is the initial point), the weights, the classification error and the loss.

# Parameters
K   = 1000 #<! Num Steps
µ   = 0.10 #<! Step Size
vW  = np.array([0.0, -1.0, 2.0]) #<! Initial w

# Float arrays (NaN marks a not yet computed entry) instead of object arrays filled with `None`
mW = np.zeros(shape = (vW.shape[0], K)) #<! Model Parameters (Weights)
vE = np.full(shape = K, fill_value = np.nan) #<! Errors
vL = np.full(shape = K, fill_value = np.nan) #<! Loss

for kk in range(K):
    if kk > 0:
        # No step for the first entry, it records the initial point
        vW -= µ * GradLossFun(mX, vW, vY)

    mW[:, kk]   = vW

    vHatY = np.sign(mX @ vW) #<! Apply the classifier

    vE[kk]      = np.mean(vHatY != vY) #<! Mean Error
    vL[kk]      = LossFun(mX, vW, vY) #<! Loss Function

In [None]:
# Plotting Function

# Grid of the data support
# Uniform grid over [-2, 2] x [-2, 2], the same range `PlotLinearClassifier()` sets as the axes limits
vV       = np.linspace(-2, 2, numGridPts)
mX1, mX2 = np.meshgrid(vV, vV)

def PlotLinClassTrain(itrIdx, mX, mW, vY, K, µ, vE, vL, mX1, mX2):
    """
    Displays the state of the training at iteration `itrIdx` on a new figure with 2 axes:
    the classifier (Left) and the recorded curves up to, and including, `itrIdx` (Right).

    Input:
        itrIdx      - Index of the iteration to display.
        mX          - Samples matrix with 2 features per sample (No constant column).
        mW          - Parameters per iteration, column `itrIdx` is passed to `PlotLinearClassifier()`.
        vY          - Labels vector.
        K           - Number of iterations, sets the limits of the x axis of the curves.
        µ           - Step size (Not used by the function, kept for compatibility of the signature).
        vE          - Values per iteration, drawn in black and labeled `J(w)`.
        vL          - Values per iteration, drawn in magenta and labeled `~J(w)`.
        mX1, mX2    - Coordinates matrices of the grid passed to `PlotLinearClassifier()`.
    """

    _, (hA1, hA2) = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))

    PlotLinearClassifier(mX, mW[:, itrIdx], vY, mX1, mX2, hA1)

    # Include the value of iteration `itrIdx` itself so the curves match the displayed classifier
    numItr = itrIdx + 1

    hA2.plot(vE[:numItr], color = 'k', lw = 2, label = r'$J \left( w \right)$')
    hA2.plot(vL[:numItr], color = 'm', lw = 2, label = r'$\tilde{J} \left( w \right)$')
    hA2.set_title('Objective Function')
    hA2.set_xlabel('Iteration Index')
    hA2.set_ylabel('Value')
    hA2.set_xlim((0, K - 1))
    hA2.set_ylim((0, 1))
    hA2.grid()
    hA2.legend()

    plt.show()

In [None]:
# Display the Optimization Path
# Interactive view of the training: the slider selects the iteration to display

def hPlotLinClassTrain(itrIdx):
    # The plots use the 2 features only, hence the constant column (Index 0) of `mX` is dropped
    return PlotLinClassTrain(itrIdx, mX[:, 1:], mW, vY, K, µ, vE, vL, mX1, mX2)

kSlider = IntSlider(min = 0, max = K - 1, step = 1, value = 0, layout = Layout(width = '30%'))
interact(hPlotLinClassTrain, itrIdx = kSlider)

# plt.show()


<font color='blue'>(**!**)</font> Optimize the parameters $K$ and $\mu$ to achieve an accuracy of `~85%` with the fewest steps.