[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io/)

# Dimensionality Reduction - Principal Component Analysis (PCA)

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.1.000 | 01/10/2022 | Royi Avital | First version                                                      |
|         |            |             |                                                                    |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/IntroductionMachineLearningSystemEngineers/DimensionalityReductionPCA.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.datasets import fetch_olivetti_faces, fetch_openml, load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures

# Misc
import datetime
import math
import os
from platform import python_version
import random
import warnings
import yaml

# Typing
from typing import Tuple

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image, display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

In [None]:
# Configuration
# IPython magic: selects the `inline` Matplotlib backend for the notebook
%matplotlib inline

# Suppress all Python warnings raised from this point on
warnings.filterwarnings("ignore")

# Same fixed seed for NumPy's and Python's global random generators
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# sns.set_theme() #>! Apply SeaBorn theme
sns.set_palette("tab10")

# True when the string representation of the active IPython shell contains `google.colab`
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# On Google Colab, install `ppscore` from its GitHub repository before importing it
if runInGoogleColab:
    !pip install git+https://github.com/8080labs/ppscore.git

import ppscore as pps #<! See https://github.com/8080labs/ppscore -> pip install git+https://github.com/8080labs/ppscore.git

In [None]:
# Constants
# Defaults shared by the plotting functions of this notebook

FIG_SIZE_DEF    = (8, 8) #<! Default `figsize` of new figures
ELM_SIZE_DEF    = 50 #<! Marker size (`s`) of the scatter plot
CLASS_COLOR     = ('b', 'r') #<! NOTE(review): Not referenced in the visible cells
EDGE_COLOR      = 'k' #<! Marker edge color of the scatter plot
MARKER_SIZE_DEF = 10 #<! Default of the `markerSize` parameter
LINE_WIDTH_DEF  = 2 #<! Default of the `lineWidth` parameter


In [None]:
# Fixel Algorithms Packages


In [None]:
# Parameters

# Data Generation
# NOTE(review): The 3 variables below are not referenced in the visible cells (Probably synthetic clusters settings - confirm)
vNumSamples = [50, 150, 500, 100]
mMu         = [[0, 0], [2, 2], [-2.5, -2.5], [-4, 4]]
vClusterStd = [0.1, 1, 2, 1.5]

# Model

numCrossValPps = 5 #<! Passed as `cross_validation` to the PPS calculation


# Data Visualization
# NOTE(review): `gridSclae` looks like a misspelling of `gridScale`; Kept as is since other cells may reference it
gridSclae = 5
numGridPts = 250

In [None]:
# Auxiliary Functions

def OrdinalNum(n):
    """
    Formats an integer as an English ordinal string: 1 -> '1st', 2 -> '2nd', 3 -> '3rd', 11 -> '11th'.
    The suffix is read from the interleaved string 'tsnrhtdd' with a stride of 4:
    offset 0 -> 'th', 1 -> 'st', 2 -> 'nd', 3 -> 'rd'.
    """
    isTeen    = (math.floor(n / 10) % 10) == 1 #<! Tens digit is 1 (11, 12, 13, 111, ...) -> Always 'th'
    lastDigit = n % 10
    useSuffix = (not isTeen) and (lastDigit < 4) #<! Only 0..3 may pick something other than offset 0
    startIdx  = (int(useSuffix) * n) % 10 #<! 0 -> 'th', otherwise the last digit

    return "%d%s" % (n, "tsnrhtdd"[startIdx::4])

def GenRotMatrix( θ: float ) -> np.ndarray:
    """
    Builds the 2x2 rotation matrix [[cos, -sin], [sin, cos]] of the angle `θ`.

    Input:
      - θ : The rotation angle in degrees.
    Output:
      - The rotation matrix as a NumPy array.
    """
    angleRad = np.radians(θ) #<! Degrees -> Radians
    c = np.cos(angleRad)
    s = np.sin(angleRad)

    return np.array([[c, -s],
                     [s,  c]])

def PlotScatterData(mX: np.ndarray, vL: np.ndarray, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = MARKER_SIZE_DEF, lineWidth: int = LINE_WIDTH_DEF, axisTitle: str = None):
    """
    Draws a 2D scatter plot of the samples with one scatter series (legend entry) per unique label.

    Input:
      - mX         : Data matrix, one sample per row. Only the first 2 columns are drawn.
      - vL         : Label per row of `mX`.
      - hA         : Axes to draw on. When `None`, a new figure of size `figSize` is created.
      - figSize    : Size of the figure created when `hA` is `None`.
      - markerSize : Currently unused (Markers are drawn with `ELM_SIZE_DEF`); Kept for interface compatibility.
      - lineWidth  : Currently unused; Kept for interface compatibility.
      - axisTitle  : Optional title of the axes.
    Output:
      - None. The plot is drawn on `hA`.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    for labelVal in np.unique(vL):
        vIdx = vL == labelVal
        # The legend shows the actual label value (Not its running index among the unique labels)
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = ELM_SIZE_DEF, edgecolor = EDGE_COLOR, label = labelVal)

    hA.set_xlabel('${{x}}_{{1}}$')
    hA.set_ylabel('${{x}}_{{2}}$')
    if axisTitle is not None:
        hA.set_title(axisTitle)
    hA.grid()
    hA.legend()


def PlotPcaReconstruction( mX: np.ndarray, dataIdx: int, mU: np.ndarray, vMean: np.ndarray, numComp:int, vSize: np.ndarray, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = MARKER_SIZE_DEF ):
    """
    Shows a sample as an image, its reconstruction from the first `numComp` components
    and the coefficients of the encoding.

    Input:
      - mX         : Data matrix, one flattened image per row.
      - dataIdx    : Row index of the sample to display.
      - mU         : Components matrix, one component per row (`mU[:numComp]` is used).
      - vMean      : Mean vector, subtracted before the encoding and added back after the decoding.
      - numComp    : Number of leading components to use. For 0 the reconstruction is `vMean`.
      - vSize      : Image shape passed to `np.reshape()`.
      - hA         : Indexable collection of (at least) 3 axes. When `None`, a new 1x3 figure is created.
      - figSize    : Size of the figure created when `hA` is `None`.
      - markerSize : Currently unused; Kept for interface compatibility.
    Output:
      - None. The plots are drawn on `hA[0]`, `hA[1]` and `hA[2]`.
    """

    if hA is None:
        # `hA` is indexed below, hence no `get_figure()` call on it (It is a collection of axes, not a single one)
        _, hA = plt.subplots(1, 3, figsize = figSize)

    vX = mX[dataIdx, :]

    if numComp == 0:
        vZ    = [0] #<! Single zero so the stem plot has something to draw
        vHatX = vMean
    else:
        mUd   = mU[:numComp]
        vZ    = mUd @ (vX - vMean)  #<! Encode
        vHatX = (mUd.T @ vZ) + vMean #<! Decode

    # Images are clipped to [0, 1] for display
    mI   = np.reshape(vX,    vSize).clip(0, 1)
    mRec = np.reshape(vHatX, vSize).clip(0, 1)

    hA[0].imshow(mI, cmap = 'gray')
    hA[0].set_title('Original Image')

    hA[1].imshow(mRec, cmap = 'gray')
    hA[1].set_title(f'Reconstructed Image, # Components: {numComp}')

    hA[2].stem(vZ, markerfmt = 'b.', label = 'Coefficients')
    hA[2].set_xlabel('Principal Component')
    hA[2].set_ylabel('Coefficient Value')


## Generate Data


Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.

![](https://i.imgur.com/4LE2biE.png)

In [None]:
# The same data set is loaded twice:
# As NumPy arrays (Used for the PCA) and as Pandas objects (Used for the EDA)
mX, vY      = load_breast_cancer(return_X_y = True)
dfX, dsY    = load_breast_cancer(return_X_y = True, as_frame = True)


In [None]:
# Add Label Data
# The labels are added to the features data frame as a categorical column
dfX['Label'] = pd.Categorical(dsY)


### Exploratory Data Analysis (EDA)

#### Correlation Matrix

In [None]:
# Correlation Matrix
# Work on a copy so the categorical `Label` column of `dfX` is left as is
dfXX = dfX.copy()
dfXX['Label'] = pd.to_numeric(dfXX['Label']) #<! Numeric labels so they take part in the correlation
mC = dfXX.corr(method = 'pearson')

# Heat map of the absolute value of the correlation
hF, hA = plt.subplots(figsize = (20, 20))
sns.heatmap(data = mC.abs(), annot = True, cmap = 'coolwarm', ax = hA)

#### Predictive Power Score (PPS)

Correlation makes sense for linear regression models.  
For non-linear relationships or for classification tasks, PPS is more informative.

Pay attention: it is not symmetric, as it weighs the ability of one feature to contribute to the estimation of another, which isn't a symmetric operation.  

> Think about estimating the zip code from the address vs. estimating the address from the zip code.

In [None]:
# Feature PPS - Which features are important?

# Pay attention, cross validation is K-Fold -> Don't over split the data
# The long format result (`x`, `y`, `ppscore`) is pivoted into a matrix: Rows are the targets (`y`), columns are the features (`x`)
# We should set `Label` as a categorical variable
mPPS = (
    pps.matrix(dfX, cross_validation = numCrossValPps, random_seed = seedNum)
    [['x', 'y', 'ppscore']]
    .pivot(index = 'y', columns = 'x', values = 'ppscore')
)

# Visualization of PPS
hF, hA = plt.subplots(figsize = (20, 20))
sns.heatmap(
    mPPS,
    vmin  = 0,
    vmax  = 1,
    cmap  = plt.get_cmap('coolwarm'),
    annot = True,
    fmt   = '.2f',
    cbar  = False,
    ax    = hA,
)

plt.setp(hA.get_xticklabels(), ha = "center", rotation = 45)
plt.setp(hA.get_yticklabels(), rotation = 'horizontal')
hA.set_title('Predictive Power Score (PPS)')

## Pre Processing the Data

In [None]:
# Normalization: Zero mean and unit standard deviation per feature (Column), done in place
# The original note states SciKit Learn's PCA removes the mean on its own, which leaves the scaling as the step that matters here
# NOTE(review): A constant feature (Zero standard deviation) would cause a division by zero - confirm none exists in the data
mX -= mX.mean(0)
mX /= mX.std (0)

## Applying Dimensionality Reduction - PCA 

In [None]:
# Applying the PCA model
# Fit on the normalized data and project it onto the first 2 components in a single call
mZ = PCA(n_components = 2).fit_transform(mX)

### Plot the 2D Result

In [None]:
# Scatter of the 2D PCA projection, one legend entry per class label
PlotScatterData(mZ, vY)

## MNIST Image Data Set




In [None]:
# MNIST 
mX, vY  = fetch_openml('mnist_784', version = 1, return_X_y = True, as_frame = False)
vSize   = (28, 28)

# Eigen Faces
# NOTE(review): Olivetti faces are presumably already scaled to [0, 1]; Skip the division by 255 below if this option is used - confirm
# mX, vY = fetch_olivetti_faces(return_X_y = True)
# vSize = (64, 64)

# Scale the pixels from [0, 255] to [0, 1]
# Out of place division: The in place `mX /= 255` raises a casting error when the loader returns an integer array
mX      = mX / 255
numSamples, numDims = mX.shape


mX.shape, vY.shape

In [None]:
# Display a few randomly chosen samples as images
numImgDis = 6

hF, hA = plt.subplots(1, numImgDis, figsize = (15, 3))
for kk, hAx in enumerate(hA):
    idx = np.random.choice(numSamples) #<! Random sample index
    mI  = np.reshape(mX[idx, :], vSize) #<! Row vector -> Image

    hAx.imshow(mI.clip(0, 1), cmap = 'gray')
    hAx.set_title(f'Image Index = {idx}')

plt.show()

### PCA Model

In [None]:
# Fit a PCA model which keeps all `numDims` components (No dimensionality reduction at this stage)
oPCA = PCA(n_components = numDims).fit(mX) #<! Basically calculates the model vectors

#### Mean Image

In [None]:
# The mean of the fitted model, displayed as an image
vMean = oPCA.mean_
mI    = np.reshape(vMean, vSize) #<! Vector -> Image

plt.figure(figsize = (2, 2))
plt.imshow(mI, cmap = 'gray')
plt.title('Mean')
plt.show()

#### Spectrum

In [None]:
# Spectrum
# Square root of the 200 leading eigenvalues of the data covariance matrix

vλ = oPCA.explained_variance_ #<! The eigenvalues themselves (Not the normalized `explained_variance_ratio_`)

plt.figure(figsize = (18, 6))
plt.stem(np.sqrt(vλ[:200]), markerfmt = 'b.', label = r'$\sqrt{\lambda_i}$') #<! Raw string: No invalid `\l` escape sequence
plt.title('Eigenvalues')
plt.xlabel('$i$')
plt.legend()
plt.show()

In [None]:
# Energy Ratio
# The fraction of the total variance explained by each component

vλ = oPCA.explained_variance_ratio_

plt.figure(figsize = (18, 6))
plt.stem(vλ, markerfmt = 'b.', label = '$Ratio$')
plt.title('Variance Ratio')
plt.xlabel('Component Index') #<! Plain text label (Typo fixed, the space is kept)
plt.legend()
plt.show()

#### Plot the Basis / Components

In [None]:
# Display the first 5 and the last 5 principal components as images
mU = oPCA.components_ # mU.shape = (n_components, n_features)

fig, _ = plt.subplots(2, 5, figsize = (12, 6))
vIdx   = [*range(5), *range(numDims - 5, numDims)] #<! Indices of the leading and trailing components
for kk, (idx, ax) in enumerate(zip(vIdx, fig.axes)):
    mI = np.reshape(mU[idx], vSize) #<! Component vector -> Image
    ax.imshow(mI)
    ax.set_title(f'{OrdinalNum(idx + 1)} Principal Component')

plt.tight_layout()
plt.show()

#### Plot Reconstruction

* Encode:
$$\boldsymbol{z}_{i}=\boldsymbol{U}_{d}^{T}\left(\boldsymbol{x}_{i}-\boldsymbol{\mu}_{x}\right)$$  
* Decode:
$$\hat{\boldsymbol{x}}_{i}=\boldsymbol{U}_{d}\boldsymbol{z}_{i}+\boldsymbol{\mu}_{x}$$

In [None]:
# Interactive reconstruction: Choose a sample and the number of components used to reconstruct it
hPlotPcaReconstruction = lambda dataIdx, numComponents: PlotPcaReconstruction(mX, dataIdx, mU, vMean, numComponents, vSize, figSize = (14, 4))
# The slider range is inclusive -> The last valid row index is `numSamples - 1`
dataIdxSlider = IntSlider(min = 0, max = numSamples - 1, step = 1, value = 0, layout = Layout(width = '30%'))
# `numDims` is a valid value here: It uses all the components
numComponentsSlider = IntSlider(min = 0, max = numDims, step = 1, value = 0, layout = Layout(width = '30%'))

interact(hPlotPcaReconstruction, dataIdx = dataIdxSlider, numComponents = numComponentsSlider)