[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# Clustering - Density Based Spatial Clustering of Applications with Noise (DBSCAN)

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        | Content / Changes                                                  |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.1.000 | 01/10/2022 | Royi Avital | First version                                                      |
|         |            |             |                                                                    |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/IntroductionMachineLearningSystemEngineers/ClusteringDBSCAN.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures

# Misc
import datetime
import os
from platform import python_version
import random
import warnings
import yaml

# Typing
from typing import Tuple

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image, display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

In [None]:
# Configuration
#%matplotlib inline

# Suppress every warning for the rest of the session (keeps the notebook output clean).
warnings.filterwarnings("ignore")

# Seed both NumPy's global RNG and Python's `random` module with the same value
# so that randomly generated data is reproducible between runs.
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# sns.set_theme() #>! Apply SeaBorn theme
sns.set_palette("tab10") #<! Use the "tab10" palette as the default color palette

# True when the string representation of the active IPython shell mentions 'google.colab'.
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants
# Defaults shared by the plotting helpers defined in this notebook.

FIG_SIZE_DEF    = (8, 8) #<! Default `figsize` passed to `plt.subplots()`
ELM_SIZE_DEF    = 50     #<! Size (`s`) of the scatter elements
CLASS_COLOR     = ('b', 'r') #<! NOTE(review): not referenced in the visible part of the notebook
EDGE_COLOR      = 'k'    #<! Edge color of the scatter elements
MARKER_SIZE_DEF = 10     #<! Default value of the `markerSize` parameters
LINE_WIDTH_DEF  = 2      #<! Default value of the `lineWidth` parameters


In [None]:
# Fixel Algorithms Packages


In [None]:
# Parameters

# Data Generation
# Number of samples per generated group; the generation cell uses the first two
# entries for the two `make_moons()` sets and the third for the uniform samples.
vNumSamples = [250, 250, 50]

# Model


# Data Visualization
# NOTE(review): `gridSclae` looks like a typo of `gridScale`; neither value is used
# in the visible part of the notebook - confirm before renaming / removing.
gridSclae = 5
numGridPts = 250

In [None]:
# Auxiliary Functions

def GenRotMatrix( θ: float ) -> np.ndarray:
    """
    Generates the 2x2 rotation matrix of a planar rotation.

    Input:
        θ  - The rotation angle in degrees.
    Output:
        A (2, 2) array `[[cos, -sin], [sin, cos]]` evaluated at the given angle.
    """

    angleRad = np.radians(θ) #<! The trigonometric functions work in radians
    c = np.cos(angleRad)
    s = np.sin(angleRad)

    return np.array([[c, -s], [s, c]])

def PlotScatterData(mX: np.ndarray, vL: np.ndarray, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = MARKER_SIZE_DEF, lineWidth: int = LINE_WIDTH_DEF, axisTitle: str = None):
    """
    Draws a 2D scatter plot of the samples, one scatter group per unique label.

    Input:
        mX          - The samples; columns 0 and 1 are used as the plot coordinates.
        vL          - The label of each sample (Used as a mask over the rows of `mX`).
        hA          - The axes to draw on. A new figure is created when `None`.
        figSize     - The size of the figure created when `hA` is `None`.
        markerSize  - Currently unused (Kept for interface compatibility).
        lineWidth   - Currently unused (Kept for interface compatibility).
        axisTitle   - The title of the axes. No title is set when `None`.
    Output:
        None, the data is drawn on the axes in place.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    # One scatter call per unique label, so each group gets its own color and legend entry.
    # The legend shows the actual label value (Not its running index), which
    # matters when the labels are not the contiguous range 0, 1, 2, ...
    for labelVal in np.unique(vL):
        vIdx = vL == labelVal
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = ELM_SIZE_DEF, edgecolor = EDGE_COLOR, label = labelVal)

    hA.set_xlabel('${{x}}_{{1}}$')
    hA.set_ylabel('${{x}}_{{2}}$')
    if axisTitle is not None:
        hA.set_title(axisTitle)
    hA.grid()
    hA.legend()


def PlotDbscan( mX: np.ndarray, rVal:float, minSamples: int, metricMethod: str, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, markerSize: int = MARKER_SIZE_DEF ):
    """
    Clusters the samples using DBSCAN and draws the result as a 2D scatter plot.

    Input:
        mX              - The samples; columns 0 and 1 are used as the plot coordinates.
        rVal            - The neighborhood radius (Passed as `eps` to `DBSCAN`).
        minSamples      - The minimum number of samples (Passed as `min_samples` to `DBSCAN`).
        metricMethod    - The distance metric (Passed as `metric` to `DBSCAN`).
        hA              - The axes to draw on. A new figure is created when `None`.
        figSize         - The size of the figure created when `hA` is `None`.
        markerSize      - Currently unused (Kept for interface compatibility).
    Output:
        None, the clusters are drawn on the axes in place.
    """

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    vL = DBSCAN(eps = rVal, min_samples = minSamples, metric = metricMethod).fit_predict(mX)
    numClusters = vL.max() + 1 #<! Cluster labels start at 0, noise is -1 (So all noise -> 0 clusters)

    vIdxN = vL == -1 #<! Noise

    # One scatter call per cluster, so each cluster gets its own color and legend entry
    for ii in range(numClusters):
        vIdx = vL == ii
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = ELM_SIZE_DEF, edgecolor = EDGE_COLOR, label = f'{ii}')

    # Noise samples are emphasized: Twice the size and a red edge
    hA.scatter(mX[vIdxN, 0], mX[vIdxN, 1], s = 2 * ELM_SIZE_DEF, edgecolor = 'r', label = 'Noise')

    hA.set_xlabel('${{x}}_{{1}}$')
    hA.set_ylabel('${{x}}_{{2}}$')
    hA.set_title(f'DBSCAN Clustering, Number of Clusters: {numClusters}, Number of Noise Labels: {np.sum(vIdxN)}')
    hA.legend()


## Generate Data


In [None]:
# Generate Data
# Builds 2 pairs of noisy moons and a set of uniformly spread samples.
# NOTE: The calls draw from the random number generator, so their order affects the generated data.

mX0, _ = make_moons(vNumSamples[0], noise = .05) #<! First pair of moons
mX1, _ = make_moons(vNumSamples[1], noise = .05) #<! Second pair of moons
mX1    = mX1 * [1, -1] + [0, 3] #<! Flip the 2nd coordinate and shift it by 3
mX2    = np.random.rand(vNumSamples[2], 2) * [4, 5] - [1.75, 2/3] #<! Uniform samples on [-1.75, 2.25) x [-2/3, 13/3)
mX     = np.r_[mX0, mX1, mX2] #<! Stack all samples (Rows)
vL     = np.repeat(range(len(vNumSamples)), vNumSamples) #<! Group index (0, 1, 2) of each sample


### Plot Data

In [None]:
# Scatter plot of the generated samples, colored by the group each sample was generated from
PlotScatterData(mX, vL)

## Cluster Data by DBSCAN


In [None]:
# There are two parameters to the algorithm, `min_samples` and `eps`, which formally define what we mean by "dense".
# A higher `min_samples` or a lower `eps` means a higher density is required in order to form a cluster.
# The widgets below re-run the clustering and the plot of `mX` on each change of the parameters.

# Wrapper which fixes the data and the figure size, leaving only the DBSCAN parameters free
hPlotDbscan = lambda rVal, minSamples, metricMethod: PlotDbscan(mX, rVal, minSamples, metricMethod, figSize = (12, 12))
# `rVal` -> `eps`: The neighborhood radius, 0.05 to 0.5 in steps of 0.05
rSlider = FloatSlider(min = 0.05, max = .5, step = 0.05, value = 0.05, layout = Layout(width = '30%'))
# `minSamples` -> `min_samples`: 1 to 10
zSlider = IntSlider(min = 1, max = 10, step = 1, value = 3, layout = Layout(width = '30%'))
# `metricMethod` -> `metric`: The distance metric, as (Displayed name, Value passed to DBSCAN) pairs
metricMethodDropdown = Dropdown(description = 'Metric Method', options = [('Cityblock', 'cityblock'), ('Cosine', 'cosine'), ('Euclidean', 'euclidean')], value = 'euclidean')
interact(hPlotDbscan, rVal = rSlider, minSamples = zSlider, metricMethod = metricMethodDropdown)