[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# Classifier - The Decision Tree Classifier

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.1.000 | 20/09/2022 | Royi Avital | First version                                                      |
|         |            |             |                                                                    |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/IntroductionMachineLearningSystemEngineers/ClassifierDecisionTree.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.datasets import load_breast_cancer, make_circles
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

from scipy.spatial.distance import cdist

# Misc
import datetime
import os
from platform import python_version
import random
import warnings
import yaml

# Typing
from typing import Tuple

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image, display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

In [None]:
# Configuration
# IPython line magic (valid only inside a Jupyter / IPython session): selects the inline Matplotlib backend
%matplotlib inline

# Silences every warning raised from this point on (no category / module filter is given)
warnings.filterwarnings("ignore")

# A single seed shared by NumPy's global RNG and Python's `random` module
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)

# sns.set_theme() #>! Apply SeaBorn theme

# True when the string representation of the active IPython shell contains 'google.colab'
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

FIG_SIZE_DEF = (8, 8) #<! Default `figsize` of a new figure (see `PlotBinaryClassData()`)
ELM_SIZE_DEF = 50 #<! Default marker size (the `s` argument of `scatter()`)
CLASS_COLOR = ('b', 'r') #<! Colors of the 1st and 2nd class, in that order
EDGE_COLOR  = 'k' #<! Edge color of the scatter markers


In [None]:
# Fixel Algorithms Packages


In [None]:
# Parameters

# Data Generation
numSamples = 500 #<! Number of samples of the circles data set (`n_samples` of `make_circles()`)
noiseLevel = 0.01 #<! The `noise` argument of `make_circles()`

# Number of rows of each of the two Gaussian blobs of the Train / Test section
numSamples0 = 500
numSamples1 = 500

testSize = 0.5 #<! The `test_size` argument of `train_test_split()`

maxSplits = 75 #<! Trees are trained with `max_leaf_nodes` in {2, 3, ..., maxSplits + 1}

# Data Visualization
numGridPts = 250 #<! Number of grid points per axis used to draw the decision regions

In [None]:
# Auxiliary Functions

def PlotBinaryClassData( mX: np.ndarray, vY: np.ndarray, hA:plt.Axes = None, figSize: Tuple[int, int] = FIG_SIZE_DEF, elmSize: int = ELM_SIZE_DEF, classColor: Tuple[str, str] = CLASS_COLOR, axisTitle: str = None ) -> plt.Axes:
    """
    Draws a scatter plot of 2D samples which belong to exactly 2 classes.

    Args:
        mX: The samples. Columns 0 and 1 are used as the horizontal and vertical coordinates.
        vY: The label of each sample. Must hold exactly 2 unique values.
        hA: The axes to draw on. When `None`, a new figure of size `figSize` is created.
        figSize: The size of the figure created when `hA` is `None`.
        elmSize: The marker size (the `s` argument of `scatter()`).
        classColor: The colors of the 1st and 2nd class (by the sorted order of the unique labels).
        axisTitle: The title of the axes. No title is set when `None`.

    Returns:
        The axes the data was drawn on.

    Raises:
        ValueError: If the number of unique values in `vY` is not 2.
    """

    # Validate before touching Matplotlib so invalid input does not leave an empty figure behind
    vC = np.unique(vY)
    numClass = len(vC)
    if numClass != 2:
        raise ValueError(f'The input data is not binary, the number of classes is: {numClass}')

    if hA is None:
        _, hA = plt.subplots(figsize = figSize)

    # One scatter call per class so each class gets its own color and legend entry
    for ii, classLabel in enumerate(vC):
        vIdx = vY == classLabel
        hA.scatter(mX[vIdx, 0], mX[vIdx, 1], s = elmSize, color = classColor[ii], edgecolor = EDGE_COLOR, label = f'$C_\u007b {classLabel} \u007d$')

    # Draw the coordinate axes through the origin
    hA.axvline(x = 0, color = 'k')
    hA.axhline(y = 0, color = 'k')
    hA.axis('equal')
    if axisTitle is not None:
        hA.set_title(axisTitle)
    hA.legend()
    
    return hA

## Generate Data

In [None]:
# Generate the circles data set (samples and their labels)
mX, vY = make_circles(n_samples = numSamples, noise = noiseLevel)

# Overwrite the first 3 samples with points near the origin and label them as class 0
mX[:3] = np.array([[ 0.0,  0.1],
                   [-0.1, -0.1],
                   [ 0.1, -0.1]])
vY[:3] = 0

# Boolean mask of the samples of each class (used by the plotting cells)
vIdx0, vIdx1 = (vY == 0), (vY == 1)

(mX.shape, vY.shape) #<! Cell output: dimensions of the data

### Plot Data

In [None]:
# Scatter plot of the generated samples, colored by class (default figure size, no title)
hA = PlotBinaryClassData(mX, vY)

## Train a Decision Tree Classifier

In [None]:
# Uniform grid over the bounding box of the samples (`numGridPts` points per axis)
v0 = np.linspace(np.min(mX[:, 0]), np.max(mX[:, 0]), num = numGridPts)
v1 = np.linspace(np.min(mX[:, 1]), np.max(mX[:, 1]), num = numGridPts)
XX0, XX1 = np.meshgrid(v0, v1)
# Each row of `XX` is a single grid point: (x0, x1)
XX = np.column_stack((XX0.reshape(-1), XX1.reshape(-1)))

def PlotTree( K: int ) -> None:
    """
    Trains a decision tree with at most `K` leaves and displays its decision regions next to the tree itself.

    Reads the notebook level variables `mX`, `vY`, `vIdx0`, `vIdx1`, `XX0`, `XX1` and `XX`.

    Args:
        K: The value passed as `max_leaf_nodes` to `DecisionTreeClassifier`.
    """
    # Train a Decision Tree classifier
    oTreeClassifier = DecisionTreeClassifier(criterion = 'entropy', max_leaf_nodes = K, random_state = 0)
    oTreeClassifier.fit(mX, vY)

    # Predict the class of each grid point
    Z = oTreeClassifier.predict(XX)
    Z = Z.reshape(XX0.shape)

    # Plot classification
    _, hA = plt.subplots(1, 2, figsize = (16, 8))
    # The labels are {0, 1}, hence the regions are separated at 0.5.
    # A separating level of 0 coincides with the label of class 0 and pushes the drawn boundary onto the class 0 grid points.
    hA[0].contourf(XX0, XX1, Z, colors = CLASS_COLOR, alpha = 0.3, levels = [-0.5, 0.5, 1.5])
    hA[0].scatter(mX[vIdx0, 0], mX[vIdx0, 1], s = ELM_SIZE_DEF, color = CLASS_COLOR[0], edgecolor = EDGE_COLOR)
    hA[0].scatter(mX[vIdx1, 0], mX[vIdx1, 1], s = ELM_SIZE_DEF, color = CLASS_COLOR[1], edgecolor = EDGE_COLOR)

    # Draw the structure of the trained tree
    plot_tree(oTreeClassifier, filled = True, ax = hA[1], rounded = True)
    hA[1].set_title(f'Max Leaf Nodes = {K}')
    

In [None]:
# Display the Geometry of the Classifier

# The slider value is passed to `PlotTree()` as `K`, which is used there as `max_leaf_nodes`
kSlider = IntSlider(min = 2, max = 20, step = 1, value = 2, layout = Layout(width = '30%'))
# Calls `PlotTree()` with the slider value as `K`
interact(PlotTree, K = kSlider)
# plt.show()

## Train vs. Test (Overfit) as a Function of Degrees of Freedom

### Generate / Load Data

In [None]:
# Two Gaussian blobs: standard normal samples shifted by +1 (labeled 1) and by -1 (labeled 0)
# NOTE(review): `numSamples0` sets the size of the blob labeled 1 and `numSamples1` the blob labeled 0 - confirm this is intended

mX1 = 1 + np.random.randn(numSamples0, 2)
mX2 = -1 + np.random.randn(numSamples1, 2)
vY1 = np.full(numSamples0, 1.0)
vY2 = np.full(numSamples1, 0.0)

# Stack the blobs (samples along the rows) and their labels in the same order
mX = np.concatenate((mX1, mX2), axis = 0)
vY = np.concatenate((vY1, vY2))

# Boolean mask of the samples of each class (used by the plotting cells)
vIdx0, vIdx1 = (vY == 0), (vY == 1)



In [None]:
# Plot Data
# All samples are drawn here, before the data is split into the Train and Test sets
hA = PlotBinaryClassData(mX, vY, axisTitle = 'Train and Test Set')

In [None]:
# Split data into Train and Test Sets
# `testSize` is passed as `test_size`. No `random_state` is passed to the splitter.

mTrainX, mTestX, vTrainY, vTestY = train_test_split(mX, vY, test_size = testSize)

### Train Decision Trees with Various Numbers of Splits

In [None]:
# One tree per split budget, trained on the Train set only.
# A tree limited to `numSplits` splits is requested as `max_leaf_nodes = numSplits + 1`.
lTrees = [
    DecisionTreeClassifier(criterion = 'entropy', max_leaf_nodes = numSplits + 1, random_state = 0).fit(mTrainX, vTrainY)
    for numSplits in range(1, maxSplits + 1)
]

In [None]:
# Score of each tree on the Train and Test sets (the default score is the mean accuracy)
vTrainRes = np.fromiter((oTree.score(mTrainX, vTrainY) for oTree in lTrees), dtype = float)
vTestRes  = np.fromiter((oTree.score(mTestX, vTestY) for oTree in lTrees), dtype = float)

# Uniform grid over the bounding box of all the samples (`numGridPts` points per axis)
v0 = np.linspace(np.min(mX[:, 0]), np.max(mX[:, 0]), num = numGridPts)
v1 = np.linspace(np.min(mX[:, 1]), np.max(mX[:, 1]), num = numGridPts)
XX0, XX1 = np.meshgrid(v0, v1)
# Each row of `XX` is a single grid point: (x0, x1)
XX = np.column_stack((XX0.reshape(-1), XX1.reshape(-1)))

def PlotTreeTrainTest(K: int) -> None:
    """
    Displays the decision regions of the K-th trained tree next to the Train / Test accuracy of the first K trees.

    Reads the notebook level variables `lTrees`, `vTrainRes`, `vTestRes`, `mX`, `vIdx0`, `vIdx1`, `XX0`, `XX1` and `XX`.

    Args:
        K: The 1-based index of the tree in `lTrees`.

    Raises:
        ValueError: If `K` is not in the range [1, len(lTrees)].
    """
    # Without the guard, `K = 0` silently selects the last tree (index -1) and plots empty accuracy curves
    numTrees = len(lTrees)
    if not (1 <= K <= numTrees):
        raise ValueError(f'K must be in the range [1, {numTrees}], got: {K}')

    # Predict the class of each grid point using the K-th tree
    Z = lTrees[K - 1].predict(XX)
    Z = Z.reshape(XX0.shape)

    _, hA = plt.subplots(1, 2, figsize = (16, 8))
    # The labels are {0, 1}, hence the regions are separated at 0.5.
    # A separating level of 0 coincides with the label of class 0 and pushes the drawn boundary onto the class 0 grid points.
    hA[0].contourf(XX0, XX1, Z, colors = CLASS_COLOR, alpha = 0.3, levels = [-0.5, 0.5, 1.5])
    hA[0].scatter(mX[vIdx0, 0], mX[vIdx0, 1], s = ELM_SIZE_DEF, color = CLASS_COLOR[0], edgecolor = EDGE_COLOR)
    hA[0].scatter(mX[vIdx1, 0], mX[vIdx1, 1], s = ELM_SIZE_DEF, color = CLASS_COLOR[1], edgecolor = EDGE_COLOR)

    # Accuracy of the first K trees
    vK = range(1, K + 1)
    hA[1].plot(vK, vTrainRes[:K], color = 'm', lw = 2, marker = '.', markersize = 20, label = 'Train Accuracy')
    hA[1].plot(vK, vTestRes[:K], color = 'k', lw = 2, marker = '.', markersize = 20, label = 'Test Accuracy')
    hA[1].set_title(f'Max Splits = {K}')
    hA[1].legend()

In [None]:
# Analysis results

# The slider value is passed to `PlotTreeTrainTest()` as `K`: the 1-based index into `lTrees`, one tree per split count in 1, 2, ..., maxSplits
kSlider = IntSlider(min = 1, max = maxSplits, step = 1, value = 1, layout = Layout(width = '30%'))
# Calls `PlotTreeTrainTest()` with the slider value as `K`
interact(PlotTreeTrainTest, K = kSlider)
# Explicit show (the matching call in the previous interactive cell is commented out)
plt.show()