# Dis12 - Decision Boundary Visualization on Decision Tree, Random Forest, and Adaboost


In this discussion, we will visualize the decision boundaries of different decision-tree-based models: a basic decision tree, a random forest, and AdaBoost.


In [None]:
import matplotlib.pyplot as plt
import seaborn
import matplotlib.patches as patches
%matplotlib inline
seaborn.set(font_scale=2)
seaborn.set_style("white")

from sklearn.preprocessing import normalize
import numpy as np
import ipywidgets as widgets
from ipywidgets import interactive


### Generate Data

We first generate data points with labels $y\in \{+1, -1\}$ such that any two points $x_i$, $x_j$ from different classes are more than $r^{\prime}$ apart, i.e.,

$$\| x_i - x_j \|_{2} > r^{\prime}, \quad \text{for}\, y_i \neq y_j.$$


In [None]:
def cal_radius(x):
    '''Return the Euclidean length of the 2d vector ``x``.

    Only ``x[0]`` and ``x[1]`` are read; any further components are ignored.
    '''
    first, second = x[0], x[1]
    squared_length = first ** 2 + second ** 2
    return np.sqrt(squared_length)


# Rejection-sample m points in the unit square. A candidate is accepted only
# if it is more than r away from every point accepted so far and satisfies
# |x0 - x1| > r, i.e. it lies outside the band between y = x - r and y = x + r.
m = 200
r = 0.05
epsilon = r/2

np.random.seed(221)
x_train = [np.random.uniform(size=(2))]

while len(x_train) < m:
    candidate = np.random.uniform(size=(2))
    clear_of_others = all(cal_radius(candidate - kept) > r for kept in x_train)
    if clear_of_others and np.abs(candidate[0] - candidate[1]) > r:
        x_train.append(candidate)

X_train = np.array(x_train)
# Label +1 where the second coordinate exceeds the first, -1 otherwise.
y_train = np.where(X_train[:, 1] - X_train[:, 0] > 0, 1.0, -1.0)


**Visualize the data points.**


In [None]:
# Scatter the training points coloured by label and overlay the dashed
# lines y = x + r and y = x - r.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(*X_train.T, c=y_train, cmap="coolwarm", s=70)
ax.axis("equal")
ax.axis([0, 1, 0, 1])
dashed_black = dict(color='black')
ax.plot([0, 1.0], [r, 1.0 + r], '--', **dashed_black)
ax.plot([0, 1.0], [-r, 1.0 - r], '--', **dashed_black)


**Setup the visualization functions.**


In [None]:
def visualize_decision_boundary(clf, X, y, x, depth=None, num_trees=None):
    '''Visualize the decision boundaries of classifiers.

    Evaluates ``clf.predict`` on a 500 x 500 grid over the unit square, fills
    the grid by prediction sign, and scatters the points ``X`` on top.

    Parameters
    ----------
    clf : fitted model exposing ``predict``.
    X : array whose columns 0 and 1 are the coordinates of the points to draw.
    y : values used to colour the scattered points.
    x : not used by this function; kept so existing call sites keep working.
    depth : value shown as the depth in the figure title.
    num_trees : number of trees shown in the title; left out when ``None``.

    The dashed lines use the module-level ``r``.
    '''
    grid_size = 500
    ticks = np.linspace(0, 1, grid_size)
    XX, YY = np.meshgrid(ticks, ticks)
    X0 = np.stack([np.ravel(XX), np.ravel(YY)]).T
    ZZ = clf.predict(X0).reshape(grid_size, grid_size)

    fig, ax = plt.subplots(figsize=(8, 8))
    # Identity check: ``== None`` would dispatch to the argument's __eq__.
    if num_trees is None:
        plt.title('Decision Boundary, depth={}'.format(depth), pad=20)
    else:
        plt.title('Decision Boundary, depth={}, number of trees={}'.format(depth, num_trees), pad=20)
    # Levels are -1000, 0, 1000: two bands, so the fill colour depends only
    # on the sign of the prediction.
    ax.contourf(XX, YY, ZZ, cmap="coolwarm", levels=np.linspace(-1000, 1000, 3))
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap="coolwarm", s=70)
    ax.plot([0, 1.0], [0.0 + r, 1.0 + r], '--', color='black')
    ax.plot([0, 1.0], [0.0 - r, 1.0 - r], '--', color='black')
    ax.axis("equal")
    ax.axis([0, 1, 0, 1])


In [None]:
def visualize_decision_box(clf, X, y, x, depth=None):
    '''Visualize the decision boundaries as well as the boxes of decision tree.

    Parameters
    ----------
    clf : fitted model exposing ``apply`` and ``predict``. ``apply`` is
        evaluated on a 1000 x 1000 grid and contour lines are drawn at each of
        its distinct values; ``predict`` is evaluated on a 500 x 500 grid and
        filled by prediction sign.
    X : array whose columns 0 and 1 are the coordinates of the points to draw.
    y : values used to colour the scattered points.
    x : not used by this function; kept so existing call sites keep working.
    depth : value shown as the depth in the figure title.

    The dashed lines use the module-level ``r``.
    '''
    fig, ax = plt.subplots(figsize=(8, 8))
    plt.title('Boxes of Decision Tree, depth={}'.format(depth), pad=20)

    # Black contour lines separating the distinct values returned by apply().
    fine_ticks = np.linspace(0, 1, 1000)
    XX, YY = np.meshgrid(fine_ticks, fine_ticks)
    X0 = np.stack([np.ravel(XX), np.ravel(YY)]).T
    y0 = clf.apply(X0)
    ax.contour(XX, YY, y0.reshape(1000, 1000), levels=np.unique(y0), colors='k')
    ax.plot([0, 1.0], [0.0 + r, 1.0 + r], '--', color='black')
    ax.plot([0, 1.0], [0.0 - r, 1.0 - r], '--', color='black')

    # Filled prediction regions on a coarser grid; levels -1000, 0, 1000 split
    # the fill by the sign of the prediction.
    coarse_ticks = np.linspace(0, 1, 500)
    XX, YY = np.meshgrid(coarse_ticks, coarse_ticks)
    X0 = np.stack([np.ravel(XX), np.ravel(YY)]).T
    ZZ = clf.predict(X0).reshape(500, 500)
    ax.contourf(XX, YY, ZZ, cmap="coolwarm", levels=np.linspace(-1000, 1000, 3))

    # The points are scattered once, after the fill. An earlier s=20 scatter of
    # the same points was dropped: these larger markers sit at the same
    # positions with the same colours, so the small ones were never visible.
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap="coolwarm", s=70)
    ax.axis("equal")
    ax.axis([0, 1, 0, 1])


## Part (I). Visualize in the 2d case


**Decision tree:**

We first visualize the decision boundary of a standard decision tree. Besides the decision boundary, we also visualize which leaf node each point belongs to, **since each leaf node corresponds to a 2d box** in this 2d case.


In [None]:
def generate_tree_depth():
    '''Build the integer slider for the tree depth (1 to 10, initially 5).'''
    slider_options = {
        'value': 5,
        'min': 1,
        'max': 10,
        'step': 1,
        'description': 'depth of decision tree',
        'continuous_update': False,
    }
    return widgets.IntSlider(**slider_options)

def generate_tree_num():
    '''Build the integer slider for the number of trees (1 to 500, initially 20).'''
    slider_options = {
        'value': 20,
        'min': 1,
        'max': 500,
        'step': 1,
        'description': 'number of trees',
        'continuous_update': False,
    }
    return widgets.IntSlider(**slider_options)

def generate_max_samples():
    '''Build the integer slider for ``max_samples`` (1 to 200, initially 20).'''
    slider_options = {
        'value': 20,
        'min': 1,
        'max': 200,
        'step': 1,
        'description': 'max samples',
        'continuous_update': False,
    }
    return widgets.IntSlider(**slider_options)


In [None]:
from sklearn.tree import DecisionTreeClassifier

def visualize_dt(depth_tree):
    '''Fit a decision tree with ``max_depth=depth_tree`` on the training data and draw it.

    The leaf-box figure is drawn first, followed by the plain
    decision-boundary figure.
    '''
    tree = DecisionTreeClassifier(max_depth=depth_tree).fit(X_train, y_train)
    for draw in (visualize_decision_box, visualize_decision_boundary):
        draw(tree, X_train, y_train, x_train, depth_tree)

# Drive visualize_dt from the depth slider; the bare name on the last line
# makes the widget the cell's output.
interactive_plot = interactive(
    visualize_dt,
    depth_tree=generate_tree_depth(),
)
interactive_plot


**Random forest:**

Question: What happens to the boundary shape when you increase `n_estimators` (the number of trees)?


In [None]:
from sklearn.ensemble import RandomForestClassifier

def visualize_rf(max_depth, max_samples, n_estimators):
    '''Fit a random forest on the training data and draw its decision boundary.

    The three arguments are forwarded to ``RandomForestClassifier`` under the
    same names; ``random_state`` is fixed to 0.
    '''
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_samples=max_samples,
        random_state=0,
    )
    forest.fit(X_train, y_train)
    visualize_decision_boundary(
        forest, X_train, y_train, x_train,
        depth=max_depth, num_trees=n_estimators,
    )

# One slider per argument of visualize_rf; the bare name on the last line
# makes the widget the cell's output.
interactive_plot = interactive(
    visualize_rf,
    max_depth=generate_tree_depth(),
    max_samples=generate_max_samples(),
    n_estimators=generate_tree_num(),
)
interactive_plot


**Adaboost:**


In [None]:
from sklearn.ensemble import AdaBoostClassifier
def visualize_adaboost(max_depth, n_estimators):
    '''Fit AdaBoost over depth-limited trees on the training data and draw its decision boundary.

    Parameters
    ----------
    max_depth : maximum depth of each ``DecisionTreeClassifier`` weak learner.
    n_estimators : forwarded to ``AdaBoostClassifier`` and shown in the title.
    '''
    weak_learner = DecisionTreeClassifier(max_depth=max_depth)
    # The weak learner is passed positionally rather than as `base_estimator=`:
    # scikit-learn 1.2 renamed that keyword to `estimator` and 1.4 removed the
    # old name, but it is the first positional parameter under either name.
    # NOTE(review): newer scikit-learn releases also deprecate `algorithm` —
    # confirm against the installed version before dropping it, since older
    # releases default to a different algorithm.
    clf = AdaBoostClassifier(weak_learner,
                             n_estimators=n_estimators,
                             random_state=None,
                             algorithm='SAMME')
    clf.fit(X_train, y_train)
    visualize_decision_boundary(clf, X_train, y_train, x_train, depth=max_depth, num_trees=n_estimators)

# One slider per argument of visualize_adaboost; the bare name on the last
# line makes the widget the cell's output.
interactive_plot = interactive(visualize_adaboost,
                               max_depth=generate_tree_depth(),
                               n_estimators=generate_tree_num())
interactive_plot


## Part (II). Visualize the 2d case with label noise in the training data.

In this part, we first randomly select a subset of the training samples and flip their labels. Then we visualize how the above three methods behave under label noise.


In [None]:
# Flip the sign of the label for a seeded random 5% of the training points;
# y_train itself is left untouched.
np.random.seed(189)

n_flipped = int(len(y_train)*0.05)
random_idx = np.random.choice(m, n_flipped, replace=False)
y_train_noise = y_train.copy()
y_train_noise[random_idx] = -y_train[random_idx]


In [None]:
# Scatter the training points coloured by their noisy labels and overlay the
# dashed lines y = x + r and y = x - r.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(*X_train.T, c=y_train_noise, cmap="coolwarm", s=70)
ax.axis("equal")
ax.axis([0, 1, 0, 1])
dashed_black = dict(color='black')
ax.plot([0, 1.0], [r, 1.0 + r], '--', **dashed_black)
ax.plot([0, 1.0], [-r, 1.0 - r], '--', **dashed_black)


**Decision tree:**

Change the depth of the tree and check whether you could achieve zero training error.


In [None]:
from sklearn.tree import DecisionTreeClassifier

def visualize_dt(depth_tree):
    '''Fit a depth-limited tree on the noisy labels, print its training error, and draw it.

    The training error is one minus the accuracy on ``(X_train, y_train_noise)``.

    NOTE(review): the tree is fitted and scored on ``y_train_noise`` but the
    figures colour the points by the clean ``y_train`` — confirm this is intended.
    '''
    tree = DecisionTreeClassifier(max_depth=depth_tree).fit(X_train, y_train_noise)
    train_accuracy = tree.score(X_train, y_train_noise)
    print('training error: ', 1.0 - train_accuracy)
    for draw in (visualize_decision_box, visualize_decision_boundary):
        draw(tree, X_train, y_train, x_train, depth_tree)

# Drive visualize_dt from the depth slider; the bare name on the last line
# makes the widget the cell's output.
interactive_plot = interactive(
    visualize_dt,
    depth_tree=generate_tree_depth(),
)
interactive_plot


**Random forest:**

Question: Set the number of trees to be large. Why is the behaviour of the random forest still reasonably good under label noise?


In [None]:
from sklearn.ensemble import RandomForestClassifier

def visualize_rf(max_depth, max_samples, n_estimators):
    '''Fit a random forest on the noisy labels, print its training error, and draw its boundary.

    The three arguments are forwarded to ``RandomForestClassifier`` under the
    same names; ``random_state`` is fixed to 0. The training error is one
    minus the accuracy on ``(X_train, y_train_noise)``.

    NOTE(review): the forest is fitted and scored on ``y_train_noise`` but the
    figure colours the points by the clean ``y_train`` — confirm this is intended.
    '''
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_samples=max_samples,
        random_state=0,
    )
    forest.fit(X_train, y_train_noise)
    train_accuracy = forest.score(X_train, y_train_noise)
    print('training error: ', 1.0 - train_accuracy)
    visualize_decision_boundary(
        forest, X_train, y_train, x_train,
        depth=max_depth, num_trees=n_estimators,
    )

# One slider per argument of visualize_rf; the bare name on the last line
# makes the widget the cell's output.
interactive_plot = interactive(
    visualize_rf,
    max_depth=generate_tree_depth(),
    max_samples=generate_max_samples(),
    n_estimators=generate_tree_num(),
)
interactive_plot


**Adaboost:**

Does AdaBoost behave better than a decision tree under label noise? In what sense does it behave better? Evaluate the training error of the AdaBoost model.


In [None]:
from sklearn.ensemble import AdaBoostClassifier
def visualize_adaboost(max_depth, n_estimators):
    '''Fit AdaBoost over depth-limited trees on the noisy labels, print its training error, and draw its boundary.

    Parameters
    ----------
    max_depth : maximum depth of each ``DecisionTreeClassifier`` weak learner.
    n_estimators : forwarded to ``AdaBoostClassifier`` and shown in the title.

    The training error is one minus the accuracy on ``(X_train, y_train_noise)``.

    NOTE(review): the model is fitted and scored on ``y_train_noise`` but the
    figure colours the points by the clean ``y_train`` — confirm this is intended.
    '''
    weak_learner = DecisionTreeClassifier(max_depth=max_depth)
    # The weak learner is passed positionally rather than as `base_estimator=`:
    # scikit-learn 1.2 renamed that keyword to `estimator` and 1.4 removed the
    # old name, but it is the first positional parameter under either name.
    # NOTE(review): newer scikit-learn releases also deprecate `algorithm` —
    # confirm against the installed version before dropping it, since older
    # releases default to a different algorithm.
    clf = AdaBoostClassifier(weak_learner,
                             n_estimators=n_estimators,
                             random_state=None,
                             algorithm='SAMME')
    clf.fit(X_train, y_train_noise)
    print('training error: ', 1.0 - clf.score(X_train, y_train_noise))
    visualize_decision_boundary(clf, X_train, y_train, x_train, depth=max_depth, num_trees=n_estimators)

# One slider per argument of visualize_adaboost; the bare name on the last
# line makes the widget the cell's output.
interactive_plot = interactive(visualize_adaboost,
                               max_depth=generate_tree_depth(),
                               n_estimators=generate_tree_num())
interactive_plot


**Congrats! Hope you get a better understanding of the tree-based methods!**
