In [1]:
#######################
# standard code block #
#######################

# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline

# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']


#######################
#       imports       #
#######################
import pandas as pd
import seaborn as sns
# import sklearn

sns.set_style("whitegrid")

Populating the interactive namespace from numpy and matplotlib


# One last thing

- SVMs have an interesting history that we haven't quite had the time to discuss in detail. 
- In a nutshell, 
    - SVMs were one of the most popular off-the-shelf models during the late 1990s and early 2000s.
    - Two recent trends have led SVMs to be less used:
        - The success of Deep Learning for nonlinear classification problems.
        - The use of ever-bigger datasets.
        
Let's zero in on this last point and then discuss how and why SVMs can remain useful today.

## Big Data Woes

SVMs run into two problems with big data:


- First, the **test complexity**
    - SVM test complexity depends on the kernel but is generally $O(n_\text{sv}d)$ where $n_\text{sv}$ is the number of support vectors and $d$ is the number of dimensions. <sup> [1](https://arxiv.org/pdf/1403.0736.pdf) </sup>
    - In the worst case, all training examples are chosen as support vectors and SVM has the same test complexity as other non-parametric models like KNN: $O(nd)$. 
    - This is bad. Ideally more training data will always make our model better, but in this case it slows SVMs down.

- Second, the **train complexity**
    - For linear SVMs, the train complexity is between $O(n^2)$ and $O(n^3)$. <sup>[2](https://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf)</sup>
    - For kernel SVMs, train complexity depends on many factors but shares a lower bound at $O(n^2)$ and can often be much worse in practice. <sup>[2](https://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf)</sup>
    

In [3]:
from sklearn import metrics

pal = dict(enumerate(sns.color_palette("husl", 4)))

def plot_decision_boundary(pred_func, x, y, ax=None, points=1e3, pal=pal, margin_func=None):
    """Scatter a 2-D dataset and shade the regions predicted by ``pred_func``.

    Parameters
    ----------
    pred_func : callable
        Called once on ``x`` (to score accuracy) and once on a grid of
        2-column points covering the axes; the grid result is reshaped to
        the grid, so it must return one label per input row.
    x : array
        Points to plot; columns 0 and 1 are the horizontal and vertical
        coordinates.
    y : array
        Labels for ``x``; used to colour the scatter and to compute accuracy.
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure and axes are created when omitted.
    points : number, optional
        Approximate total number of grid points; the grid has
        ``int(sqrt(points))`` points per side.
    pal : dict, optional
        Palette passed to seaborn for the scatter; its values are also used
        as the fill colours of the predicted regions.
    margin_func : callable, optional
        When given, it is evaluated on the same grid and its -1 and +1 level
        curves are drawn as dashed lines.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    # accuracy_score expects (y_true, y_pred).
    y_pred = pred_func(x)
    score = metrics.accuracy_score(y.flatten(), y_pred.flatten())

    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, alpha=.5, edgecolor=None, palette=pal, ax=ax)

    side_pts = int(np.sqrt(points))

    # Read the limits only after the scatter so the grid spans the plotted data.
    x0_min, x0_max = ax.get_xlim()
    x1_min, x1_max = ax.get_ylim()
    xx, yy = np.meshgrid(
        np.linspace(x0_min, x0_max, num=side_pts),
        np.linspace(x1_min, x1_max, num=side_pts))
    grid = np.c_[xx.ravel(), yy.ravel()]

    Z = pred_func(grid).reshape(xx.shape)

    # Accuracy label, anchored at the centre of the axes.
    ax.text(
        (x0_min + x0_max) / 2,
        (x1_min + x1_max) / 2,
        f"acc: {score:.1%}",
        bbox=dict(boxstyle="round", fc="white", ec="black"))

    # Predicted regions go behind the scatter points.
    ax.contourf(xx, yy, Z, alpha=0.2, colors=list(pal.values()), zorder=-1)

    if margin_func is not None:
        margins = margin_func(grid).reshape(xx.shape)

        # plot the margins (level curves at -1 and +1)
        ax.contour(xx, yy, margins, colors='k', levels=[-1, 1], alpha=0.5,
                   linestyles=['--', '--'], zorder=0)

    return ax

In [4]:
from ipywidgets import interact, interactive, HBox, VBox, interactive_output, widgets

from IPython.display import display

In [5]:
# dataset generation from https://scikit-learn.org/stable/auto_examples/cluster/plot_linkage_comparison.html#sphx-glr-auto-examples-cluster-plot-linkage-comparison-py
from sklearn import datasets

n_samples = 500
# Fixed seed so that re-running the notebook regenerates the same datasets.
random_state = 170

noisy_circles = datasets.make_circles(
    n_samples=n_samples, factor=.5, noise=.05, random_state=random_state)
noisy_moons = datasets.make_moons(
    n_samples=n_samples, noise=.05, random_state=random_state)
blobs = datasets.make_blobs(n_samples=n_samples, centers=2, random_state=0)

# Anisotropically distributed data: the same blobs pushed through a linear map
X, y = datasets.make_blobs(n_samples=n_samples, centers=2, random_state=0)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(
    n_samples=n_samples,
    centers=2,
    cluster_std=[.7, 2.5],
    random_state=random_state)

# Names of the dataset variables above, and the kernels offered in the widgets.
datasets_str = ["noisy_circles", "noisy_moons", "blobs", "aniso", "varied"]
kernels = ['linear', 'poly', 'rbf']

In [6]:
from sklearn import svm

def kernel_widget(d_name, k_name, degree=3, c=1.0, gamma=.5):
    """Fit an SVC on a named dataset and plot its decision regions.

    Parameters
    ----------
    d_name : str
        Name of a module-level variable holding an ``(x, y)`` pair.
    k_name : str
        Kernel name passed to ``svm.SVC``.
    degree : int, optional
        Passed to ``svm.SVC`` as ``degree``.
    c : float, optional
        Passed to ``svm.SVC`` as ``C``.
    gamma : float, optional
        Passed to ``svm.SVC`` as ``gamma``.
    """
    # Look the dataset up by name instead of eval()-ing the string: same result
    # for a valid name, but it cannot execute arbitrary expressions.
    x, y = globals()[d_name]

    svm_model = svm.SVC(
        kernel=k_name, gamma=gamma, degree=degree, C=c, cache_size=1000, max_iter=1000)
    svm_model.fit(x, y)

    # Draw everything on one explicit Axes rather than relying on pyplot's
    # notion of the "current" axes.
    fig, ax = plt.subplots()

    plot_decision_boundary(
        svm_model.predict,
        x,
        y,
        ax=ax,
        points=1e4,
        margin_func=svm_model.decision_function)

    support_vectors = svm_model.support_vectors_
    ax.scatter(
        support_vectors[:, 0],
        support_vectors[:, 1],
        marker="x",
        c="black",
        zorder=-1,
        label="sv")

    n_sv = support_vectors.shape[0]

    ax.set_title(f"{k_name}-svm on {d_name}. d={degree}, c={c:g}, gamma={gamma:g}. {n_sv} support vectors" )

    ax.legend()

In [7]:
# Controls for the SVM demo: dataset and kernel pickers plus hyperparameter sliders.
d_name_w = widgets.RadioButtons(
    options=datasets_str, description='Dataset:', disabled=False)
k_name_w = widgets.RadioButtons(
    options=kernels, description='Kernel:', disabled=False)
degree_w = widgets.IntSlider(
    value=3,
    min=0,
    max=10,
    step=1,
    description='Degree: ',
    continuous_update=False)
# Log-scale sliders: min/max/step are exponents of `base`, value is the actual number.
c_w = widgets.FloatLogSlider(
    value=1,
    base=10,
    min=-2,  # min exponent of base
    max=6,  # max exponent of base
    step=0.5,  # exponent step
    description='C: ',
    continuous_update=False)
gamma_w = widgets.FloatLogSlider(
    value=.5,
    base=2,
    min=-5,  # min exponent of base
    max=6,  # max exponent of base
    step=0.5,  # exponent step
    description='Gamma: ',
    continuous_update=False)

# Lay the controls out in three side-by-side columns.
ui = HBox([
    VBox([d_name_w]),
    VBox([k_name_w]),
    VBox([degree_w, c_w, gamma_w])
])

In [8]:
# Bind each control to the kernel_widget argument of the same name.
out = interactive_output(
    kernel_widget,
    dict(
        d_name=d_name_w,
        k_name=k_name_w,
        degree=degree_w,
        c=c_w,
        gamma=gamma_w,
    ),
)

# Addressing Test Complexity

- If the main problem with test complexity is the number of support vectors, we can try to tackle that directly.
    - One important strategy here is to tune the model so that it learns fewer support vectors.

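Note: You can also control the number of support vectors more directly with [`NuSVC`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html), whose `nu` parameter is a lower bound on the fraction of training examples that become support vectors.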

# Exercise: Tuning can reduce the number of support vectors

Let's revisit the previous widget. This time, see if you can determine how to tune the model to reduce the number of support vectors.

In [10]:
# Render the widget controls together with the linked output area.
display(ui, out)

HBox(children=(VBox(children=(RadioButtons(description='Dataset:', options=('noisy_circles', 'noisy_moons', 'b…

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<Figure size 432x288 with 1 Axes>', 'i…

# Solution:

For all kernels: 
  - Increasing $C$ tends to reduce the number of support vectors.
  - Reducing training error (by finding a kernel or parameters that better capture the data) tends to reduce the number of support vectors.
    
For poly and RBF:
  - Increasing `gamma` tends to reduce the number of support vectors.

# Addressing Train Complexity

# Stochastic Gradient Descent for SVMs

- Recall from the [SGD lesson](../../project-02/stochastic-gradient-descent) that training a model with SGD has an impressive time complexity: In the limit, the time to train a model with SGD is $O(1)$ in terms of the number of examples. 💁‍
- When dealing with very large datasets, it's a good idea to train with SGD wherever possible. 


- Can we train SVMs with SGD?
    - We *can* train SVMs with SGD,
    - but there's *one* problem: this only works for **linear kernels**. 

# Enter Kernel approximation

## But first a kernel trick review

As we discussed before, the kernel trick implicitly calculates the inner product $\phi(\mathbf{x})^\top\phi(\mathbf{x}')$ between two transformed examples.

$$K(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^\top\phi(\mathbf{x}')$$

Without the kernel trick, if we wish to transform our data into a high-degree polynomial space, we would need to explicitly transform each point using $\phi(\mathbf{x})$ and then we would need to multiply the transformed examples $\phi(\mathbf{x})^\top\phi(\mathbf{x}')$. The kernel trick allows us to arrive at the result without first performing the transformation. 

## Kernel approximation

Kernel approximation is another method for efficiently using kernels. Instead of evaluating the kernel implicitly, we find an approximate feature map $\phi_a(\mathbf{x}) \approx \phi(\mathbf{x})$ and use that to *explicitly* transform the examples before we feed them to a linear SVM.

Finally, we reproduce the above widgets, showing a comparison between the kernel trick and kernel approximation.

In [11]:
from sklearn import kernel_approximation, linear_model, pipeline

def approx_widget(d_name, k_name, degree=3, c=1.0, gamma=.5, n_components=100):
    """Plot an exact kernel SVM next to an SGD-trained counterpart.

    The left panel shows an ``svm.SVC`` fitted with the requested kernel. The
    right panel shows a ``linear_model.SGDClassifier``; for non-linear kernels
    it is preceded by a Nystroem kernel approximation, for the linear kernel
    it is trained on the raw features.

    Parameters
    ----------
    d_name : str
        Name of a module-level variable holding an ``(x, y)`` pair.
    k_name : str
        Kernel name, used for both ``svm.SVC`` and the Nystroem approximation.
    degree : int, optional
        Passed as ``degree`` to both the SVC and the Nystroem approximation.
    c : float, optional
        Passed to ``svm.SVC`` as ``C``. It is not passed to the SGD model.
    gamma : float, optional
        Passed as ``gamma`` to both the SVC and the Nystroem approximation.
    n_components : int, optional
        Number of components for the Nystroem approximation.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(10, 5))

    # Look the dataset up by name instead of eval()-ing the string: same result
    # for a valid name, but it cannot execute arbitrary expressions.
    x, y = globals()[d_name]

    svm_model = svm.SVC(
        kernel=k_name,
        gamma=gamma,
        degree=degree,
        C=c,
        cache_size=1000,
        max_iter=1000)
    svm_model.fit(x, y)

    svm_sgd = linear_model.SGDClassifier(max_iter=500, tol=1e-3)
    # Only non-linear kernels need the explicit feature map. The check must be
    # on the kernel name; comparing the dataset name never matches "linear".
    uses_approximation = k_name != "linear"
    if uses_approximation:
        svm_sgd = pipeline.make_pipeline(
            kernel_approximation.Nystroem(
                kernel=k_name,
                gamma=gamma,
                degree=degree,
                n_components=n_components), svm_sgd)

    svm_sgd.fit(x, y)

    # Left panel: exact kernel SVM with its support vectors marked.
    plot_decision_boundary(
        svm_model.predict,
        x,
        y,
        ax=axes[0],
        points=1e4,
        margin_func=svm_model.decision_function)
    support_vectors = svm_model.support_vectors_
    axes[0].scatter(
        support_vectors[:, 0],
        support_vectors[:, 1],
        marker="x",
        c="black",
        zorder=-1,
        label="sv")
    n_sv = support_vectors.shape[0]
    axes[0].set_title(f"Exact SVM. {n_sv} support vectors")
    axes[0].legend()

    # Right panel: SGD-trained model.
    plot_decision_boundary(
        svm_sgd.predict,
        x,
        y,
        ax=axes[1],
        points=1e4,
        margin_func=svm_sgd.decision_function)
    if uses_approximation:
        axes[1].set_title(f"SGD SVM. {n_components} components")
    else:
        # n_components is unused without the Nystroem step.
        axes[1].set_title("SGD SVM. No kernel approximation")

    fig.suptitle(
        f"{k_name}-svm on {d_name}. d={degree}, c={c:g}, gamma={gamma:g}.")

In [12]:
# Fresh set of controls for the comparison demo, with an extra slider for the number of components.
d_name_w = widgets.RadioButtons(
    options=datasets_str, description='Dataset:', disabled=False)
k_name_w = widgets.RadioButtons(
    options=kernels, description='Kernel:', disabled=False)
degree_w = widgets.IntSlider(
    value=3,
    min=0,
    max=10,
    step=1,
    description='Degree: ',
    continuous_update=False)
# Log-scale sliders: min/max/step are exponents of `base`, value is the actual number.
c_w = widgets.FloatLogSlider(
    value=1,
    base=10,
    min=-2,  # min exponent of base
    max=6,  # max exponent of base
    step=0.5,  # exponent step
    description='C: ',
    continuous_update=False)
gamma_w = widgets.FloatLogSlider(
    value=.5,
    base=2,
    min=-5,  # min exponent of base
    max=6,  # max exponent of base
    step=0.5,  # exponent step
    description='Gamma: ',
    continuous_update=False)
# Plain integer slider: min/max/step are the values themselves, not exponents.
n_components_w = widgets.IntSlider(
    value=100,
    min=50,  # smallest selectable value
    max=500,  # largest selectable value
    step=50,  # increment between values
    description='N Components: ',
    continuous_update=False)
# Lay the controls out in three side-by-side columns.
ui = HBox([
    VBox([d_name_w]),
    VBox([k_name_w]),
    VBox([degree_w, c_w, gamma_w, n_components_w])
])

# Exercise

Confirm that SVMs trained with SGD and kernel approximation are similar to kernel SVMs. 

- What differences do you notice?

In [13]:
# Bind each control to the approx_widget argument of the same name, then show
# the controls together with the linked output area.
out = interactive_output(
    approx_widget,
    dict(
        d_name=d_name_w,
        k_name=k_name_w,
        degree=degree_w,
        c=c_w,
        gamma=gamma_w,
        n_components=n_components_w,
    ),
)
display(ui, out)

HBox(children=(VBox(children=(RadioButtons(description='Dataset:', options=('noisy_circles', 'noisy_moons', 'b…

Output()

# Solution

- What differences do you notice?
    - There are no support vectors. `SGDClassifier` learns a weight vector $\mathbf{w}$ and a bias $b$ for the linear function $f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x} + b$.
    - The hyperparameter `C` doesn't affect the SGD SVM. This is because `SGDClassifier` uses a separate regularization parameter `alpha` which, depending on the kernel, is roughly $\alpha \sim \frac{1}{C}$.
    - Accuracy tends to increase as the number of components increases. However, be careful because the time to train also increases with the number of components. 

# Main points

 
- When using SVMs on big data we may
    - Tune hyperparameters to reduce the number of support vectors.
    - Use SGD along with kernel approximation to efficiently train on large datasets.