# [HW 4] Kernel Ridge Regression Practice


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm


In [None]:
# Ensure the output directory for plots exists; an already-existing
# directory is not an error.
os.makedirs(name="./result", exist_ok=True)


In [None]:
def lstsq(A, b, lambda_=0):
    """Solve the (ridge-regularized) least-squares normal equations.

    Returns the ``w`` that satisfies
    ``(A.T @ A + lambda_ * I) @ w = A.T @ b``,
    where ``I`` is the identity of size ``A.shape[1]``.

    Args:
        A: 2-D design matrix.
        b: Right-hand side with ``A.shape[0]`` leading entries.
        lambda_: Regularization strength; ``0`` gives plain least squares.

    Returns:
        The solution computed by ``np.linalg.solve``.
    """
    n_features = A.shape[1]
    gram = A.T @ A
    rhs = A.T @ b
    return np.linalg.solve(gram + lambda_ * np.eye(n_features), rhs)


def heatmap(f, clip=True):
    """Draw ``f`` as a heatmap with contour lines and overlay the data points.

    Reads the module-level arrays ``X`` and ``y``.  ``f`` is evaluated on a
    72 x 72 grid whose two axes both run from ``np.min(X)`` to ``np.max(X)``
    (taken over all entries of ``X``).

    Args:
        f: Callable ``f(x, y)`` taking two flattened coordinate arrays and
            returning one value per grid point.
        clip: When true, the values returned by ``f`` are limited to
            ``[-5, 5]`` (in place) before plotting.
    """
    # example: heatmap(lambda x, y: x * x + y * y)
    ticks = np.linspace(np.min(X), np.max(X), 72)
    grid_x, grid_y = (g.ravel() for g in np.meshgrid(ticks, ticks))
    values = f(grid_x, grid_y)

    if clip:
        # Saturate extreme predictions so they do not dominate the color scale.
        values[values > 5] = 5
        values[values < -5] = -5

    plt.hexbin(grid_x, grid_y, C=values, gridsize=50, cmap=cm.jet, bins=None)
    plt.colorbar()
    contours = plt.contour(
        ticks,
        ticks,
        values.reshape(ticks.size, ticks.size),
        [-2, -1, -0.5, 0, 0.5, 1, 2],
        cmap=cm.jet,
    )
    plt.clabel(contours, inline=1, fontsize=10)

    # Overlay the samples: label +1 as red '+', label -1 as blue 'v'.
    for label, color, marker in ((+1.0, 'red', '+'), (-1.0, 'blue', 'v')):
        mask = y[:] == label
        plt.scatter(X[mask, 0], X[mask, 1], c=color, marker=marker)


In [None]:
# Base names of the datasets; each is loaded from "<name>.npz".
data_names = [
    'circle',
    'heart',
    'asymmetric',
]


## (a) Visualize the Datasets

**Visualize all the datasets.
Label the points with different $y$ values with different colors and/or shapes.**


In [None]:
# One figure holding a vertically stacked panel per dataset.  The figure is
# created once, outside the loop: creating it per iteration would put every
# dataset into its own figure with the other panels left empty.
plt.figure(figsize=[6, 12])
for i, dataset in enumerate(data_names):
    # np.load returns an NpzFile that keeps the archive open; the `with`
    # block closes it as soon as the arrays have been read.
    with np.load(dataset + '.npz') as data:
        X = data["x"]
        y = data["y"]

    plt.subplot(len(data_names), 1, i + 1)
    ### start viz_data ###
    # Same marker convention as `heatmap`; assumes labels are +1 / -1.
    pos = y == +1.0
    neg = y == -1.0
    plt.scatter(X[pos, 0], X[pos, 1], c='red', marker='+', label='y = +1')
    plt.scatter(X[neg, 0], X[neg, 1], c='blue', marker='v', label='y = -1')
    ### end viz_data ###
    plt.legend()
    plt.title(dataset)
plt.show()


## (b) Polynomial Regression (Non-kernel)

**Implement polynomial ridge regression** to
fit the datasets `circle.npz`, `asymmetric.npz`, and
`heart.npz`.  Use the first 80% of the data as the training dataset and the
last 20% of the data as the validation dataset.
**Report both the average
training squared loss and the average validation squared loss for polynomial order
$p \in \{1, \dots, 16\}$.  Use the regularization term $\lambda=0.001$ for all
$p$.  Visualize your result and attach the heatmap plots for the
learned predictions over the entire 2D domain for $p \in \{2, 4, 6, 8, 10,
12\}$ in your writeup.**


In [None]:
# Choose the dataset by (un)commenting exactly one of the lines below.
# data_file = 'circle.npz'
data_file = 'heart.npz'
# data_file = 'asymmetric.npz'

SPLIT = 0.8  # leading fraction of the samples used for training

# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once both arrays have been read into memory.
with np.load(data_file) as data:
    X = data["x"]
    y = data["y"]
X /= np.max(X)  # normalize the data (in place, by the largest entry of X)

# First n_train rows train the model, the remaining rows validate it.
n_train = int(X.shape[0] * SPLIT)
X_train = X[:n_train]
X_valid = X[n_train:]
y_train = y[:n_train]
y_valid = y[n_train:]


In [None]:
def assemble_feature(x, D):
    """Create a vector of polynomial features up to order D from x

    Every monomial of total degree ``0..D`` in the columns of ``x`` becomes
    one output column.  Columns are ordered by increasing degree; within a
    degree they follow ``itertools.combinations_with_replacement`` over the
    column indices.  For two input columns and ``D = 2`` the result is
    ``[1, x0, x1, x0^2, x0*x1, x1^2]``.

    Args:
        x: Array of shape ``(n_samples, n_dims)``.
        D: Non-negative integer, the maximum total degree.

    Returns:
        Array of shape ``(n_samples, n_features)`` where ``n_features`` is
        the number of monomials of degree at most ``D`` in ``n_dims``
        variables.
    """
    ### start poly_features ###
    from itertools import combinations_with_replacement

    x = np.asarray(x)
    n_samples, n_dims = x.shape

    # Degree 0: the constant (bias) feature.
    columns = [np.ones(n_samples)]
    for degree in range(1, D + 1):
        # Each multiset of column indices of size `degree` is one monomial,
        # e.g. (0, 0, 1) -> x0 * x0 * x1.
        for idx in combinations_with_replacement(range(n_dims), degree):
            columns.append(np.prod(x[:, list(idx)], axis=1))
    poly_x = np.column_stack(columns)
    ### end poly_features ###
    return poly_x


LAMBDA = 0.001  # ridge regularization strength, shared by every order
isubplot = 0
plt.figure(figsize=[12, 10])
for D in range(1, 17):
    ### start poly_nonkernel ###
    # Lift both splits into the order-D polynomial feature space.
    feat_train = assemble_feature(X_train, D)
    feat_valid = assemble_feature(X_valid, D)

    # Ridge regression weights from the regularized normal equations.
    w = lstsq(feat_train, y_train, LAMBDA)

    # Average squared loss on each split.
    error_train = np.mean((feat_train @ w - y_train) ** 2)
    error_valid = np.mean((feat_valid @ w - y_valid) ** 2)

    # Condition number of the regularized system that `lstsq` solves.
    cond = np.linalg.cond(
        feat_train.T @ feat_train + LAMBDA * np.eye(feat_train.shape[1]))
    ### end poly_nonkernel ###
    if D in [2, 4, 6, 8, 10, 12]:
        isubplot += 1
        plt.subplot(3, 2, isubplot)
        # The lambda is evaluated inside heatmap() right away, so it sees the
        # current D and w rather than values from a later iteration.
        heatmap(lambda x, y: assemble_feature(np.vstack([x, y]).T, D) @ w)
        plt.title("D = %d" % D)
    print("p = {:2d}   train_error = {:10.6f}  validation_error = {:10.6f}  cond = {:14.6f}".
            format(D, error_train, error_valid, cond))
plt.show()


## (c) Polynomial Kernel Ridge Regression

**Implement kernel ridge regression** to fit the datasets
`circle.npz`, `heart.npz`, and optionally (due to the
computational requirements), `asymmetric.npz`. Use the polynomial
kernel $K(\vec x_i, \vec x_j) = (1 + \vec x_i^\top \vec x_j)^p$. Use the first
80\% data as the training dataset and the last 20\% data as the validation
dataset.  **Report both the average training squared loss and the average
validation squared loss for polynomial order $p \in \{1,\dots, 16\}$.  Use the
regularization term $\lambda=0.001$ for all $p$. For
`circle.npz`, also report the average training squared loss and
validation squared loss for polynomial order $p \in \{1,\dots, 24\}$ when you
use only the first 15\% of data as the training dataset and the final 85\% of data as
the validation dataset.  Based on the error, comment on when you want
to use a high-order polynomial in linear/ridge regression.**


In [None]:
# Choose the dataset by (un)commenting exactly one of the lines below.
# data_file = 'circle.npz'
data_file = 'heart.npz'
# data_file = 'asymmetric.npz'

SPLIT = 0.8  # leading fraction of the samples used for training

# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once both arrays have been read into memory.
with np.load(data_file) as data:
    X = data["x"]
    y = data["y"]
X /= np.max(X)  # normalize the data (in place, by the largest entry of X)

# First n_train rows train the model, the remaining rows validate it.
n_train = int(X.shape[0] * SPLIT)
X_train = X[:n_train]
X_valid = X[n_train:]
y_train = y[:n_train]
y_valid = y[n_train:]


In [None]:
def poly_kernel(X, XT, D):
    """Create the polynomial order D kernel matrix from X and X^T

    Computes ``K[i, j] = (1 + X[i] . XT[:, j]) ** D``.

    Args:
        X: Array of shape ``(n, d)``; one sample per row.
        XT: Array of shape ``(d, m)``; the second sample set, already
            transposed so that each column is one sample.
        D: Polynomial order of the kernel.

    Returns:
        Kernel matrix of shape ``(n, m)``.
    """
    ### start poly_kernel_helper ###
    K = (1 + X @ XT) ** D
    ### end poly_kernel_helper ###
    return K

LAMBDA = 0.001  # ridge regularization strength, shared by every order
isubplot = 0
plt.figure(figsize=[12, 10])
# Orders 1..16 inclusive, matching the non-kernel experiment.
for D in range(1, 17):
    ### start poly_kernel ###
    # Kernel ridge regression: solve (K + lambda * I) a = y for the dual
    # coefficients, then predict with k(x, X_train) @ a.
    K_train = poly_kernel(X_train, X_train.T, D)
    a = np.linalg.solve(K_train + LAMBDA * np.eye(K_train.shape[0]), y_train)

    # Average squared loss on each split.
    error_train = np.mean((K_train @ a - y_train) ** 2)
    error_valid = np.mean(
        (poly_kernel(X_valid, X_train.T, D) @ a - y_valid) ** 2)

    if D in [2, 4, 6, 8, 10, 12]:
        isubplot += 1
        plt.subplot(3, 2, isubplot)
        # The lambda is evaluated inside heatmap() right away, so it sees the
        # current D and a rather than values from a later iteration.
        heatmap(lambda x, y: poly_kernel(
            np.column_stack([x, y]), X_train.T, D) @ a)
        plt.title("D = %d" % D)
    print("p = {:2d}   train_error = {:10.6f}  validation_error = {:10.6f}".
          format(D, error_train, error_valid))
    ### end poly_kernel ###
plt.show()


_Your comments here..._


## (d) Diminishing Influence of the Prior with Growing Data

As the amount of data increases, the gains from regularization diminish.
**Sample a subset of training data from the first 80\% of data from
`asymmetric.npz` and use the data from the last 20\% of data for
validation. Make a plot whose $x$ axis is the amount of the training
data and $y$ axis is the validation squared loss of the non-kernelized ridge
regression algorithm.
Include 6 curves for hyper-parameters $\lambda \in \{0.0001,
0.001, 0.01\}$ and $p = \{5, 6\}$.**
Your plot should demonstrate that, for the same
$p$, the validation squared loss will converge with enough data, regardless of
the choice of $\lambda$.  You can use log scaling on the
$x$ axis for clarity, and you need to resample the data multiple times for a
given $p$, $\lambda$, and amount of training data in order to get a smooth curve.


In [None]:
SPLIT = 0.8  # leading fraction of the samples used for training

# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once both arrays have been read into memory.
with np.load('asymmetric.npz') as data:
    X = data["x"]
    y = data["y"]
X /= np.max(X)  # normalize the data (in place, by the largest entry of X)

# First n_train rows train the model, the remaining rows validate it.
n_train = int(X.shape[0] * SPLIT)
X_train = X[:n_train]
X_valid = X[n_train:]
y_train = y[:n_train]
y_valid = y[n_train:]


In [None]:
### start vanishing_prior ###

### end vanishing_prior ###


## (e) RBF Kernel Ridge Regression

A popular kernel function that is widely used in various kernelized
learning algorithms is called the radial basis function kernel (RBF kernel).
It is defined as
\begin{equation} K(\mathbf{x}, \mathbf{x}') = \exp \left(-\frac{\lVert
\mathbf{x}-\mathbf{x}'\rVert_2^2}{2\sigma^2}\right).
\end{equation}
**Implement the RBF kernel function for kernel ridge regression to fit the dataset
`heart.npz`.  Use the regularization term $\lambda=0.001$.
Report the average squared loss, visualize your result and attach the
heatmap plots for the fitted functions over the 2D domain for $\sigma \in \{10,
3, 1, 0.3, 0.1, 0.03\}$ in your writeup.**
You may want to vectorize your kernel
functions to speed up your implementation. **Comment on the effect of
$\sigma$.**


In [None]:
# Choose the dataset by (un)commenting exactly one of the lines below.
# data_file = 'circle.npz'
data_file = 'heart.npz'
# data_file = 'asymmetric.npz'

SPLIT = 0.8  # leading fraction of the samples used for training

# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once both arrays have been read into memory.
with np.load(data_file) as data:
    X = data["x"]
    y = data["y"]
X /= np.max(X)  # normalize the data (in place, by the largest entry of X)

# First n_train rows train the model, the remaining rows validate it.
n_train = int(X.shape[0] * SPLIT)
X_train = X[:n_train]
X_valid = X[n_train:]
y_train = y[:n_train]
y_valid = y[n_train:]


In [None]:
def rbf_kernel(X, XT, sigma):
    """Create the RBF (Gaussian) kernel matrix from X and X^T

    Computes ``K[i, j] = exp(-||X[i] - XT[:, j]||^2 / (2 * sigma^2))``.

    Args:
        X: Array of shape ``(n, d)``; one sample per row.
        XT: Array of shape ``(d, m)``; the second sample set, already
            transposed so that each column is one sample.
        sigma: Kernel bandwidth (positive).

    Returns:
        Kernel matrix of shape ``(n, m)`` with entries in ``[0, 1]``.
    """
    ### start rbf_kernel_helper ###
    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, evaluated for all pairs at once
    # via broadcasting of an (n, 1) column against a (1, m) row.
    sq_dist = (
        np.sum(X * X, axis=1, keepdims=True)
        - 2 * (X @ XT)
        + np.sum(XT * XT, axis=0, keepdims=True)
    )
    # Round-off in the expansion can yield tiny negative distances.
    sq_dist = np.maximum(sq_dist, 0)
    K = np.exp(-sq_dist / (2 * sigma ** 2))
    ### end rbf_kernel_helper ###
    return K

LAMBDA = 0.001  # ridge regularization strength, shared by every bandwidth
plt.figure(figsize=[12, 10])
isubplot = 0
for sigma in [10, 3, 1, 0.3, 0.1, 0.03]:
    ### start rbf_kernel ###
    # Kernel ridge regression: solve (K + lambda * I) a = y for the dual
    # coefficients, then predict with k(x, X_train) @ a.
    K_train = rbf_kernel(X_train, X_train.T, sigma)
    a = np.linalg.solve(K_train + LAMBDA * np.eye(K_train.shape[0]), y_train)

    # Average squared loss on each split.
    error_train = np.mean((K_train @ a - y_train) ** 2)
    error_valid = np.mean(
        (rbf_kernel(X_valid, X_train.T, sigma) @ a - y_valid) ** 2)
    print("sigma = {:6.3f}  train_error = {:10.6f}  validation_error = {:10.6f}".
          format(sigma, error_train, error_valid))

    isubplot += 1
    plt.subplot(3, 2, isubplot)
    # The lambda is evaluated inside heatmap() right away, so it sees the
    # current sigma and a rather than values from a later iteration.
    heatmap(lambda x, y: rbf_kernel(
        np.column_stack([x, y]), X_train.T, sigma) @ a)
    plt.title("sigma = %g" % sigma)
    ### end rbf_kernel ###
plt.show()


_Your comments here..._


## (g) Experimentation

**Disable the `clip` option in the provided `heatmap`
function and redraw the heatmap plots for the functions learned by the
polynomial kernel and RBF kernel.  Experiment on the provided datasets and
describe one potential problem of the polynomial kernel related to what
you see here. Does the RBF kernel have such a problem?  Compute,
compare, comment, and attach the heatmap plots of the polynomial kernel and the
RBF kernel on `heart.npz` dataset.**


In [None]:
SPLIT = 0.8  # leading fraction of the samples used for training

# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once both arrays have been read into memory.
with np.load('heart.npz') as data:
    X = data["x"]
    y = data["y"]
X /= np.max(X)  # normalize the data (in place, by the largest entry of X)

# First n_train rows train the model, the remaining rows validate it.
n_train = int(X.shape[0] * SPLIT)
X_train = X[:n_train]
X_valid = X[n_train:]
y_train = y[:n_train]
y_valid = y[n_train:]


In [None]:
# Your code here...
