# [HW 13] Neural Tangent Kernels

## Important Notes:
* The larger neural networks take significant amounts of memory to train. You will have to **run on Google [colab](http://colab.research.google.com/github/BerkeleyML/cs189-notebooks/hw13/prob4/prob4.ipynb) or your local machine for this notebook. Datahub will not work.**
* In order to prevent duplicate images the animation generator and display code has to be broken into two cells. **You must re-run the upper cell before re-running the lower cell to get a new animation.**
* Feel free to change the number of training samples as you work through the cells and comment on how the behavior changes with more or fewer samples.
* You can also try switching between `nn.ReLU`, `nn.Tanh`, and any other activations you want to try. The only exception is the infinite-width deterministic kernel, for which we've only provided you with the ReLU activation version.
* On a recent MacBook Pro, most cells take around 1 minute to run and the multiple networks cell takes 3-4 minutes. You may want to reduce the number of training steps while debugging your NTK implementation for faster iteration. Don't forget to reset the number of steps for your final submission.


### Import and Helpers


In [None]:
import copy
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from typing import List
from IPython.display import HTML
from matplotlib.patches import Ellipse
# Notebook-wide Matplotlib defaults: figure size, font size and bold axis labels.
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams['font.size'] = 16
plt.rcParams['animation.embed_limit'] = 50  # 50 MB for longer animations
plt.rcParams['axes.labelweight'] = 'bold'
# Shared palette; the plotting cells index it by network position so that
# artists belonging to the same network share a color.
colors = list(plt.cm.tab10.colors)
# Seed NumPy (used by gen_dataset) and torch (used for weight initialization)
# so that reruns from a fresh kernel are reproducible.
np.random.seed(1234)
# The trailing semicolon keeps the notebook from echoing the returned value.
torch.manual_seed(1234);


In [None]:
def to_tensor(x):
    """Wrap a NumPy array as a torch tensor and cast it to float32."""
    tensor = torch.from_numpy(x)
    return tensor.float()


def to_numpy(x):
    """Return the values of tensor `x` as a NumPy array.

    The tensor is first detached from the autograd graph and moved to the CPU,
    so tensors that require gradients can be converted as well.
    """
    detached = x.detach()
    return detached.cpu().numpy()


# Dataset: N (x, y) pairs with y = sin(pi * x)
def gen_dataset(N):
    """Draw N random training points from one period of a sine wave.

    Inputs are sampled from [-1, 1) with np.random.rand and augmented with a
    constant column of ones, so each feature row is (x, 1).

    Returns:
        features: NumPy array of shape (N, 2) holding the (x, 1) rows.
        targets: NumPy array of shape (N,) holding sin(pi * x).
        features_t: `features` converted with to_tensor.
        targets_t: `targets` converted with to_tensor and reshaped to (N, 1).
    """
    samples = np.random.rand(N) * 2 - 1
    features = np.vstack([samples, np.ones(N)]).T
    targets = np.sin(np.pi * features[:, 0])
    features_t = to_tensor(features)
    targets_t = to_tensor(targets).reshape(-1, 1)
    return features, targets, features_t, targets_t


# Full-batch gradient descent on the mean squared error
def train_gd(ntk, x, y, epochs, lr):
    """Train `ntk` in place for `epochs` gradient steps and return the losses.

    Each step evaluates the mean squared error between `y` and `ntk(x)` on the
    whole batch and applies one torch.optim.SGD update with learning rate `lr`.

    Returns:
        NumPy float array of length `epochs`; entry e is the loss measured
        before the e-th parameter update.
    """
    optimizer = torch.optim.SGD(ntk.parameters(), lr=lr)
    history = []
    for _ in range(epochs):
        optimizer.zero_grad()
        residual = y - ntk(x)
        mse = (residual ** 2).mean()
        mse.backward()
        optimizer.step()
        history.append(mse.item())
    return np.array(history, dtype=float)


def infinite_kernel(X, depth):
    """
    Calculate the infinite-width NTK for a network with 'depth' layers
    and ReLU nonlinearities.

    X holds one input per row; the result is the (n, n) kernel matrix over
    those rows taken after the last of the `depth` recursion steps.

    Adapted from https://github.com/LeoYu/neural-tangent-kernel-UCI.
    See https://arxiv.org/pdf/1904.11955.pdf for a full derivation.
    """
    n = X.shape[0]
    kernels = np.zeros((depth, n, n))
    sigma = X @ X.T                  # covariance term, starts as the input Gram matrix
    theta = np.zeros_like(sigma)     # running tangent-kernel accumulator
    for layer in range(depth):
        theta += sigma
        kernels[layer] = theta
        # Normalize the covariance to cosines; both clips guard the division
        # and the arccos below against round-off.
        diag = np.diag(sigma)
        norms = np.clip(np.sqrt(np.outer(diag, diag)), a_min=1e-9, a_max=None)
        cosines = np.clip(sigma / norms, a_min=-1, a_max=1)
        angle_term = np.pi - np.arccos(cosines)
        # Both updates use the pre-update cosines, so sigma is replaced first
        # only because theta does not read it again in this iteration.
        sigma = (cosines * angle_term + np.sqrt(1.0 - cosines * cosines)) * norms / 2.0 / np.pi
        theta = theta * angle_term / 2.0 / np.pi
    return kernels[depth - 1]


# (h.1) NTK Implementation

Since we expect the change in parameters during training to be small for sufficiently large networks, we can approximate the gradient flow during training with a 1st order Taylor expansion. Letting $w$ be a vector of all the trainable parameters in our network, we have

$$ y(\mathbf{w}) = y(\mathbf{w}_0) + \nabla_{\mathbf{w}_0} f(x; \mathbf{w}_0) (\mathbf{w} - \mathbf{w}_0). $$

Using this linearized version of the network we can then learn $\mathbf{\beta} = (\mathbf{w} - \mathbf{w}_0)$ using the linear regression or classification method of our choice. For this problem we will use the min-norm least squares regression solution, with $\Phi(x) = \nabla_\mathbf{w} f(x; \mathbf{w}_0)$ as the kernel featurization and $y(\mathbf{w}) - y(\mathbf{w}_0)$ as the target.

In the cell below we have given you the infrastructure for a neural network and its neural tangent kernel implementation. Note that this network includes a scaling parameter $\alpha$ which allows smaller changes in the parameters $\mathbf{w}$ to produce similar changes in the network function $f(x; \mathbf{w})$. This helps smaller networks behave similarly to the NTK. You can see how $\alpha$ is used in the `forward` method of the `NTK` class. You will also need to use the `self.layers0` member which contains a copy of the network parameters at initialization.

**Implement the following:**
* **The kernel prediction $\hat{y} = y(\mathbf{w}_0) + \nabla_{\mathbf{w}_0} f(x; \mathbf{w}_0) (\mathbf{w} - \mathbf{w}_0)$**
* **The kernel Gram matrix $K = \Phi(x) \Phi(x)^T$**
* **The min-norm least squares solution for $\mathbf{\beta} = (\mathbf{w} - \mathbf{w}_0)$**

This is the only coding you will need to do for this problem.


In [None]:
# Linear layer whose weights and biases are drawn from N(0, 1). The forward pass
# divides the weights by sqrt(in_features / 2) so the output variance stays
# constant as the width grows, and multiplies the biases by `beta`.
class NTKLinearLayer(nn.Linear):
    def __init__(self, in_features, out_features, bias=False, beta=0.1):
        """Create the layer and immediately redraw its parameters from N(0, 1).

        beta: multiplier applied to the bias inside `forward`.
        """
        self.beta = beta
        super().__init__(in_features, out_features, bias=bias)
        self.reinitialize()

    def reinitialize(self):
        """Resample the weight (and the bias, if present) from N(0, 1)."""
        to_reset = [self.weight]
        if self.bias is not None:
            to_reset.append(self.bias)
        for param in to_reset:
            nn.init.normal_(param, mean=0, std=1)

    def forward(self, x):
        """Apply the layer using the rescaled weight and the beta-scaled bias."""
        scaled_weight = self.weight / np.sqrt(self.in_features / 2)
        if self.bias is None:
            scaled_bias = None
        else:
            scaled_bias = self.bias * self.beta
        return F.linear(x, scaled_weight, scaled_bias)

    def extra_repr(self):
        """Describe the layer's configuration for printing."""
        template = 'in_features={}, out_features={}, bias={}, beta={}'
        has_bias = self.bias is not None
        return template.format(self.in_features, self.out_features, has_bias, self.beta)


class NTK(nn.Module):
    """
    Fully connected network plus the bookkeeping needed to linearize it
    around its initial weights (the neural tangent kernel view).

    `layers` is the network that gets trained; `layers0` has the same
    architecture and is loaded with the weights `layers` had at
    initialization. `kernel`, `Phi` and `beta` stay None until
    `compute_kernel_mnls` has been run.
    """
    def __init__(self, hidden_layers: List[int], activation, alpha=1, bias_beta=0.1):
        """
        hidden_layers: list of ints specifying layer widths
        activation: an activation from nn.{ReLU,Tanh,...} to use as the hidden layer activation
        alpha: scaling on the output
        bias_beta: scaling on biases to reduce their influence during training
        """
        super(NTK, self).__init__()
        self.alpha = alpha
        self.bias_beta = bias_beta
        self.hidden_layers = hidden_layers
        # Two input features (see gen_dataset: x and a constant 1) and one output.
        layer_sizes = [2] + hidden_layers + [1]
        # NOTE(review): every layer is created with bias=False, so bias_beta is
        # stored on the layers but has no effect on the forward pass as written.
        self.layers = nn.Sequential(
            NTKLinearLayer(layer_sizes[0], layer_sizes[1], bias=False, beta=bias_beta),
            *[layer for i in range(1, len(layer_sizes)-1)
              for layer in (activation(),
                            NTKLinearLayer(layer_sizes[i], layer_sizes[i+1], bias=False, beta=bias_beta))
            ]
        )
        # Save a copy of the initial network
        self.layers0 = nn.Sequential(
            NTKLinearLayer(layer_sizes[0], layer_sizes[1], bias=False, beta=bias_beta),
            *[layer for i in range(1, len(layer_sizes)-1)
              for layer in (activation(),
                            NTKLinearLayer(layer_sizes[i], layer_sizes[i+1], bias=False, beta=bias_beta))
            ]
        )
        # layers0 is built with its own random draw and then overwritten with
        # the weights of `layers`.
        self.layers0.load_state_dict(self.layers.state_dict())
        # Flattened snapshot of the initial parameters, used by norm_diff_relative.
        self.w0 = nn.utils.parameters_to_vector(self.parameters()).detach()
        self.kernel = None
        self.Phi = None
        self.beta = None

    def forward(self, x):
        """Evaluate the current (trainable) network, scaled by alpha."""
        return self.alpha * self.layers(x)

    def forward_kernel(self, x):
        """
        Predict with the linearized (kernel) model at the inputs x.

        Requires compute_kernel_mnls to have been called with targets first.
        The return statement is left to the fill-in region below.
        """
        assert self.kernel is not None, "Kernel must be initialized before calling forward_kernel"
        assert self.beta is not None, "MNLS solution must be found before calling forward_kernel"
        # Get features for test x
        out = self(x)
        m, _ = out.shape
        # Row i of phi is the gradient of output i with respect to all
        # parameters returned by self.parameters(), flattened and concatenated.
        phi = torch.zeros(m, self.nparams(), requires_grad=False)
        for i in range(m):
            # Clear gradients left over from the previous output before backprop.
            self.zero_grad()
            # retain_graph: the same forward graph is reused for every output row.
            out[i].backward(retain_graph=True)
            p_grad = torch.tensor([], requires_grad=False)
            for p in self.parameters():
                p_grad = torch.cat((p_grad, p.grad.reshape(-1)))
            phi[i, :] = p_grad
        with torch.no_grad():
            # TODO: Compute and return the kernel prediction y_hat
            ### start forward_kernel ###

            ### end forward_kernel ###

    def nparams(self):
        """Count the weight and bias entries of all linear layers in `layers`."""
        nparams = 0
        for l in self.layers:
            if isinstance(l, nn.Linear):
                nparams += l.weight.numel()
                if l.bias is not None:
                    nparams += l.bias.numel()
        return nparams

    def reinitialize(self):
        """
        Redraw the weights of `layers`, refresh the `w0` / `layers0` snapshots
        to match, and discard any previously computed kernel quantities.
        """
        for l in self.layers:
            if isinstance(l, nn.Linear):
                l.reinitialize()
        self.w0 = nn.utils.parameters_to_vector(self.parameters()).detach()
        self.layers0.load_state_dict(self.layers.state_dict())
        self.kernel = None
        self.Phi = None
        self.beta = None

    def compute_kernel_mnls(self, X, y=None):
        """
        Calculate the neural tangent kernel of the model on the inputs
        and the min-norm least squares coefficients for this kernel (if
        y is provided).

        Stores the feature matrix in self.Phi, the Gram matrix in self.kernel
        and (when y is given) the coefficients in self.beta. The local names
        `K` and `beta` assigned to those attributes are expected to be defined
        by the fill-in regions below.
        """
        # Forward pass on X so we can get gradients
        out = self(X)
        p = self.nparams()
        n, outdim = out.shape
        assert outdim == 1, "Output dimension must be 1"

        # This is the transposed Jacobian (grad y(w))^T: one row per input,
        # one column per parameter.
        Phi = torch.zeros(n, p, requires_grad=False)
        for i in range(n):
            # Find gradient vector induced by this data point
            self.zero_grad()
            out[i].backward(retain_graph=True)
            p_grad = torch.tensor([], requires_grad=False)
            # NOTE(review): this loop rebinds the local `p` (the parameter count
            # above) to a parameter tensor; code after the loop must not rely on
            # `p` still being the count.
            for p in self.layers.parameters():
                p_grad = torch.cat((p_grad, p.grad.reshape(-1)))
            Phi[i, :] = p_grad

        self.Phi = Phi
        # TODO: Compute the tangent kernel Gram matrix K
        ### start compute_gram ###

        ### end compute_gram ###
        self.kernel = K

        # If y is provided, compute the MNLS weights
        if y is not None:
            # TODO: Compute the MNLS regression weights for the neural tangent kernel
            # Hint: Be sure to properly account for y(w0)
            ### start beta ###

            ### end beta ###
            self.beta = beta

    def norm_diff_relative(self):
        """Return ||w - w0|| / ||w0|| for the current parameters w (as a tensor)."""
        w = nn.utils.parameters_to_vector(self.parameters())
        return torch.norm(w - self.w0) / torch.norm(self.w0)

    def parameters(self):
        """
        Expose only the parameters of `layers`, so that callers of
        parameters() (optimizers, gradient collection, the w0 snapshot) never
        see the initial-weight copy held in `layers0`.
        """
        return self.layers.parameters()


# (h.2) Evolution of Hidden Weights

The plots below demonstrate the behavior of increasingly large networks during training with gradient descent.

The first animation visualizes the weight matrix between the hidden layers as training progresses. You should see that the smaller networks' weights vary more widely, while the larger networks look almost constant.


In [None]:
%%capture
x, y, x_t, y_t = gen_dataset(5)
ntks = [NTK([h,h], nn.ReLU, alpha=1) for h in [5, 10, 50, 100]]
fig, axes = plt.subplots(2,2,figsize=[16,16]);
imgs = np.array([[axes[j,i].matshow(
    ntks[i+j*2].layers[2].weight.detach(), vmin=-1.5, vmax=1.5)
                  for j in range(2)] for i in range(2)])

fig.subplots_adjust(right=0.85)
cbar_ax = fig.add_axes([0.9, 0.15, 0.025, 0.7])
fig.colorbar(imgs[0][0], cax=cbar_ax)

def animate(i, epochs=1):
    for i in range(2):
        for j in range(2):
            train_gd(ntks[i+j*2], x_t, y_t, epochs=epochs, lr=1)
            imgs[i,j].set_array(ntks[i+j*2].layers[2].weight.detach())
    return imgs.flatten()

ani = animation.FuncAnimation(
    fig, animate, fargs=(10,), interval=30, blit=True, save_count=100);


In [None]:
t0 = time.time()
# Rendering calls `animate` for every saved frame, so this is where the
# training happens and what the timer measures.
ani_html = ani.to_jshtml()
t1 = time.time()
print("Rendered in {} minutes {:.1f} seconds".format(int(np.floor((t1 - t0) / 60)), (t1 - t0) % 60))
# A notebook cell only displays the value of its final expression, so the
# player has to be the last statement; mid-cell it would be built and dropped.
HTML(ani_html)


## Parameter Change Relative Norm

The plots below show the relative size of the change in parameters during training, along with the training loss.

In the space provided, **comment on the relationship between the relative change in weights and the training loss. Compare across the network sizes and across any stages you observe during training for a particular network.**


In [None]:
# Fresh data and one two-hidden-layer network per width.
x, y, x_t, y_t = gen_dataset(5)
updates = 2000
epochs_per_update = 1
sizes = [5, 10, 50, 100]
ntks = [NTK([h,h], nn.ReLU, alpha=1) for h in sizes]
# Row 0 of norm_diffs is left at zero: it is the state before any update.
norm_diffs = np.zeros((updates+1, len(ntks)))
train_losses = np.zeros((updates, len(ntks)))

t0 = time.time()
for u in range(updates):
    # All networks are advanced together so their curves share one epoch axis.
    for n, ntk in enumerate(ntks):
        losses = train_gd(ntk, x_t, y_t, epochs=epochs_per_update, lr=.02)
        train_losses[u, n] = losses[-1]
        norm_diffs[u+1, n] = ntk.norm_diff_relative()
t1 = time.time()
elapsed = t1 - t0
print("Trained in {} minutes {:.1f} seconds".format(int(np.floor(elapsed / 60)), elapsed % 60))

plt.figure(figsize=[10,12])

# Top panel: relative parameter change, one curve per hidden size.
plt.subplot(2,1,1)
norm_epochs = np.arange(updates+1) * epochs_per_update
for size, curve in zip(sizes, norm_diffs.T):
    plt.plot(norm_epochs, curve, label="hidden_size = {}".format(size))
plt.legend()
plt.xlabel("Epochs")
plt.ylabel(r"$\frac{\|w - w_0\|}{\|w_0\|}$", fontsize=24, fontweight='bold')

# Bottom panel: training loss over the same epochs.
plt.subplot(2,1,2)
loss_epochs = np.arange(updates) * epochs_per_update
for size, curve in zip(sizes, train_losses.T):
    plt.plot(loss_epochs, curve, label="hidden_size = {}".format(size))
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Train Loss");


In [None]:
### start relative_norm_sol ###

### end relative_norm_sol ###


# (h.3) Single Net GD Training vs NTK Least Squares Regression

The animation below shows the NTK regression function and the training process of gradient descent for a relatively large network. In the space provided, **comment on the difference between the NTK regression function and the function learned by gradient descent once it is mostly converged.**


In [None]:
%%capture
N = 5
x, y, x_t, y_t = gen_dataset(N)
x_test = np.vstack([np.linspace(-1,1,100), np.ones(100)]).T
x_test_t = to_tensor(x_test)
# Initialize network and finite width kernel
ntk = NTK([500,500,500], nn.ReLU, alpha=1)
ntk.compute_kernel_mnls(x_t, y_t)
# Calculate and use infinite width kernel
Kinf = infinite_kernel(np.vstack([x, x_test]), 4)
Ktrain = Kinf[:N, :N]
Ktest_train = Kinf[N:, :N]
y0 = to_numpy(ntk(x_t))
yinf = to_numpy(ntk(x_test_t)) + Ktest_train @ np.linalg.pinv(Ktrain, hermitian=True, rcond=1e-9) @ (y.reshape(-1, 1) - y0)

# Plotting
fig, ax = plt.subplots()
plt.plot(x_test[:,0], to_numpy(ntk.forward_kernel(x_test_t)), linewidth=2, label="Kernel MNLS")
plt.plot(x_test[:,0], yinf, linewidth=2, label="$\infty$ Width Kernel MNLS", c='k')
line, = plt.plot(x_test[:,0], to_numpy(ntk(x_test_t)), label="GD")
plt.scatter(x[:,0], y, label="Training Points")
plt.legend()
plt.xlim([-1, 1])
plt.ylim(-1.5, 1.5)
plt.xlabel("$x$")
plt.ylabel("$y$")

def animate(i):
    if i == 0:
        return line,
    train_gd(ntk, x_t, y_t, epochs=i, lr=.01)
    line.set_ydata(to_numpy(ntk(x_test_t)))
    return line,

ani = animation.FuncAnimation(
    fig, animate, interval=30, blit=True, save_count=200)


In [None]:
t0 = time.time()
# Rendering calls `animate` for every saved frame, so this is where the
# training happens and what the timer measures.
ani_html = ani.to_jshtml()
t1 = time.time()
print("Rendered in {} minutes {:.1f} seconds".format(int(np.floor((t1 - t0) / 60)), (t1 - t0) % 60))
# A notebook cell only displays the value of its final expression, so the
# player has to be the last statement; mid-cell it would be built and dropped.
HTML(ani_html)


In [None]:
### start single_net_gd_ntk ###

### end single_net_gd_ntk ###


# (h.4) Multiple Net GD Training vs NTK Least Squares Regression

The animation below shows the NTK regression function and the training process of gradient descent for many relatively large networks. In the space provided, **comment on the variance in learned functions between the NTK function, between the GD trained functions, and across the two groups. Compare them to the infinite width deterministic kernel. Try running more than once to see if the behavior is consistent.**

Dotted lines represent the NTK regression functions. Solid lines represent the GD trained regression functions.


In [None]:
%%capture
N = 5
x, y, x_t, y_t = gen_dataset(N)
n_ntks = 10
x_test = np.vstack([np.linspace(-1,1,100), np.ones(100)]).T
x_test_t = to_tensor(x_test)

fig, ax = plt.subplots()
ntks = [NTK([100,100,100], nn.ReLU, alpha=1) for _ in range(n_ntks)]
label = True
ntk_line = None
for i, ntk in enumerate(ntks):
    ntk.compute_kernel_mnls(x_t, y_t)
    if label:
        ntk_line, = ax.plot(x_test[:,0], to_numpy(ntk.forward_kernel(x_test_t)), ':',
                            label="NTK regression", c=colors[i])
        label = False
    else:
        ax.plot(x_test[:,0], to_numpy(ntk.forward_kernel(x_test_t)), ':', c=colors[i])

# Calculate and use infinite width kernel
Kinf = infinite_kernel(np.vstack([x, x_test]), 4)
Ktrain = Kinf[:N, :N]
Ktest_train = Kinf[N:, :N]
y0 = to_numpy(ntk(x_t))
yinf = to_numpy(ntk(x_test_t)) + Ktest_train @ np.linalg.pinv(Ktrain, hermitian=True, rcond=1e-9) @ (y.reshape(-1, 1) - y0)
linf, = plt.plot(x_test[:,0], yinf, linewidth=2, label="$\infty$ Width Kernel MNLS", c='k')

lines = []
for i in range(n_ntks):
    if i == 0:
        line, = ax.plot(x_test[:,0], to_numpy(ntks[i](x_test_t)), label="GD regression", c=colors[i])
    else:
        line, = ax.plot(x_test[:,0], to_numpy(ntks[i](x_test_t)), c=colors[i])
    lines.append(line)
sc = ax.scatter(x[:,0], y, label="Training Points")
plt.xlim([-1, 1])
plt.ylim(-1.5, 1.5)
plt.xlabel("$x$")
plt.ylabel("$y$")
plt.legend(handles=[ntk_line, lines[0], linf, sc])

def animate(i):
    if i > 0:
        for ntk in ntks:
            train_gd(ntk, x_t, y_t, epochs=i, lr=.01)
    for i in range(len(lines)):
        lines[i].set_ydata(to_numpy(ntks[i](x_test_t)))
    return lines

ani = animation.FuncAnimation(
    fig, animate, interval=50, blit=True, save_count=200);


In [None]:
t0 = time.time()
# Rendering calls `animate` for every saved frame, so this is where the
# training happens and what the timer measures.
ani_html = ani.to_jshtml()
t1 = time.time()
print("Rendered in {} minutes {:.1f} seconds".format(int(np.floor((t1 - t0) / 60)), (t1 - t0) % 60))
# A notebook cell only displays the value of its final expression, so the
# player has to be the last statement; mid-cell it would be built and dropped.
HTML(ani_html)


In [None]:
### start mult_net_gd_ntk ###

### end mult_net_gd_ntk ###


# (h.5) Kernel Ellipses and Convergence

For the animation below we will work with only 2 training samples, which allows us to easily visualize the $\mathbb{R}^{2\times2}$ kernel Gram matrix. The animation below shows an ellipse determined by the eigenvalues and eigenvectors of the kernel matrix for the network _determined at that point in training_ and a history of the predictions during training. The axes are $\hat{y}_1$ and $\hat{y}_2$, the predictions given by the network for the two training points $x_1$ and $x_2$. The ellipses are centered at $(y_1, y_2)$ to show the desired outputs of the network. The colors of the ellipses are matched to the output history scatter points.

In the space provided, **comment on the relationship between the convergence behavior of $\hat{y}_1$ and $\hat{y}_2$ and the kernel ellipses. Also comment on the stability of the kernel matrix during training for larger  and smaller networks.**


In [None]:
def kernel_to_angle(A):
    """Turn a symmetric 2x2 matrix into ellipse parameters.

    Returns (1/l0, 1/l1, angle), where l0 <= l1 are the eigenvalues of A as
    returned by np.linalg.eigh and `angle` is the direction, in degrees, of
    the eigenvector that belongs to l0.
    """
    eigvals, eigvecs = np.linalg.eigh(A)
    # eigh stores eigenvectors as COLUMNS: eigvecs[:, k] pairs with eigvals[k].
    # Taking row 0 instead mixes components of both eigenvectors.
    v0 = eigvecs[:, 0]
    ang = np.arctan2(v0[1], v0[0]) * 180/np.pi
    return 1/eigvals[0], 1/eigvals[1], ang


def update_ellipse(e, w, h, ang):
    """Overwrite the width, height and angle attributes of `e` in place."""
    e.width, e.height, e.angle = w, h, ang


In [None]:
%%capture
x, y, x_t, y_t = gen_dataset(2)
print(x,y)

updates = 150
center = y_t
c = 1
fig, ax = plt.subplots()

sizes = [5,10,50,100,500]
ntks = [NTK([h, h, h], nn.ReLU, alpha=1) for h in sizes]
ellipses = []
yhats = []
scs = []
for i, ntk in enumerate(ntks):
    ntk.compute_kernel_mnls(x_t, y_t)
    K = ntk.kernel
    w, h, angle = kernel_to_angle(K)
    ell = Ellipse(center, w*c, h*c, angle, fill=False, color=colors[i], linewidth=3, alpha=0.7)
    ellipses.append(ell)
    ax.add_patch(ell)

    yhat = torch.zeros((updates, 2))
    yhat[0, :] = ntk(x_t).detach().T
    yhat[:, 0] = yhat[0, 0]
    yhat[:, 1] = yhat[0, 1]
    yhats.append(yhat)
    sc = ax.scatter(yhat[:,0], yhat[:,1], color=colors[i], label='Size {}'.format(sizes[i]))
    scs.append(sc)

ax.set_xlim([-2, 2])
ax.set_ylim([-2, 2])
ax.legend(loc='upper left');
ax.set_xlabel("$\hat{y}_1$")
ax.set_ylabel("$\hat{y}_2$")

def animate(i, epochs):
    if i == 0:
        return scs + ellipses
    for n, ntk in enumerate(ntks):
        train_gd(ntk, x_t, y_t, epochs=epochs, lr=.05)
        ntk.compute_kernel_mnls(x_t, y_t)
        K = ntk.kernel
        w, h, a = kernel_to_angle(K)
        update_ellipse(ellipses[n], w*c, h*c, a)
        yhats[n][i, :] = ntk(x_t).detach().T
        scs[n].set_offsets(yhats[n])
    return scs + ellipses


ani = animation.FuncAnimation(
    fig, animate, fargs=(1,), interval=50, blit=True, save_count=updates);


In [None]:
t0 = time.time()
# Rendering calls `animate` for every saved frame, so this is where the
# training happens and what the timer measures.
ani_html = ani.to_jshtml()
t1 = time.time()
print("Rendered in {} minutes {:.1f} seconds".format(int(np.floor((t1 - t0) / 60)), (t1 - t0) % 60))
# A notebook cell only displays the value of its final expression, so the
# player has to be the last statement; mid-cell it would be built and dropped.
HTML(ani_html)


In [None]:
### start ellipse_convergence ###

### end ellipse_convergence ###


# (h.6) Infinite Width Deterministic Kernel

It turns out that as the width of the hidden layers goes to infinity, the neural tangent kernel becomes deterministic. We can calculate it based on the number of hidden layers and the activation function, as you can see in the function `infinite_kernel` defined in the helper cell at the top of this notebook.

The following animation demonstrates the convergence of kernels from networks of increasing size, shown on the left, to the deterministic infinite width kernel on the right. In the space provided, **comment on the differences between the kernel for small networks and the infinite width kernel, and how they converge.**


In [None]:
%%capture
x, y, x_t, y_t = gen_dataset(5)

Kinf = infinite_kernel(x, 3)
Kmin = np.min(Kinf)
Kmax = np.max(Kinf)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))
img_inf = axes[1].imshow(Kinf, vmin=Kmin, vmax=Kmax)
axes[1].set_title("Infinite-width Kernel")

fig.subplots_adjust(right=0.85)
cbar_ax = fig.add_axes([0.9, 0.15, 0.025, 0.7])
fig.colorbar(img_inf, cax=cbar_ax)

def animate(i, widths):
    d = widths[i]
    ntk = NTK([d,d], nn.ReLU, alpha=1)
    ntk.compute_kernel_mnls(x_t, y_t)
    img = axes[0].imshow(ntk.kernel / 2 ** 2, vmin=Kmin, vmax=Kmax)
    title = axes[0].set_title("Width={} Kernel".format(d))
    return img, title

widths = [10,50,100,500,1000]
ani = animation.FuncAnimation(
    fig, animate, fargs=(widths,), interval=500, blit=True, save_count=len(widths));


In [None]:
t0 = time.time()
# Rendering calls `animate` for every saved frame, which builds a network and
# its kernel per width; that is what the timer measures.
ani_html = ani.to_jshtml()
t1 = time.time()
print("Rendered in {} minutes {:.1f} seconds".format(int(np.floor((t1 - t0) / 60)), (t1 - t0) % 60))
# A notebook cell only displays the value of its final expression, so the
# player has to be the last statement; mid-cell it would be built and dropped.
HTML(ani_html)


In [None]:
### start infinite_width ###

### end infinite_width ###


# (h.7) Eigenvalues of Kernel Gram Matrices

**Compare the eigenvalues and conditioning of the Gram matrices across the varying widths.**


In [None]:
x, y, x_t, y_t = gen_dataset(25)

# Eigenvalue spectrum (largest first) of the finite-width kernel for each width.
widths = [10,50,100,500,1000]
for d in widths:
    ntk = NTK([d]*3, nn.ReLU, alpha=1)
    ntk.compute_kernel_mnls(x_t, y_t)
    # The finite-width kernel is divided by 2**3 before being compared with Kinf.
    eigs, _ = np.linalg.eigh(ntk.kernel / 2**3)
    plt.plot(sorted(eigs)[::-1], 'o-', label='d=%d' % d)

# Reference spectrum of the deterministic infinite-width kernel.
Kinf = infinite_kernel(x, 4)
eigs, _ = np.linalg.eigh(Kinf)
# Raw string: "\i" is not a valid escape sequence in an ordinary string literal.
plt.plot(sorted(eigs)[::-1], 'o--', label=r'd=$\infty$', linewidth=3, c='k')

plt.legend()
plt.yscale('log')
plt.show();


In [None]:
### start conditioning ###

### end conditioning ###
