In this notebook we test our implementation of bayesian regression. We build a gradient-descent version and compare it to the sk-learn result to see if it works.

In [1]:
# General imports
import numpy as np
import torch

from deepymod.data import Dataset
from deepymod.data.burgers import BurgersDelta
from sklearn.linear_model import BayesianRidge

# Making data

In [32]:
# Making dataset
v = 0.1
A = 1.0

x = np.linspace(-3, 4, 100)
t = np.linspace(0.5, 5.0, 50)
x_grid, t_grid = np.meshgrid(x, t, indexing='ij')
dataset = Dataset(BurgersDelta, v=v, A=A)

y = dataset.time_deriv(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1)) # observations
X = dataset.library(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1), poly_order=2, deriv_order=3) # covariates

print(y.shape, X.shape)

(5000, 1) (5000, 12)


In [33]:
y += np.std(y) * 0.1 * np.random.randn(*y.shape)

# Baseline

In [34]:
regressor = BayesianRidge(fit_intercept=False, compute_score=True, alpha_1=0, alpha_2=0, lambda_1=0, lambda_2=0)
regressor.fit(X, y.squeeze())

BayesianRidge(alpha_1=0, alpha_2=0, compute_score=True, fit_intercept=False,
              lambda_1=0, lambda_2=0)

In [35]:
baseline_coeffs = regressor.coef_[:, None]
print(baseline_coeffs)

[[ 9.25461693e-05]
 [-1.25340470e-05]
 [ 9.71209677e-02]
 [-3.28153305e-04]
 [-9.23843579e-03]
 [-9.99175470e-01]
 [ 1.63508980e-02]
 [ 8.37122857e-05]
 [ 2.98737937e-02]
 [-1.68085495e-02]
 [-9.96449651e-03]
 [ 1.80301488e-04]]


In [36]:
baseline_noise_precision = regressor.alpha_
print(baseline_noise_precision)

3834.331271382523


In [37]:
baseline_prior_precision = regressor.lambda_
print(baseline_prior_precision)

11.876898543214669


In [38]:
regressor.scores_

array([ 4466.14650269, 13465.67622217, 13465.68313968, 13465.68315486])

# Own implementation

In [93]:
X = torch.tensor(X)
y = torch.tensor(y)

  X = torch.tensor(X)
  y = torch.tensor(y)


In [94]:
alpha_ = torch.tensor(baseline_noise_precision)
lambda_ = torch.tensor(baseline_prior_precision)

N = X.shape[0]
M = X.shape[1]

In [95]:
Sn = torch.inverse(lambda_ * torch.eye(M) + alpha_ * X.T @ X)
mn = alpha_ * Sn @ X.T @ y

In [96]:
print(mn)

tensor([[-1.8028e-15],
        [-5.9827e-14],
        [ 1.0000e-01],
        [ 2.4078e-15],
        [ 1.1935e-14],
        [-1.0000e+00],
        [-1.2396e-13],
        [ 3.3003e-15],
        [-7.8382e-14],
        [-4.2205e-13],
        [ 1.0732e-13],
        [-7.6553e-15]], dtype=torch.float64)


This matches the SK learn values, so thats correct. Now to calculate the neg LL

In [119]:
%%time
mu_post = X @ mn
sigma_post = alpha_ * torch.eye(N) + X @ Sn @ X.T 
L = torch.inverse(sigma_post)

CPU times: user 18.9 s, sys: 804 ms, total: 19.7 s
Wall time: 1.23 s


In [101]:
log_p = 1/2 * (-torch.trace(torch.log(L)) - (y - mu_post).T @ L @ (y - mu_post) - N * np.log(2*np.pi))

In [102]:
print(log_p)

tensor([[158446.8950]], dtype=torch.float64)


Seems close enough to the sklearn implementation, although we can make many things much more efficient (e.g. use woodbury inversion etc). Now let's try and do gradient descent:

In [106]:
alpha_ = torch.nn.Parameter(1/torch.var(y))
lambda_ = torch.nn.Parameter(torch.ones(1))

X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]

  X = torch.tensor(X)
  y = torch.tensor(y)


In [117]:
optimizer = torch.optim.Adam([alpha_, lambda_], lr=1.0)
max_epochs = 1e4

In [118]:
for epoch in torch.arange(max_epochs):
    Sn = torch.inverse(lambda_ * torch.eye(M) + alpha_ * X.T @ X)
    mn = alpha_ * Sn @ X.T @ y
    
    mu_post = X @ mn
    sigma_post = alpha_ * torch.eye(N) + X @ Sn @ X.T 
    L = torch.inverse(sigma_post)

    log_p = 1/2 * (-torch.trace(torch.log(L)) - (y - mu_post).T @ L @ (y - mu_post) - N * np.log(2*np.pi))
    loss = -log_p
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(log_p)

tensor([[4565.8581]], dtype=torch.float64, grad_fn=<MulBackward0>)


KeyboardInterrupt: 

# Own implementation, efficient:

In [130]:
alpha_ = torch.tensor(baseline_noise_precision)
lambda_ = torch.tensor(baseline_prior_precision)

N = X.shape[0]
M = X.shape[1]

Sn = lambda_ * torch.eye(M) + alpha_ * X.T @ X
mn = alpha_ * torch.inverse(Sn) @ X.T @ y

In [133]:
E = alpha_ / 2 * torch.sum((y - X @ mn)**2) + lambda_ / 2 * mn.T @ mn
log_p = M / 2 * torch.log(lambda_) + N / 2 * torch.log(alpha_) - E - 1/2 * torch.trace(torch.log(Sn)) - N / 2 * np.log(2*np.pi)

In [134]:
alpha_ = torch.nn.Parameter(1/torch.var(y))
lambda_ = torch.nn.Parameter(torch.ones(1))

X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]

  X = torch.tensor(X)
  y = torch.tensor(y)


In [142]:
optimizer = torch.optim.Adam([alpha_, lambda_])
max_epochs = 1e5

In [143]:
for epoch in torch.arange(max_epochs):
    A = lambda_ * torch.eye(M) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ / 2 * torch.sum((y - X @ mn)**2) + lambda_ / 2 * mn.T @ mn
    log_p = M / 2 * torch.log(lambda_) + N / 2 * torch.log(alpha_) - E - 1/2 * torch.trace(torch.log(Sn)) - N / 2 * np.log(2*np.pi)
    loss = -log_p
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e4 == 0:
        print(log_p)

tensor([[18950.4037]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18952.1147]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18953.8237]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18955.5307]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18957.2377]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18958.9428]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18960.6479]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18962.3509]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18964.0522]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[18965.7534]], dtype=torch.float64, grad_fn=<SubBackward0>)


In [146]:
alpha_

Parameter containing:
tensor(14711.6165, dtype=torch.float64, requires_grad=True)

In [147]:
lambda_

Parameter containing:
tensor([11.9059], requires_grad=True)

Cool, it works. Now let's optimize for big and small values:

In [None]:
X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M))

In [None]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [None]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    alpha_ = torch.exp(a)
    
    A = lambda_ * torch.eye(M) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + lambda_ * mn.T @ mn
    loss = E + torch.sum(torch.log(torch.diag(A))) - (M * l + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

In [88]:
mn

tensor([[ 2.8449e-16],
        [ 1.0295e-13],
        [ 1.0000e-01],
        [-2.8892e-15],
        [ 9.5757e-15],
        [-1.0000e+00],
        [ 1.1977e-13],
        [-5.0975e-15],
        [ 4.0412e-14],
        [ 6.7740e-13],
        [-1.1495e-13],
        [ 1.0516e-14]], dtype=torch.float64, grad_fn=<MmBackward>)

In [89]:
lambda_

tensor([11.8797], grad_fn=<ExpBackward>)

In [90]:
alpha_

tensor(2.8472e+83, dtype=torch.float64, grad_fn=<ExpBackward>)

In [91]:
a

Parameter containing:
tensor(192.1610, dtype=torch.float64, requires_grad=True)

# Sparse bayesian learning

In [57]:
X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M, dtype=torch.float64))

  X = torch.tensor(X)
  y = torch.tensor(y)


In [58]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [60]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    alpha_ = torch.exp(a)
    
    A = torch.diag(lambda_) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = E + torch.sum(torch.log(torch.diag(A))) - (torch.sum(l) + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

tensor([[-18096.6639]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36230.7532]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36230.7534]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36230.8253]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36230.8216]], dtype=torch.float64, grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [87]:
X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M, dtype=torch.float64))
threshold = 1e4

  X = torch.tensor(X)
  y = torch.tensor(y)


In [88]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [96]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    alpha_ = torch.exp(a)
    
    A = torch.diag(lambda_) + alpha_ * X.T @ X
    mn = (lambda_ < threshold)[:, None] * (alpha_ * torch.inverse(A) @ X.T @ y)
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = E + torch.sum(torch.log(torch.diag(A)[lambda_ < threshold])) - (torch.sum(l[lambda_ < threshold]) + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

tensor([[-35959.0172]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36067.3525]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36066.7911]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36074.7667]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36068.2743]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36065.5063]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36067.4777]], dtype=torch.float64, grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [99]:
torch.diag(A)

tensor([19441732.2587,   403313.2337], dtype=torch.float64,
       grad_fn=<IndexBackward>)

In [97]:
mn

tensor([[-0.0000],
        [ 0.0000],
        [ 0.1001],
        [-0.0000],
        [ 0.0000],
        [-0.9970],
        [ 0.0000],
        [-0.0000],
        [ 0.0000],
        [-0.0000],
        [ 0.0000],
        [ 0.0000]], dtype=torch.float64, grad_fn=<MulBackward0>)

In [95]:
l[lambda_ < threshold]

tensor([4.6035, 0.0060], dtype=torch.float64, grad_fn=<IndexBackward>)