In this notebook we test our implementation of bayesian regression. We build a gradient-descent version and compare it to the sk-learn result to see if it works.

In [19]:
# General imports
import numpy as np
import torch

from deepymod.data import Dataset
from deepymod.data.burgers import BurgersDelta
from sklearn.linear_model import BayesianRidge, ARDRegression

# Making data

In [52]:
# Making dataset
v = 0.1
A = 1.0

x = np.linspace(-3, 4, 100)
t = np.linspace(0.5, 5.0, 50)
x_grid, t_grid = np.meshgrid(x, t, indexing='ij')
dataset = Dataset(BurgersDelta, v=v, A=A)

y = dataset.time_deriv(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1)) # observations
X = dataset.library(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1), poly_order=2, deriv_order=3) # covariates

print(y.shape, X.shape)

(5000, 1) (5000, 12)


In [53]:
y += np.std(y) * 0.1 * np.random.randn(*y.shape)

# Baseline

In [4]:
regressor = BayesianRidge(fit_intercept=False, compute_score=True, alpha_1=0, alpha_2=0, lambda_1=0, lambda_2=0)
regressor.fit(X, y.squeeze())

BayesianRidge(alpha_1=0, alpha_2=0, compute_score=True, fit_intercept=False,
              lambda_1=0, lambda_2=0)

In [5]:
baseline_coeffs = regressor.coef_[:, None]
print(baseline_coeffs)

[[ 3.09726874e-04]
 [-3.50663740e-03]
 [ 1.01144779e-01]
 [ 5.67446914e-05]
 [ 5.40631166e-03]
 [-9.85768085e-01]
 [-7.85455878e-03]
 [ 5.00818886e-04]
 [-2.22369909e-02]
 [ 6.77249951e-03]
 [ 3.22057228e-03]
 [-3.42670155e-04]]


In [6]:
baseline_noise_precision = regressor.alpha_
print(baseline_noise_precision)

3873.863701508867


In [7]:
baseline_prior_precision = regressor.lambda_
print(baseline_prior_precision)

12.200493341819628


In [8]:
regressor.scores_

array([ 4457.83679982, 13491.41861303, 13491.42612999, 13491.42614771])

# Own implementation

In [9]:
X = torch.tensor(X)
y = torch.tensor(y)

In [10]:
alpha_ = torch.tensor(baseline_noise_precision)
lambda_ = torch.tensor(baseline_prior_precision)

N = X.shape[0]
M = X.shape[1]

In [11]:
Sn = torch.inverse(lambda_ * torch.eye(M) + alpha_ * X.T @ X)
mn = alpha_ * Sn @ X.T @ y

In [12]:
print(mn)

tensor([[ 3.0973e-04],
        [-3.5066e-03],
        [ 1.0114e-01],
        [ 5.6745e-05],
        [ 5.4063e-03],
        [-9.8577e-01],
        [-7.8546e-03],
        [ 5.0082e-04],
        [-2.2237e-02],
        [ 6.7725e-03],
        [ 3.2206e-03],
        [-3.4267e-04]], dtype=torch.float64)


This matches the SK learn values, so thats correct. Now to calculate the neg LL

In [119]:
%%time
mu_post = X @ mn
sigma_post = alpha_ * torch.eye(N) + X @ Sn @ X.T 
L = torch.inverse(sigma_post)

CPU times: user 18.9 s, sys: 804 ms, total: 19.7 s
Wall time: 1.23 s


In [101]:
log_p = 1/2 * (-torch.trace(torch.log(L)) - (y - mu_post).T @ L @ (y - mu_post) - N * np.log(2*np.pi))

In [102]:
print(log_p)

tensor([[158446.8950]], dtype=torch.float64)


Seems close enough to the sklearn implementation, although we can make many things much more efficient (e.g. use woodbury inversion etc). Now let's try and do gradient descent:

In [106]:
alpha_ = torch.nn.Parameter(1/torch.var(y))
lambda_ = torch.nn.Parameter(torch.ones(1))

X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]

  X = torch.tensor(X)
  y = torch.tensor(y)


In [117]:
optimizer = torch.optim.Adam([alpha_, lambda_], lr=1.0)
max_epochs = 1e4

In [118]:
for epoch in torch.arange(max_epochs):
    Sn = torch.inverse(lambda_ * torch.eye(M) + alpha_ * X.T @ X)
    mn = alpha_ * Sn @ X.T @ y
    
    mu_post = X @ mn
    sigma_post = alpha_ * torch.eye(N) + X @ Sn @ X.T 
    L = torch.inverse(sigma_post)

    log_p = 1/2 * (-torch.trace(torch.log(L)) - (y - mu_post).T @ L @ (y - mu_post) - N * np.log(2*np.pi))
    loss = -log_p
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(log_p)

tensor([[4565.8581]], dtype=torch.float64, grad_fn=<MulBackward0>)


KeyboardInterrupt: 

# Own implementation, efficient:

In [13]:
alpha_ = torch.tensor(baseline_noise_precision)
lambda_ = torch.tensor(baseline_prior_precision)

N = X.shape[0]
M = X.shape[1]

Sn = lambda_ * torch.eye(M) + alpha_ * X.T @ X
mn = alpha_ * torch.inverse(Sn) @ X.T @ y

In [14]:
E = alpha_ / 2 * torch.sum((y - X @ mn)**2) + lambda_ / 2 * mn.T @ mn
log_p = M / 2 * torch.log(lambda_) + N / 2 * torch.log(alpha_) - E - 1/2 * torch.trace(torch.log(Sn)) - N / 2 * np.log(2*np.pi)

In [15]:
alpha_ = torch.nn.Parameter(1/torch.var(y))
lambda_ = torch.nn.Parameter(torch.ones(1))

X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]

  X = torch.tensor(X)
  y = torch.tensor(y)


In [16]:
optimizer = torch.optim.Adam([alpha_, lambda_])
max_epochs = 1e5

In [17]:
for epoch in torch.arange(max_epochs):
    A = lambda_ * torch.eye(M) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ / 2 * torch.sum((y - X @ mn)**2) + lambda_ / 2 * mn.T @ mn
    log_p = M / 2 * torch.log(lambda_) + N / 2 * torch.log(alpha_) - E - 1/2 * torch.trace(torch.log(Sn)) - N / 2 * np.log(2*np.pi)
    loss = -log_p
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e4 == 0:
        print(log_p)

tensor([[4417.6791]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[4987.1062]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[5443.0882]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[5826.9458]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[6158.6817]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[6450.5472]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[6710.9202]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[6945.7894]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[7159.5897]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[7355.6952]], dtype=torch.float64, grad_fn=<SubBackward0>)


In [18]:
alpha_

Parameter containing:
tensor(137.3406, dtype=torch.float64, requires_grad=True)

In [147]:
lambda_

Parameter containing:
tensor([11.9059], requires_grad=True)

Cool, it works. Now let's optimize for big and small values:

In [None]:
X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M))

In [None]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [None]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    alpha_ = torch.exp(a)
    
    A = lambda_ * torch.eye(M) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + lambda_ * mn.T @ mn
    loss = E + torch.sum(torch.log(torch.diag(A))) - (M * l + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

In [88]:
mn

tensor([[ 2.8449e-16],
        [ 1.0295e-13],
        [ 1.0000e-01],
        [-2.8892e-15],
        [ 9.5757e-15],
        [-1.0000e+00],
        [ 1.1977e-13],
        [-5.0975e-15],
        [ 4.0412e-14],
        [ 6.7740e-13],
        [-1.1495e-13],
        [ 1.0516e-14]], dtype=torch.float64, grad_fn=<MmBackward>)

In [89]:
lambda_

tensor([11.8797], grad_fn=<ExpBackward>)

In [90]:
alpha_

tensor(2.8472e+83, dtype=torch.float64, grad_fn=<ExpBackward>)

In [91]:
a

Parameter containing:
tensor(192.1610, dtype=torch.float64, requires_grad=True)

# Sparse bayesian learning

In [46]:
X = torch.tensor(X)
y = torch.tensor(y)

N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M, dtype=torch.float64))

  X = torch.tensor(X)
  y = torch.tensor(y)


In [47]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [48]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    alpha_ = torch.exp(a)
    
    A = torch.diag(lambda_) + alpha_ * X.T @ X
    mn = alpha_ * torch.inverse(A) @ X.T @ y
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = E + torch.sum(torch.log(torch.diag(A))) - (torch.sum(l) + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

tensor([[-18080.0447]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.6211]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.9372]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.9372]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.9372]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.9373]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36274.3618]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36225.7195]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.9367]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36281.7933]], dtype=torch.float64, grad_fn=<SubBackward0>)


In [55]:
X = torch.tensor(X) 
y = torch.tensor(y)

In [56]:
N = X.shape[0]
M = X.shape[1]


a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(M, dtype=torch.float64))
threshold = 1e4

In [57]:
optimizer = torch.optim.Adam([a, l], lr=1.0)
max_epochs = 1e4

In [58]:
for epoch in torch.arange(max_epochs):
    lambda_ = torch.exp(l)
    
    
    alpha_ = torch.exp(a)
    
    A = torch.diag(lambda_) + alpha_ * X.T @ X
    mn = (lambda_ < threshold)[:, None] * (alpha_ * torch.inverse(A) @ X.T @ y)
    
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = E + torch.sum(torch.log(torch.diag(A)[lambda_ < threshold])) - (torch.sum(l[lambda_ < threshold]) + N * a)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1e3 == 0:
        print(loss)

tensor([[-18077.0543]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6984]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6984]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6984]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6954]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36176.1362]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6983]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6984]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6966]], dtype=torch.float64, grad_fn=<SubBackward0>)
tensor([[-36199.6392]], dtype=torch.float64, grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [61]:
mn

tensor([[-0.0000],
        [-0.0000],
        [ 0.1002],
        [ 0.0000],
        [-0.0000],
        [-0.9996],
        [-0.0000],
        [ 0.0000],
        [-0.0000],
        [ 0.0000],
        [-0.0000],
        [-0.0000]], dtype=torch.float64, grad_fn=<MulBackward0>)

In [62]:
torch.diag(A)

tensor([5.8439e+07, 9.8302e+09, 2.0407e+07, 2.5665e+09, 8.4677e+07, 4.2334e+05,
        1.4442e+09, 8.6753e+08, 3.2779e+06, 8.6380e+07, 2.9936e+09, 3.7122e+08],
       dtype=torch.float64, grad_fn=<DiagBackward>)

In [63]:
mn

tensor([[-0.0000],
        [-0.0000],
        [ 0.1002],
        [ 0.0000],
        [-0.0000],
        [-0.9996],
        [-0.0000],
        [ 0.0000],
        [-0.0000],
        [ 0.0000],
        [-0.0000],
        [-0.0000]], dtype=torch.float64, grad_fn=<MulBackward0>)

In [64]:
l[lambda_ < threshold]

tensor([4.5996e+00, 3.5780e-03], dtype=torch.float64, grad_fn=<IndexBackward>)

In [65]:
torch.min(lambda_, torch.tensor(1e4, dtype=torch.float64))

tensor([1.0000e+04, 1.0000e+04, 9.9443e+01, 1.0000e+04, 1.0000e+04, 1.0036e+00,
        1.0000e+04, 1.0000e+04, 1.0000e+04, 1.0000e+04, 1.0000e+04, 1.0000e+04],
       dtype=torch.float64, grad_fn=<MinBackward2>)

# SBL alternative

In [66]:
regressor = ARDRegression(fit_intercept=False, compute_score=True, alpha_1=0, alpha_2=0, lambda_1=0, lambda_2=0)
regressor.fit(X, y.squeeze())

ARDRegression(alpha_1=0, alpha_2=0, compute_score=True, fit_intercept=False,
              lambda_1=0, lambda_2=0)

In [67]:
baseline_coeffs = regressor.coef_[:, None]
print(baseline_coeffs)

[[ 0.        ]
 [ 0.        ]
 [ 0.10022563]
 [ 0.        ]
 [ 0.        ]
 [-0.99956811]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]


In [68]:
regressor.sigma_

array([[ 5.18138960e-08, -8.63562959e-08],
       [-8.63562959e-08,  2.49769034e-06]])

In [69]:
regressor.alpha_

3808.452690553647

In [70]:
regressor.lambda_

array([4.53286629e+08, 1.28652545e+04, 9.95497585e+01, 7.63633557e+06,
       2.66219471e+04, 1.00086183e+00, 1.02416991e+04, 2.32670159e+07,
       3.15196590e+04, 4.18342557e+05, 1.65230238e+04, 2.12428560e+07])

In [36]:
beta = torch.tensor(regressor.alpha_)
alpha = torch.tensor(regressor.lambda_)

M = X.shape[1]
N = X.shape[0]

In [39]:
Cinv = beta * (torch.eye(N) - X @ torch.inverse(beta**-1 * torch.diag(alpha) + X.T @ X) @ X.T)

tensor(41298.7758, dtype=torch.float64)

In [44]:
- 1/2 * (N * np.log(2*np.pi) - torch.sum(torch.log(torch.diag(Cinv))) + y.T @ Cinv @ y)

tensor([[13558.2720]], dtype=torch.float64)

In [45]:
regressor.scores_

[18161.96547160334, 18190.11160091524, 18207.03788822612]