In this notebook we compare two different SBL implementations.

In [1]:
# General imports
import numpy as np
import torch

from deepymod.data import Dataset
from deepymod.data.burgers import BurgersDelta
from sklearn.linear_model import BayesianRidge, ARDRegression

import seaborn as sns
from scipy.linalg import pinvh

# Making data

In [2]:
# Making dataset
v = 0.1
A = 1.0

x = np.linspace(-3, 4, 100)
t = np.linspace(0.5, 5.0, 50)
x_grid, t_grid = np.meshgrid(x, t, indexing='ij')
dataset = Dataset(BurgersDelta, v=v, A=A)

y = dataset.time_deriv(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1)) # observations
X = dataset.library(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1), poly_order=2, deriv_order=3) # covariates

print(y.shape, X.shape)

(5000, 1) (5000, 12)


In [3]:
y += np.std(y) * 0.1 * np.random.randn(*y.shape)

In [4]:
np.std(y) * 0.1 

0.016083478332340858

# Baselineregressor.lambda_

In [5]:
regressor = ARDRegression(fit_intercept=False, compute_score=True, alpha_1=0, alpha_2=0, lambda_1=0, lambda_2=0)
regressor.fit(X, y.squeeze())

ARDRegression(alpha_1=0, alpha_2=0, compute_score=True, fit_intercept=False,
              lambda_1=0, lambda_2=0)

In [6]:
baseline_coeffs = regressor.coef_[:, None]
print(baseline_coeffs)

[[ 0.        ]
 [ 0.        ]
 [ 0.10018293]
 [ 0.        ]
 [ 0.        ]
 [-0.99874782]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]


In [7]:
baseline_noise_precision = regressor.alpha_
print(baseline_noise_precision)

3852.5947460652287


In [8]:
baseline_prior_precision = regressor.lambda_
print(baseline_prior_precision[:, None])

[[4.64142273e+06]
 [1.29110689e+04]
 [9.96346357e+01]
 [6.52160533e+07]
 [3.83726463e+04]
 [1.00250658e+00]
 [6.13677488e+04]
 [9.33251871e+06]
 [2.20783309e+06]
 [7.63128154e+04]
 [1.53725985e+06]
 [3.17296456e+09]]


In [9]:
regressor.scores_

[18154.325483371496,
 18188.947560645942,
 18199.884621780693,
 18199.884285206568]

# Homemade Numpy; copied from their source:

In [42]:
# Defining some necessary things
n_samples = X.shape[0]
threshold = 1e4

lambda_ = baseline_prior_precision
alpha_ = baseline_noise_precision

In [43]:
# Finding which terms to keep
keep_lambda = baseline_prior_precision < threshold

In [78]:
%%time
# Getting sigma_ / A^-1 in our notatio 
X_keep = X[:, keep_lambda]
gram = np.dot(X_keep.T, X_keep)
eye = np.eye(gram.shape[0])
sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
sigma_ = pinvh(sigma_inv)

CPU times: user 694 µs, sys: 22 µs, total: 716 µs
Wall time: 517 µs


In [45]:
# Getting coeffs
coef_ = alpha_ * np.dot(sigma_, np.dot(X[:, keep_lambda].T, y))

In [46]:
# rmse_
rmse_ = np.sum((y - np.dot(X[:, keep_lambda], coef_)) ** 2)

In [60]:
s=0
s += 0.5 * (np.linalg.slogdet(sigma_)[1] + n_samples * np.log(alpha_) +
            np.sum(np.log(lambda_)))

s -= 0.5 * (alpha_ * rmse_ + (lambda_[keep_lambda][:, None] * coef_**2).sum())

In [61]:
s

18209.886976808906

Boom!

# Simplifying home made version

In [66]:
# Defining some necessary things
n_samples = X.shape[0]
threshold = 1e4

lambda_ = baseline_prior_precision
alpha_ = baseline_noise_precision

In [67]:
# Finding which terms to keep
keep_lambda = baseline_prior_precision < threshold

In [111]:
%%time
# Getting sigma_ / A^-1 in our notatio 
X_keep = X[:, keep_lambda]
sigma_inv = np.diag(lambda_[keep_lambda]) + alpha_ * X_keep.T @ X_keep
sigma_ = np.linalg.inv(sigma_inv)

CPU times: user 300 µs, sys: 9 µs, total: 309 µs
Wall time: 240 µs


In [112]:
# Getting coeffs
coef_ = alpha_ * sigma_ @ X_keep.T @ y

In [113]:
# rmse_
rmse_ = np.sum((y - X_keep @ coef_) ** 2)

In [114]:
s=0
s += 0.5 * (np.sum(np.log(np.diagonal(sigma_))) + n_samples * np.log(alpha_) +
            np.sum(np.log(lambda_)))

s -= 0.5 * (alpha_ * rmse_ + (lambda_[keep_lambda][:, None] * coef_**2).sum())

In [115]:
s

18209.916652261643

In [116]:
np.linalg.slogdet(sigma_)[1]

-29.7648173173302

In [117]:
np.sum(np.log(np.diagonal(sigma_)))

-29.705466411853546

# Pytorch version of scikitlearn

In [156]:
# Defining some necessary things
n_samples = X.shape[0]
threshold = 1e4

lambda_torch = torch.tensor(baseline_prior_precision, dtype=torch.float32)
alpha_torch = torch.tensor(baseline_noise_precision, dtype=torch.float32)

X_torch = torch.tensor(X, dtype=torch.float32)
y_torch = torch.tensor(y, dtype=torch.float32)

  X_torch = torch.tensor(X, dtype=torch.float32)
  y_torch = torch.tensor(y, dtype=torch.float32)


In [157]:
# Finding which terms to keep
keep_lambda = lambda_torch < threshold

In [158]:
%%time
# Getting sigma_ / A^-1 in our notatio 
X_keep = X_torch[:, keep_lambda]
sigma_inv = torch.diag(lambda_torch[keep_lambda]) + alpha_torch * X_keep.T @ X_keep
sigma_ = torch.inverse(sigma_inv)

CPU times: user 1.36 ms, sys: 33 µs, total: 1.39 ms
Wall time: 268 µs


In [159]:
# Getting coeffs
coef_ = alpha_torch * sigma_ @ X_keep.T @ y_torch

In [160]:
coef_

tensor([[ 0.1000],
        [-0.9995]])

In [161]:
# rmse_
rmse_ = torch.sum((y_torch - X_keep @ coef_) ** 2)

In [162]:
s=0
s += 0.5 * (torch.sum(torch.log(torch.diag(sigma_))) + n_samples * torch.log(alpha_torch) +
            torch.sum(torch.log(lambda_torch)))

s -= 0.5 * (alpha_torch * rmse_ + (lambda_torch[keep_lambda][:, None] * coef_**2).sum())

In [163]:
s

tensor(18209.9141)

In [195]:
# Now let's optimize
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

l = torch.nn.Parameter(torch.zeros(12, dtype=torch.float32))
a = torch.nn.Parameter(-torch.log(torch.var(y)))

threshold = 1e4

optimizer = torch.optim.Adam([a, l], lr=1e-2)
max_epochs = 1e4

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [None]:
for epoch in torch.arange(max_epochs):
    alpha_ = torch.min(torch.exp(a), torch.tensor(1e8, dtype=torch.float32))
    lambda_ = torch.min(torch.exp(l), torch.tensor(2e4, dtype=torch.float32))
    
    keep_lambda = lambda_ < threshold
    X_keep = X[:, keep_lambda]
    sigma_inv = torch.diag(lambda_[keep_lambda]) + alpha_ * X_keep.T @ X_keep
    sigma_ = torch.inverse(sigma_inv)
    coef_ = alpha_ * sigma_ @ X_keep.T @ y
    rmse_ = torch.sum((y_torch - X_keep @ coef_) ** 2)
    
    s=0
    s += 0.5 * (torch.sum(torch.log(torch.diag(sigma_))) + n_samples * torch.log(alpha_) +
            torch.sum(torch.log(lambda_)))

    s -= 0.5 * (alpha_ * rmse_ + (lambda_[keep_lambda][:, None] * coef_**2).sum())
    loss = - s
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        print(loss)

Yes it works with the same cost! Now let's make things a little clearer

In [11]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

a = torch.nn.Parameter(torch.zeros(12, dtype=torch.float32)) # initialisation suggested by scikit
b = torch.nn.Parameter(-torch.log(torch.var(y))) # we use Bishops et al notation
n_samples = X.shape[0]
threshold = 1e4 # suggested SK learn value
optimizer = torch.optim.Adam([a, b], lr=1)
max_epochs = 5e3

for epoch in torch.arange(max_epochs):
    alpha_ = torch.min(torch.exp(a), torch.tensor(2e4, dtype=torch.float32)) # we train the log of these things since they're very big
    beta_ = torch.min(torch.exp(b), torch.tensor(1e8, dtype=torch.float32)) # we cap alpha and beta to prevent overflow
    mn = torch.zeros((12, 1))
    
    mask = alpha_ < threshold
    X_keep = X[:, mask]
    A_inv = torch.inverse(torch.diag(alpha_[mask]) + beta_ * X_keep.T @ X_keep)
    mn[mask, :] = beta_ * A_inv @ X_keep.T @ y
    E = beta_ * torch.sum((y - X @ mn)**2) + (alpha_[:, None] * mn**2).sum()
    
    loss =  E - (torch.logdet(A_inv) + n_samples * torch.log(beta_) + torch.sum(torch.log(alpha_))) # we use alpha and lambda since these are bounded
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        print(1/2 * loss) # 1/2 to easily compare with baseline

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


tensor(-9054.5762, grad_fn=<MulBackward0>)
tensor(-18178.1973, grad_fn=<MulBackward0>)
tensor(-18178.1973, grad_fn=<MulBackward0>)
tensor(-18178.1953, grad_fn=<MulBackward0>)
tensor(-18178.1953, grad_fn=<MulBackward0>)


In [12]:
mask

tensor([False, False,  True, False, False,  True, False, False, False, False,
        False, False])

# Now without mask

In [46]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

a = torch.nn.Parameter(torch.zeros(12, dtype=torch.float32)) # initialisation suggested by scikit
b = torch.nn.Parameter(-torch.log(torch.var(y))) # we use Bishops et al notation
n_samples = X.shape[0]
threshold = 1e4 # suggested SK learn value
optimizer = torch.optim.Adam([a, b], lr=1)
max_epochs = 5e3

for epoch in torch.arange(max_epochs):
    alpha_ = torch.exp(a).clamp(max=2e4) # we train the log of these things since they're very big
    beta_ = torch.exp(b).clamp(max=1e8) # we cap alpha and beta to prevent overflow
    #mn = torch.zeros((12, 1))
    
    #mask = alpha_ < threshold
    #X_keep = X[:, mask]
    alpha_inv = torch.diag(torch.nn.functional.threshold(alpha_**-1, 1e-4, 0.))
    A_inv = torch.inverse(torch.eye(12) + beta_ * alpha_inv.T @ X.T @ X) @ alpha_inv
    mn = beta_ * A_inv @ X.T @ y
    E = beta_ * torch.sum((y - X @ mn)**2) + (alpha_[:, None] * mn**2).sum()
    
    loss =  E - (torch.logdet(A_inv) + n_samples * torch.log(beta_) - torch.sum(torch.log(alpha_))) # we use alpha and lambda since these are bounded
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        print(1/2 * loss) # 1/2 to easily compare with baseline

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


tensor(-9054.5762, grad_fn=<MulBackward0>)
tensor(nan, grad_fn=<MulBackward0>)
tensor(nan, grad_fn=<MulBackward0>)
tensor(nan, grad_fn=<MulBackward0>)
tensor(nan, grad_fn=<MulBackward0>)


KeyboardInterrupt: 

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0099, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9766, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,

In [38]:
torch.threshold(alpha_**-1, 

tensor([5.0000e-05, 5.0000e-05, 9.8725e-03, 5.0000e-05, 5.0000e-05, 9.7664e-01,
        5.0000e-05, 5.0000e-05, 5.0000e-05, 5.0000e-05, 5.0000e-05, 5.0000e-05],
       grad_fn=<PowBackward0>)

In [34]:
mn

tensor([[ 2.7049e-04],
        [-1.1722e-03],
        [ 9.9450e-02],
        [ 7.0802e-05],
        [-1.0102e-04],
        [-9.9839e-01],
        [ 3.9744e-03],
        [-6.2164e-04],
        [ 1.7739e-03],
        [-2.8590e-03],
        [-2.9375e-03],
        [ 5.3798e-04]], grad_fn=<MmBackward>)

In [36]:
alpha_

tensor([2.0000e+04, 2.0000e+04, 1.0129e+02, 2.0000e+04, 2.0000e+04, 1.0239e+00,
        2.0000e+04, 2.0000e+04, 2.0000e+04, 2.0000e+04, 2.0000e+04, 2.0000e+04],
       grad_fn=<ClampBackward>)

# Direct

In [12]:
threshold = 1e4

alpha = torch.tensor(baseline_prior_precision, dtype=torch.float32)
beta = torch.tensor(baseline_noise_precision, dtype=torch.float32)
mask = (alpha < threshold)[:, None]
alpha_inv = torch.diag(alpha**-1) * mask

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

N = X.shape[0]
M = X.shape[1]

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [13]:
C = beta**-1 * torch.eye(N) + X @ alpha_inv @ X.T 
p =-1/2 * (N * np.log(2*np.pi) + torch.sum(torch.log(torch.diag(C))) + y.T @ torch.inverse(C) @ y)

In [14]:
p

tensor([[9238.0508]])

# Direct using woodbury

Let's first check this out using the found values to make sure everything is correct.

In [77]:
threshold = 1e4

alpha = torch.tensor(baseline_prior_precision, dtype=torch.float32)
beta = torch.tensor(baseline_noise_precision, dtype=torch.float32)
mask = (alpha < threshold)[:, None]
alpha_inv = torch.diag(alpha**-1) * mask

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

N = X.shape[0]
M = X.shape[1]

In [85]:
%%time
C_inv = beta * (torch.eye(N) - X @ alpha_inv @ torch.inverse(beta**-1 * torch.eye(M)  + X.T @ X @ alpha_inv.T) @ X.T)
p =-1/2 * (N * np.log(2*np.pi) - torch.sum(torch.log(torch.diag(C_inv))) + y.T @ C_inv @ y)

CPU times: user 341 ms, sys: 386 ms, total: 727 ms
Wall time: 45.7 ms


In [86]:
%%time 
p =-1/2 * (N * np.log(2*np.pi) - torch.sum(torch.log(torch.diag(C_inv))) + y.T @ C_inv @ y)

CPU times: user 20.9 ms, sys: 98 µs, total: 21 ms
Wall time: 4.43 ms


In [64]:
print(p)

tensor([[13488.0791]])


In [66]:
a = torch.nn.Parameter(torch.zeros(12))
b = torch.nn.Parameter(-torch.log(torch.var(y)))

optimizer = torch.optim.Adam([a, b], lr=1e-2)
max_epochs = 1e4
threshold=1e-4

In [None]:
#alpha = torch.exp(a)
#beta = torch.exp(b)

mask = (alpha > threshold)[:, None]
alpha_inv = torch.diag(alpha**-1) #* mask

C_inv = beta * (torch.eye(N) - X @ alpha_inv @ torch.inverse(beta**-1 * torch.eye(M)  + X.T @ X @ alpha_inv.T) @ X.T)
p =-1/2 * (N * np.log(2*np.pi) - torch.sum(torch.log(torch.diag(C_inv))) + y.T @ C_inv @ y)

In [63]:
%%time
for epoch in torch.arange(max_epochs):
    alpha = torch.exp(a)
    beta = torch.exp(b)
    
    mask = (alpha > threshold)[:, None]
    alpha_inv = torch.diag(alpha**-1) #* mask
    
    C_inv = beta * (torch.eye(N) - X @ alpha_inv @ torch.inverse(beta**-1 * torch.eye(M)  + X.T @ X @ alpha_inv.T) @ X.T)
    p =-1/2 * (N * np.log(2*np.pi) - torch.sum(torch.log(torch.diag(C_inv))) + y.T @ C_inv @ y)
    
    loss = -p
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        print(loss)

tensor([[-4503.2544]], grad_fn=<NegBackward>)


KeyboardInterrupt: 

In [64]:
beta

tensor(3926.1570, grad_fn=<ExpBackward>)

In [65]:
alpha[:, None]

tensor([[2.4279e+11],
        [1.6602e+00],
        [2.5783e-04],
        [1.1825e-06],
        [1.0773e-05],
        [2.6646e-04],
        [4.7619e-06],
        [2.5246e+03],
        [1.1308e+05],
        [8.9961e+04],
        [3.0908e-03],
        [6.0964e+02]], grad_fn=<UnsqueezeBackward0>)

# Indirect using bishop's formula

In [59]:
threshold = 1e4
a = torch.nn.Parameter(torch.zeros(12))
b = torch.nn.Parameter(-torch.log(torch.var(y)))

In [60]:
optimizer = torch.optim.Adam([a, b], lr=0.1)
max_epochs = 1e4

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [61]:
for epoch in torch.arange(max_epochs):
    alpha = torch.exp(a)
    beta = torch.exp(b)
    
    alpha_inv = torch.diag(alpha**-1) * (alpha < threshold)[:, None]
    A_inv = alpha_inv @ torch.inverse(torch.eye(M) + beta * X.T @ X @ alpha_inv.T)
    mn = beta * A_inv @ X.T @ y
    E = beta * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(alpha) @ mn
    ll = (-1/2 * torch.sum(torch.log(torch.diag(alpha_inv)[mask.squeeze()])) 
            + N / 2 * torch.log(beta)
            - E 
            + 1/2 * torch.sum(torch.log(torch.diag(A_inv)[mask.squeeze()])) 
            - N/2 * np.log(2 * np.pi))
    loss = -ll
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        print(ll)

tensor([[4457.5073]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [103]:
ll

tensor([[11842.3652]], grad_fn=<SubBackward0>)

In [105]:
mn

tensor([[-1.6781e-04],
        [-4.6799e-03],
        [ 9.9049e-02],
        [ 1.2948e-04],
        [ 2.4755e-03],
        [-9.8386e-01],
        [-2.5128e-03],
        [-6.4078e-05],
        [-1.0621e-02],
        [-1.3537e-02],
        [ 2.4004e-03],
        [-1.2904e-04]], grad_fn=<MmBackward>)

In [107]:
torch.diag(alpha_inv)

tensor([2.8467e+00, 3.5598e+07, 1.9621e-02, 1.2337e+05, 1.6351e+01, 1.9376e+00,
        8.4520e+00, 2.2778e+02, 5.6448e+06, 5.1708e+02, 2.4766e+03, 4.9008e-01],
       grad_fn=<DiagBackward>)

In [109]:
alpha[:, None]

tensor([[3.5129e-01],
        [2.8092e-08],
        [5.0966e+01],
        [8.1056e-06],
        [6.1157e-02],
        [5.1610e-01],
        [1.1832e-01],
        [4.3903e-03],
        [1.7715e-07],
        [1.9339e-03],
        [4.0377e-04],
        [2.0405e+00]], grad_fn=<UnsqueezeBackward0>)

# Old

In [681]:
threshold = 1e4
b = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(12))

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [682]:
optimizer = torch.optim.Adam([b, l], lr=0.1)
max_epochs = 1e4

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [683]:
for epoch in torch.arange(max_epochs):
    beta_ = torch.exp(b)#torch.min(torch.exp(a), torch.tensor(1e8, dtype=torch.float32))
    lambda_ = torch.exp(l)#torch.min(torch.exp(l), torch.tensor(2e4, dtype=torch.float32))

    A = torch.diag(lambda_) + (beta_ * torch.eye(M))@ X.T @ X
    mn = beta_ * torch.inverse(A) @ X.T @ y
    E = beta_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    
    p_reg = -1/2 * (E + torch.sum(torch.log(torch.diag(A))) - (torch.sum(l) + N * b) + N * np.log(2*np.pi))
    loss = -p_reg
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        print(p_reg)

tensor([[4443.1304]], grad_fn=<MulBackward0>)
tensor([[13650.1328]], grad_fn=<MulBackward0>)
tensor([[13650.1523]], grad_fn=<MulBackward0>)
tensor([[13650.1582]], grad_fn=<MulBackward0>)
tensor([[13650.1582]], grad_fn=<MulBackward0>)
tensor([[13650.1582]], grad_fn=<MulBackward0>)
tensor([[13650.1582]], grad_fn=<MulBackward0>)
tensor([[13650.1602]], grad_fn=<MulBackward0>)
tensor([[13650.1602]], grad_fn=<MulBackward0>)
tensor([[13650.1602]], grad_fn=<MulBackward0>)


In [244]:
l

Parameter containing:
tensor([1.6291e+01, 2.5017e+01, 4.6059e+00, 3.1095e+01, 2.4626e+01, 9.9584e-04,
        2.6842e+01, 3.0178e+01, 2.3517e+01, 2.3218e+01, 1.4204e+01, 2.9579e+01],
       requires_grad=True)

In [245]:
mn

tensor([[ 2.2701e-04],
        [ 6.0470e-09],
        [ 9.9965e-02],
        [ 2.9938e-11],
        [ 1.6955e-09],
        [-9.9998e-01],
        [ 1.7685e-09],
        [ 5.2821e-10],
        [-4.2543e-09],
        [-1.8072e-09],
        [ 7.6543e-04],
        [ 1.3409e-09]], grad_fn=<MmBackward>)

In [659]:
threshold = 1e4
b = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(12))

optimizer = torch.optim.Adam([b, l], lr=1e-2)
max_epochs = 1e3

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [81]:
for epoch in torch.arange(max_epochs):
    beta_ = torch.min(torch.exp(b), torch.tensor(1e8))
    lambda_ = torch.exp(l)
    mask = lambda_ < threshold
    
    A_inv = torch.inverse((torch.eye(M) + beta_ * torch.diag(lambda_**-1 * mask) @ X.T @ X)) @ torch.diag(lambda_**-1 * mask)
    mn = beta_ * A_inv @ X.T @ y
    E = beta_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    
    loss = (E - torch.sum(torch.log(torch.diag(A_inv)))- (torch.sum(l[mask]) + N * b))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        print(loss)

tensor([[inf]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)
tensor([[nan]], grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [80]:
A_inv = torch.inverse((torch.eye(M) + beta_ * torch.diag(lambda_**-1 * mask) @ X.T @ X)) @ torch.diag(lambda_**-1 * mask)
mn = beta_ * A_inv @ X.T @ y
E = beta_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn

loss = (E - torch.sum(torch.log(torch.diag(A_inv)))- (torch.sum(l[mask]) + N * b))


NameError: name 'beta_' is not defined

In [661]:
l

Parameter containing:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       requires_grad=True)

In [662]:
lambda_

tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<ExpBackward>)

In [663]:
(l < np.log(threshold)) * torch.exp(-l)

tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<MulBackward0>)

In [664]:
torch.diag(lambda_)[:, None]

tensor([[[nan, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., nan, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., nan, 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., nan, 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., nan, 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., nan, 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., nan, 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., nan, 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., nan, 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., nan, 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., nan, 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., nan]]],
       grad_fn=<UnsqueezeBackward0>)

# method

In [82]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

threshold = 1e4
a = torch.nn.Parameter(-torch.log(torch.var(y)))
l = torch.nn.Parameter(torch.zeros(12))

optimizer = torch.optim.Adam([a, l], lr=1e-2)
max_epochs = 1e4


M = 12
N = X.shape[0]


  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [83]:
%%time
for epoch in torch.arange(max_epochs):
    alpha_ = torch.min(torch.exp(a), torch.tensor(1e8, dtype=torch.float32))
    lambda_ = torch.min(torch.exp(l), torch.tensor(2e4, dtype=torch.float32))
    
    A_inv = torch.inverse(torch.diag(lambda_) + alpha_ * X.T @ X) * (lambda_ < threshold)[:, None]
    mn = alpha_ * A_inv @ X.T @ y
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = (E - torch.sum(torch.log(torch.diag(A_inv)[lambda_ < threshold])) - (torch.sum(l[lambda_ < threshold]) + N * a))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        print(loss)

tensor([[-18147.3574]], grad_fn=<SubBackward0>)
tensor([[-36279.8047]], grad_fn=<SubBackward0>)
tensor([[-36278.3477]], grad_fn=<SubBackward0>)
tensor([[-36278.4648]], grad_fn=<SubBackward0>)
tensor([[-36278.4062]], grad_fn=<SubBackward0>)
tensor([[-36278.6875]], grad_fn=<SubBackward0>)
tensor([[-36278.5234]], grad_fn=<SubBackward0>)


KeyboardInterrupt: 

In [84]:
%%time
A_inv = torch.inverse(torch.diag(lambda_) + alpha_ * X.T @ X) * (lambda_ < threshold)[:, None]
mn = alpha_ * A_inv @ X.T @ y
E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
loss = (E - torch.sum(torch.log(torch.diag(A_inv)[lambda_ < threshold])) - (torch.sum(l[lambda_ < threshold]) + N * a))

CPU times: user 12.2 ms, sys: 1.34 ms, total: 13.5 ms
Wall time: 1.17 ms


In [None]:
torch.min(lambda_, )

In [55]:
A_inv.shape

torch.Size([12, 12])

In [56]:
mn = alpha_ * A_inv @ X.T @ y

In [57]:
mn

tensor([[ 0.0000],
        [ 0.0000],
        [ 0.1006],
        [ 0.0000],
        [ 0.0000],
        [-0.9916],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000]], grad_fn=<MmBackward>)

In [None]:
for epoch in torch.arange(max_epochs):
    alpha_ = torch.min(torch.exp(a), torch.tensor(1e8, dtype=torch.float32))
    lambda_ = torch.min(torch.exp(l), torch.tensor(2e4, dtype=torch.float32))
    
    A_inv = torch.inverse(torch.diag(lambda_) + alpha_ * X.T @ X) * (lambda_ < threshold)[:, None]
    mn = alpha_ * (A_inv @ X.T @ y)
    E = alpha_ * torch.sum((y - X @ mn)**2) + mn.T @ torch.diag(lambda_) @ mn
    loss = (E - torch.sum(torch.log(torch.diag(A_inv)[lambda_ < threshold])) - (torch.sum(l[lambda_ < threshold]) + N * a))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 1000 == 0:
        print(loss)