In this notebook we fix the Bayes estimator, as it's not working. As a baseline, we take the sk-learn algo. Those are the values we need to hit.

In [None]:
# General imports
import numpy as np
import torch

# DeepMoD stuff
from multitaskpinn import DeepMoD
from multitaskpinn.model.func_approx import Siren, NN
from multitaskpinn.model.library import Library1D
from multitaskpinn.model.constraint import LeastSquares
from multitaskpinn.model.sparse_estimators import Threshold 

from phimal_utilities.data import Dataset
from phimal_utilities.data.burgers import BurgersDelta
from sklearn.linear_model import BayesianRidge

if torch.cuda.is_available():
    device ='cuda'
else:
    device = 'cpu'

# Settings for reproducibility
np.random.seed(42)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


%load_ext autoreload
%autoreload 2

In [2]:
# Making dataset
v = 0.1
A = 1.0

x = np.linspace(-3, 4, 100)
t = np.linspace(0.5, 5.0, 50)
x_grid, t_grid = np.meshgrid(x, t, indexing='ij')
dataset = Dataset(BurgersDelta, v=v, A=A)
X, y = dataset.create_dataset(x_grid.reshape(-1, 1), t_grid.reshape(-1, 1), n_samples=1000, noise=0.2, random=True, normalize=False)
X, y = X.to(device), y.to(device)
        


In [3]:
network = NN(2, [30, 30, 30, 30, 30], 1)
library = Library1D(poly_order=2, diff_order=3) # Library function
estimator = Threshold(0.1) # Sparse estimator 
constraint = LeastSquares() # How to constrain
model = DeepMoD(network, library, estimator, constraint).to(device) # Putting it all in the model

In [4]:
# Getting data out
prediction, time_derivs, thetas = model(X)
t = time_derivs[0].cpu().detach().numpy()
theta = thetas[0].cpu().detach().numpy()

In [5]:
# Getting sklearn results
sk_reg = BayesianRidge(fit_intercept=False, compute_score=True, alpha_1=0, alpha_2=0, lambda_1=0, lambda_2=0)
sk_reg.fit(theta, t)

BayesianRidge(alpha_1=0, alpha_2=0, compute_score=True, fit_intercept=False,
              lambda_1=0, lambda_2=0)

In [6]:
# Precision, noise level
print(sk_reg.alpha_, 1 / sk_reg.alpha_)

624313.8692322593 1.601758425180807e-06


In [7]:
# Precision of prior, std of prior
print(sk_reg.lambda_, 1 / sk_reg.lambda_)

0.014547715291435611 68.73931610338224


In [8]:
# Found coeffs
print(sk_reg.coef_)

[-1.48640479e-02  3.63397636e+00 -1.24136497e+00 -8.73036119e-02
 -3.47944061e-01  2.09853651e+01 -2.43294908e+00 -9.55226105e-01
 -1.40345956e+00 -4.65038034e+00  1.18284549e+01 -1.38515284e+00]


In [9]:
# What would the reg cost be; probably similar to the noise level
np.mean((t - theta @ sk_reg.coef_[:, None])**2) # beware; sk learn coeff output is 1d and will give wrong results!

1.5871360089037915e-06

In [10]:
# And the score aka the log evidence
sk_reg.scores_[-1]

5201.086546976798

Which sounds reasonable but it isn't at all; now let's implement our own and see if we can get it to work.

In [11]:
# Getting data out
prediction, time_derivs, thetas = model(X)
t = time_derivs[0]
Theta = thetas[0]

# Let's use the found alpha and beta to check if the code is good; later we see if we can find it by optimizing
alpha = torch.tensor(sk_reg.lambda_) # precision of weights; follow sk learn init
beta = torch.tensor(sk_reg.alpha_) # precision of noise

M = Theta.shape[1]
N = Theta.shape[0]

In [12]:
# Posterior std and mean
A = torch.eye(M).to(Theta.device) * alpha + beta * Theta.T @ Theta  
mn = beta * torch.inverse(A) @ Theta.T @ t

In [13]:
# Difference between posterior means; seems within numerical acc.
np.abs(mn.detach().cpu().numpy() - sk_reg.coef_[:, None])

array([[0.00117785],
       [0.01039181],
       [0.00786998],
       [0.00532904],
       [0.0152518 ],
       [0.13535671],
       [0.07296802],
       [0.05193206],
       [0.04800464],
       [0.29380104],
       [0.07156858],
       [0.15302918]])

In [14]:
# Difference between posterior std; seems within numerical acc.
np.mean(np.abs(sk_reg.sigma_ - Sn.detach().cpu().numpy()))

NameError: name 'Sn' is not defined

In [16]:
1/2 * (M * torch.log(alpha) 
 + N * torch.log(beta)
 - beta * (t - Theta @ mn).T @ (t - Theta @ mn) - alpha * mn.T @ mn 
 - torch.trace(torch.log(A))
 - N * np.log(2*np.pi))

tensor([[5172.8711]], device='cuda:0', grad_fn=<MulBackward0>)

Which is close enough given numerical accuracy

In [17]:
def neg_marginal_LL(Theta, t, alpha, beta):
    M = Theta.shape[1]
    N = Theta.shape[0]
    
    
    # Posterior std and mean
    A = torch.eye(M).to(Theta.device) * alpha + beta * Theta.T @ Theta  
    mn = beta * torch.inverse(A) @ Theta.T @ t
    
    loss = -1/2 * (M * torch.log(alpha) 
         + N * torch.log(beta)
         - beta * (t - Theta @ mn).T @ (t - Theta @ mn) - alpha * mn.T @ mn 
         - torch.trace(torch.log(A))
         - N * np.log(2*np.pi))
    
    return loss

In [31]:
neg_marginal_LL(Theta, t, torch.tensor(sk_reg.lambda_), torch.tensor(sk_reg.alpha_))

tensor([[-5172.6836]])

In [26]:
#Now let's try to optimize using gradient descent
prediction, time_derivs, thetas = model(X)
t = time_derivs[0].detach().cpu()
Theta = thetas[0].detach().cpu()

alpha = torch.nn.Parameter(torch.log(torch.tensor(1.0))) # precision of weights; follow sk learn init
beta = torch.nn.Parameter(torch.log(torch.tensor(1 / torch.var(t)))) # precision of noise

optimizer = torch.optim.Adam([{'params': alpha}, {'params': beta}])

print(alpha, beta)

Parameter containing:
tensor(1., requires_grad=True) Parameter containing:
tensor(127498.8281, requires_grad=True)


In [27]:
for iteration in torch.arange(100000):
    loss = neg_marginal_LL(Theta, t, torch.exp(alpha), torch.exp(beta))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if iteration % 1000 == 0:
        print(loss)

tensor([[-4698.7212]], grad_fn=<MulBackward0>)
tensor([[-4783.7026]], grad_fn=<MulBackward0>)
tensor([[-4783.8452]], grad_fn=<MulBackward0>)
tensor([[-4783.8462]], grad_fn=<MulBackward0>)
tensor([[-4783.8672]], grad_fn=<MulBackward0>)
tensor([[-4783.8013]], grad_fn=<MulBackward0>)
tensor([[-4783.6919]], grad_fn=<MulBackward0>)
tensor([[-4783.8311]], grad_fn=<MulBackward0>)
tensor([[-4783.8394]], grad_fn=<MulBackward0>)
tensor([[-4783.8853]], grad_fn=<MulBackward0>)
tensor([[-4783.7930]], grad_fn=<MulBackward0>)
tensor([[-4783.8838]], grad_fn=<MulBackward0>)
tensor([[-4783.8857]], grad_fn=<MulBackward0>)
tensor([[-4783.8623]], grad_fn=<MulBackward0>)
tensor([[-4783.8154]], grad_fn=<MulBackward0>)


KeyboardInterrupt: 

In [28]:
alpha

Parameter containing:
tensor(0.0257, requires_grad=True)

In [29]:
beta

Parameter containing:
tensor(127498.8281, requires_grad=True)