In this notebook we implement a Bayesian version of DeepMoD.

In [1]:
# General imports
from itertools import combinations, product

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from deepmod_l1.analytical import theta_analytical

#Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# DeepMoD stuff
from deepymod_torch.DeepMod import DeepMod, build_network
from deepymod_torch.library_functions import library_basic
from deepymod_torch.utilities import create_deriv_data
from deepymod_torch.output import progress

# Remainder imports
from os import listdir, path, getcwd

# Setting cuda
# When a CUDA device is available, make CUDA float32 the default tensor type,
# so tensors created without an explicit type use it.
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Settings for reproducibility
# Seed NumPy's and PyTorch's global generators; request deterministic cuDNN
# kernels and switch cuDNN benchmark mode off.
np.random.seed(42)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Defining output folder
# Bound to the current working directory; not referenced again in the cells shown here.
output_folder = getcwd()

%load_ext autoreload
%autoreload 2

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Making dataset

In [2]:
# Load the Burgers test data; `.item()` unwraps the stored object, which is
# indexed with the keys 't', 'x' and 'u' in the following cell.
# NOTE(review): allow_pickle=True unpickles the file content, which can execute
# arbitrary code; only load files from a trusted source.
data = np.load('../../tests/data/burgers.npy', allow_pickle=True).item()

In [3]:
# Coordinates: one row per grid point, columns ordered (t, x).
# NOTE: X and y are rebound to torch tensors later, in the Type II section.
X = np.transpose((data['t'].flatten(), data['x'].flatten()))
# Targets: real part of u, reshaped to a single column with one row per grid point.
y = np.real(data['u']).reshape((data['u'].size, 1))

In [4]:
number_of_samples = 500

# Shuffle all row indices, then keep the first `number_of_samples` shuffled rows as the training set.
idx = np.random.permutation(y.size)
# requires_grad=True on the inputs: presumably needed so derivatives w.r.t. (t, x) can be taken downstream; confirm against create_deriv_data.
X_train = torch.tensor(X[idx, :][:number_of_samples], dtype=torch.float32, requires_grad=True)
y_train = torch.tensor(y[idx, :][:number_of_samples], dtype=torch.float32)

# Adapting deepmod

We just need to slightly modify the fitting layer:

In [5]:
class Library(nn.Module):
    '''Abstract base layer for a library-as-layer. Subclasses must provide a `theta` method (see library_functions).'''
    def __init__(self, input_dim, output_dim, diff_order):
        super().__init__()
        self.diff_order = diff_order
        # Number of library columns, inferred once at construction time via a dummy pass.
        n_terms = self.terms(input_dim, output_dim, self.diff_order)
        self.total_terms = n_terms

    def forward(self, input):
        '''Runs `theta` on the input and returns (input, time_deriv_list, theta).'''
        time_deriv_list, theta = self.theta(input)
        return (input, time_deriv_list, theta)

    def terms(self, input_dim, output_dim, max_order):
        '''Returns the number of columns of the library produced by `theta`.'''
        # A single fake sample is enough: only the shape of the resulting library matters.
        fake_prediction = torch.ones((1, output_dim), dtype=torch.float32)
        fake_derivs = torch.ones((1, max_order, input_dim, output_dim), dtype=torch.float32)
        fake_library = self.theta((fake_prediction, fake_derivs))[1]
        return fake_library.shape[1]

In [6]:
class library_basic(Library):
    '''Library layer built from polynomials of the output and its derivatives. Inherits from Library.

    Note: this definition shadows the `library_basic` imported from
    deepymod_torch.library_functions at the top of the notebook.
    '''
    def __init__(self, input_dim, output_dim, diff_order, poly_order):
        # poly_order must be set before the parent constructor runs, because
        # Library.__init__ already calls theta() to count the library terms.
        self.poly_order = poly_order
        super().__init__(input_dim, output_dim, diff_order)
    
    def theta(self, input):
        '''Calculates the library and time derivative from the network output.

        Args:
            input: pair (X, dX). X is indexed as (samples, output_dim) and dX as
                (samples, derivative order, input coordinate, output_dim), matching
                the fake data Library.terms feeds in.

        Returns:
            (time_deriv_list, theta): a tuple with one time-derivative column per
            output, and the library matrix with one row per sample.
        '''
        X, dX = input
        samples = X.shape[0]

        # Time derivatives: first derivative order, first input coordinate.
        dt = dX[:, 0, :1, :]
        time_deriv_list = torch.unbind(dt, dim=2)

        # Polynomial part: columns 1, X, X^2, ... up to poly_order, built by repeated multiplication.
        u = torch.ones_like(X)[:, None, :]
        for order in torch.arange(1, self.poly_order+1):
            u = torch.cat((u, u[:, order-1:order, :] * X[:, None, :]), dim=1)
        poly_list = torch.unbind(u, dim=2) #list with each entry corresponding to eq.

        # Derivative part: all derivative orders w.r.t. the remaining input coordinates.
        dx = dX[:, :, 1:, :]
        # The constant column is created with the derivatives' dtype and device so
        # the concatenation also works when they differ from the torch defaults.
        ones = torch.ones((samples, 1), dtype=dx.dtype, device=dx.device)
        deriv_list = [torch.cat((ones, eq.reshape(samples, -1)), dim=1) for eq in torch.unbind(dx, dim=3)] #list with each entry corresponding to eq.
        
        # Combining to make  theta
        if len(poly_list) == 1:
            theta = torch.matmul(poly_list[0][:, :, None], deriv_list[0][:, None, :]).reshape(samples, -1) # If we have a single output, we simply calculate and flatten matrix product between polynomials and derivatives to get library
        else:
            # combinations/product come from itertools (imported at the top of the notebook);
            # without that import this branch raised NameError for multi-output models.
            theta_uv = torch.cat([torch.matmul(p[:, :, None], q[:, None, :]).reshape(samples, -1) for p, q in combinations(poly_list, 2)], 1)  # calculate all unique combinations between polynomials
            theta_dudv = torch.cat([torch.matmul(du[:, :, None], dv[:, None, :]).reshape(samples, -1)[:, 1:] for du, dv in combinations(deriv_list, 2)], 1) # calculate all unique combinations of derivatives
            theta_udu = torch.cat([torch.matmul(p[:, 1:, None], du[:, None, 1:]).reshape(samples, -1) for p, du in product(poly_list, deriv_list)], 1)  # calculate all unique products of polynomials and derivatives
            theta = torch.cat([theta_uv, theta_dudv, theta_udu], dim=1)

        return time_deriv_list, theta

# Setting and running deepmod

In [23]:
# Settings handed to build_network. 'library_function' refers to the library_basic
# class defined in this notebook, which shadows the one imported from deepymod_torch.
config = {'input_dim': 2, 'hidden_dim': 20, 'layers': 5, 'output_dim': 1, 'library_function': library_basic, 'library_args':{'poly_order': 2, 'diff_order': 2}}

# NOTE(review): create_deriv_data comes from deepymod_torch.utilities; presumably it packs
# X_train into the input format the model expects for the given diff_order. Confirm.
X_input = create_deriv_data(X_train, config['library_args']['diff_order'])

In [24]:
# Build the network from the config dict; build_network is imported from deepymod_torch.DeepMod.
model = build_network(**config)

Before we do PI training, we need a decent estimate of the solution; otherwise we'll get poor posteriors. (There's no noise for now anyway.)

In [25]:
# Adam with default settings over the network weights only; used for the plain MSE pretraining.
optimizer = torch.optim.Adam(model.parameters())
max_iterations = 5000

In [28]:
# Pretraining: fit the network to the data with the MSE loss only.
print('| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |')
# The counter is a 0-d tensor from torch.arange; the range includes max_iterations itself.
for iteration in torch.arange(0, max_iterations + 1):
    # The model returns the prediction together with the time derivative(s) and library theta;
    # the latter two are unused in the loss here but are reused by later cells.
    prediction, time_deriv_list, theta = model(X_input)
    loss_mse = torch.mean((prediction[0] - y_train)**2)
    loss = loss_mse 
        
    # Writing
    # NOTE(review): 0 is passed as the second argument of progress; presumably that is a start
    # timestamp, which would make the "Time remaining" column meaningless. Confirm.
    if iteration % 100 == 0:
        progress(iteration, 0, max_iterations, loss.item(), loss_mse.item(), 0, 0)
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |
       5000    100.00%               0s   2.71e-06   2.71e-06   0.00e+00   0.00e+00 

With that done, let's do a fit to see the result, using both least squares and Bayesian inference.

In [31]:
# Ordinary least-squares fit of the time derivative on the library from the last forward pass;
# [0] keeps only the coefficient array from lstsq's result tuple.
np.linalg.lstsq(theta.cpu().detach().numpy(), time_deriv_list[0].cpu().detach().numpy(), rcond=None)[0]

array([[ 4.7115453e-05],
       [ 5.2083950e-03],
       [ 9.2110246e-02],
       [ 1.9702425e-03],
       [-1.1246353e+00],
       [ 5.5794012e-02],
       [-4.1911104e-03],
       [ 2.2810708e-01],
       [-8.9375928e-02]], dtype=float32)

Which doesn't seem too bad. Now let's do the Bayesian fit.

## Type II maximization on deepmod results

In [42]:
# Freeze the library and time derivative from the last forward pass as constant regression inputs.
# NOTE: this rebinds X and y, which until here held the NumPy coordinates/targets built from `data`.
X = theta.detach().clone()
y = time_deriv_list[0].detach().clone()

M = X.shape[1]  # number of library terms (columns of theta)
N = X.shape[0]  # number of samples (rows of theta)

In [43]:
# Two scalar hyperparameters, both initialised at 1.0 and optimised with Adam.
# The loop that follows uses them as alpha = 1/cov and beta = 1/noise**2.
cov = nn.Parameter(torch.tensor(1.0))
noise = nn.Parameter(torch.tensor(1.0))

optimizer = torch.optim.Adam([cov, noise])

In [45]:
# Gradient-based optimisation of cov and noise on the fixed (X, y) snapshot.
for it in np.arange(5000):
    alpha = 1/cov
    beta = 1/noise**2
    # A = alpha*I + beta*X^T X; mn = beta * A^{-1} X^T y.
    A = alpha * torch.eye(X.shape[1]) + beta * X.T @ X
    mn = beta * torch.inverse(A) @ X.T @ y
    # E: beta-weighted squared residual plus alpha-weighted squared norm of mn, each halved.
    E = beta/2 * (y - X @ mn).T @ (y - X @ mn) + alpha/2 * mn.T @ mn
    
    # Negative of: M/2 log(alpha) + N/2 log(beta) - E - sum(log(diag(chol(A)))).
    # The last term equals 1/2 log det(A), since det(A) is the squared product of the Cholesky diagonal.
    loss = -1 * (M/2*torch.log(alpha) + N/2 * torch.log(beta) - E - torch.sum(torch.log(torch.diag(torch.cholesky(A)))))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Print the loss and both hyperparameters every 1000 iterations.
    if it %1000 == 0:
        print(loss, cov, noise)

tensor([[-2381.2461]], grad_fn=<MulBackward0>) Parameter containing:
tensor(0.1454, requires_grad=True) Parameter containing:
tensor(0.0048, requires_grad=True)
tensor([[-2381.2461]], grad_fn=<MulBackward0>) Parameter containing:
tensor(0.1454, requires_grad=True) Parameter containing:
tensor(0.0048, requires_grad=True)
tensor([[-2381.2461]], grad_fn=<MulBackward0>) Parameter containing:
tensor(0.1454, requires_grad=True) Parameter containing:
tensor(0.0048, requires_grad=True)
tensor([[-2381.2458]], grad_fn=<MulBackward0>) Parameter containing:
tensor(0.1454, requires_grad=True) Parameter containing:
tensor(0.0048, requires_grad=True)
tensor([[-2381.2461]], grad_fn=<MulBackward0>) Parameter containing:
tensor(0.1454, requires_grad=True) Parameter containing:
tensor(0.0048, requires_grad=True)


We now get the following posterior means:

In [46]:
mn

tensor([[ 6.1631e-05],
        [ 2.9984e-03],
        [ 9.1805e-02],
        [ 2.1081e-03],
        [-1.1128e+00],
        [ 5.6161e-02],
        [-4.7116e-03],
        [ 2.1575e-01],
        [-8.9609e-02]], grad_fn=<MmBackward>)

In [50]:
# Square root of the diagonal of A^{-1}, one value per library term.
# Reading A as the posterior precision, these are the posterior standard deviations of the coefficients.
torch.sqrt(torch.diagonal(torch.inverse(A)))

tensor([0.0003, 0.0072, 0.0036, 0.0048, 0.0357, 0.0189, 0.0116, 0.0382, 0.0212],
       grad_fn=<SqrtBackward>)

Which seems pretty similar to the other results :-)

## Optimizing deepmod with bayesian inference

I think we can just throw the bayesian loss in there with the mse. The massive difference in loss might be problematic though, but let's see. We also won't update the prior to be the previous posterior.

In [52]:
# One Adam optimizer with two parameter groups: the network weights and the two
# hyperparameters (cov, noise). Both groups use Adam's default settings.
optimizer = torch.optim.Adam([{'params': model.parameters()}, {'params':[cov, noise]}])
max_iterations = 1000

In [55]:
# Joint training: MSE on the data plus the (unscaled) negative log-evidence term,
# optimising the network weights and the hyperparameters cov / noise together.
print('| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |')
for iteration in torch.arange(0, max_iterations + 1):
    # NN
    prediction, time_deriv_list, theta = model(X_input)
    loss_mse = torch.mean((prediction[0] - y_train)**2)
    
    ## Bayes
    # Take the sizes from the current library instead of relying on the M / N globals
    # left over from the Type II cell; this matches the later training loops.
    M = theta.shape[1]
    N = theta.shape[0]

    alpha = 1/cov
    beta = 1/noise**2
    # A = alpha*I + beta*theta^T theta; mn = beta * A^{-1} theta^T dt.
    A = alpha * torch.eye(theta.shape[1]) + beta * theta.T @ theta
    mn = beta * torch.inverse(A) @ theta.T @ time_deriv_list[0]
    E = beta/2 * (time_deriv_list[0] - theta @ mn).T @ (time_deriv_list[0] - theta @ mn) + alpha/2 * mn.T @ mn
    
    # sum(log(diag(chol(A)))) equals 1/2 log det(A).
    loss_bayes = -1 * (M/2*torch.log(alpha) + N/2 * torch.log(beta) - E - torch.sum(torch.log(torch.diag(torch.cholesky(A)))))
    
    
    # Full loss
    loss = loss_mse + loss_bayes
        
    # Writing
    if iteration % 100 == 0:
        progress(iteration, 0, max_iterations, loss.item(), loss_mse.item(), loss_bayes.item(), 0)
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |
       1000    100.00%               0s   -2.51e+03   3.53e-01   -2.51e+03   0.00e+00 

In [56]:
mn

tensor([[ 0.0065],
        [-0.2509],
        [-0.0021],
        [-0.0091],
        [-0.1137],
        [ 0.0441],
        [-0.0232],
        [ 0.0882],
        [ 0.0974]], grad_fn=<MmBackward>)

So that doesn't work, probably because the two losses have vastly different magnitudes. How should we scale them? Maybe use a probabilistic function generator as well?

# Optimizing Bayes + MSE from the start

Let's add some noise (maybe Cholesky fails when there's no noise).

In [21]:
number_of_samples = 500

# Rebuild the full dataset from `data` instead of reusing X / y: both names were rebound
# to torch tensors (library and time derivative) in the Type II section, so on a
# top-to-bottom run they no longer hold the Burgers coordinates and targets.
coords_full = np.transpose((data['t'].flatten(), data['x'].flatten()))
u_full = np.real(data['u']).reshape((data['u'].size, 1))

# Shuffle all row indices, then keep the first `number_of_samples` shuffled rows.
idx = np.random.permutation(u_full.size)
coords_sample = coords_full[idx, :][:number_of_samples]
u_sample = u_full[idx, :][:number_of_samples]

X_train = torch.tensor(coords_sample, dtype=torch.float32, requires_grad=True)
# Targets get additive Gaussian noise with standard deviation 0.01.
y_train = torch.tensor(u_sample + np.random.normal(scale=0.01, size=u_sample.shape), dtype=torch.float32)

In [22]:
# Same settings as the earlier config cell, repeated so this section can be run on its own.
config = {'input_dim': 2, 'hidden_dim': 20, 'layers': 5, 'output_dim': 1, 'library_function': library_basic, 'library_args':{'poly_order': 2, 'diff_order': 2}}

# Recompute the model input for the newly sampled X_train.
X_input = create_deriv_data(X_train, config['library_args']['diff_order'])

In [23]:
# Reset both hyperparameters to 1.0 and build a fresh network, so training starts from scratch.
cov = nn.Parameter(torch.tensor(1.0))
noise = nn.Parameter(torch.tensor(1.0))

model = build_network(**config)

# One Adam optimizer with two parameter groups: network weights and (cov, noise).
optimizer = torch.optim.Adam([{'params': model.parameters()}, {'params':[cov, noise]}])
max_iterations = 10000

In [24]:
# Joint training from scratch: MSE plus the negative log-evidence term divided by the number of samples.
print('| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |')
for iteration in torch.arange(0, max_iterations + 1):
    # NN
    prediction, time_deriv_list, theta = model(X_input)
    loss_mse = torch.mean((prediction[0] - y_train)**2)
    
    ## Bayes
    M = theta.shape[1]  # number of library terms
    N = theta.shape[0]  # number of samples

    # Unlike the earlier loops (alpha = 1/cov), alpha is 1/cov**2 here.
    # Both cov and noise enter only squared, so their sign is not determined by the loss.
    alpha = 1/cov**2
    beta = 1/noise**2
    A = alpha * torch.eye(theta.shape[1]) + beta * theta.T @ theta
    mn = beta * torch.inverse(A) @ theta.T @ time_deriv_list[0]
    E = beta/2 * (time_deriv_list[0] - theta @ mn).T @ (time_deriv_list[0] - theta @ mn) + alpha/2 * mn.T @ mn
    
    # The 1/N factor rescales the evidence term per sample; sum(log(diag(chol(A)))) equals 1/2 log det(A).
    loss_bayes = -1/N * (M/2*torch.log(alpha) + N/2 * torch.log(beta) - E - torch.sum(torch.log(torch.diag(torch.cholesky(A)))))
    
    
    # Full loss
    loss = loss_mse + loss_bayes
        
    # Writing
    if iteration % 100 == 0:
        progress(iteration, 0, max_iterations, loss.item(), loss_mse.item(), loss_bayes.item(), 0)
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |
      10000    100.00%               0s   -3.57e+00   4.02e-02   -3.61e+00   0.00e+00 

Rescaling bayes loss with the number of samples definitely helps a lot.

In [25]:
cov

Parameter containing:
tensor(0.0229, requires_grad=True)

In [26]:
noise

Parameter containing:
tensor(-0.0269, requires_grad=True)

Maybe optimizing the hyperparameters jointly with the network is not correct? Maybe we should first optimize with fixed hyperparameters and only afterwards optimize the hyperparameters?

# Optimizing DeepMoD + Bayes with fixed hyperparams.

In [43]:
number_of_samples = 500

# Rebuild the full dataset from `data` instead of reusing X / y: both names were rebound
# to torch tensors (library and time derivative) in the Type II section, so on a
# top-to-bottom run they no longer hold the Burgers coordinates and targets.
coords_full = np.transpose((data['t'].flatten(), data['x'].flatten()))
u_full = np.real(data['u']).reshape((data['u'].size, 1))

# Shuffle all row indices, then keep the first `number_of_samples` shuffled rows (no noise added here).
idx = np.random.permutation(u_full.size)
X_train = torch.tensor(coords_full[idx, :][:number_of_samples], dtype=torch.float32, requires_grad=True)
y_train = torch.tensor(u_full[idx, :][:number_of_samples], dtype=torch.float32)

In [44]:
# Same settings as the earlier config cells, repeated so this section can be run on its own.
config = {'input_dim': 2, 'hidden_dim': 20, 'layers': 5, 'output_dim': 1, 'library_function': library_basic, 'library_args':{'poly_order': 2, 'diff_order': 2}}

# Recompute the model input for the newly sampled X_train.
X_input = create_deriv_data(X_train, config['library_args']['diff_order'])

In [51]:
# Hyperparameter values for the fixed-hyperparameter run. They are still wrapped in nn.Parameter,
# but the optimizer created in the next cell only receives model.parameters(), so they are never updated.
cov = nn.Parameter(torch.tensor(0.1))
noise = nn.Parameter(torch.tensor(0.0001))

# Fresh network so training starts from scratch.
model = build_network(**config)

In [None]:
# Adam over the network weights only; cov and noise are deliberately left out.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
max_iterations = 10000

In [55]:
# Training with fixed hyperparameters: MSE plus a strongly down-weighted negative log-evidence term.
print('| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |')
for iteration in torch.arange(0, max_iterations + 1):
    # NN
    prediction, time_deriv_list, theta = model(X_input)
    loss_mse = torch.mean((prediction[0] - y_train)**2)
    
    ## Bayes
    M = theta.shape[1]  # number of library terms
    N = theta.shape[0]  # number of samples

    # cov and noise are not in the optimizer here, so alpha and beta stay constant across iterations.
    alpha = 1/cov**2
    beta = 1/noise**2
    A = alpha * torch.eye(theta.shape[1]) + beta * theta.T @ theta
    mn = beta * torch.inverse(A) @ theta.T @ time_deriv_list[0]
    E = beta/2 * (time_deriv_list[0] - theta @ mn).T @ (time_deriv_list[0] - theta @ mn) + alpha/2 * mn.T @ mn
    
    # Operator precedence: -10**-8/N evaluates as -(10**-8)/N, i.e. the per-sample evidence term scaled by 1e-8.
    loss_bayes = -10**-8/N * (M/2*torch.log(alpha) + N/2 * torch.log(beta) - E - torch.sum(torch.log(torch.diag(torch.cholesky(A)))))
    
    
    # Full loss
    loss = loss_mse + loss_bayes
        
    # Writing
    # NOTE(review): 0 is passed as the second argument of progress; presumably that is a start
    # timestamp, which would explain the implausible "Time remaining" in the recorded output. Confirm.
    if iteration % 100 == 0:
        progress(iteration, 0, max_iterations, loss.item(), loss_mse.item(), loss_bayes.item(), 0)
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

| Iteration | Progress | Time remaining |     Cost |      MSE |      Reg |       L1 |
       1900     19.00%      6323589632s   1.77e-05   9.72e-06   7.99e-06   0.00e+00 

KeyboardInterrupt: 

In [56]:
mn

tensor([[ 2.2406e-04],
        [-2.3179e-02],
        [ 8.6250e-02],
        [ 1.7514e-02],
        [-1.0157e+00],
        [-2.0395e-02],
        [-6.8224e-02],
        [ 2.9492e-01],
        [-2.0452e-02]], grad_fn=<MmBackward>)

In [60]:
# Square root of the diagonal of A^{-1}, one value per library term, using the A left over from the last loop iteration.
# Reading A as the posterior precision, these are the posterior standard deviations of the coefficients.
torch.sqrt(torch.diag(torch.inverse(A)))

tensor([5.8054e-06, 1.6205e-04, 6.5226e-05, 1.3348e-04, 8.1526e-04, 3.1213e-04,
        3.5295e-04, 1.0502e-03, 3.9953e-04], grad_fn=<SqrtBackward>)

So it doesn't necessarily blow up... The problem is in the scaling. 