In [16]:
import torch
from torch.autograd import Variable, grad
from torch.utils.data import DataLoader, TensorDataset

from collections import OrderedDict

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os

# Working directory for the project. The original absolute path is kept as the
# default, but it can be overridden with the NOTEBOOK_PROJECT_DIR environment
# variable, and the chdir is skipped when the directory does not exist so the
# notebook still starts on other machines.
PROJECT_DIR = os.environ.get("NOTEBOOK_PROJECT_DIR", "/home/s2113174/Projects-1")
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

# Seeding is intentionally left disabled, as in the original notebook.
#np.random.seed(1234)

# CUDA for PyTorch: use the first GPU when one is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [17]:
# Deep neural network
class DNN(torch.nn.Module):
    """Fully connected network with Tanh activations between linear layers.

    Parameters
    ----------
    layers : sequence of int
        Layer widths, input width first and output width last. Every
        consecutive pair becomes one ``torch.nn.Linear``; each linear layer
        except the last is followed by a Tanh.

    Attributes
    ----------
    depth : int
        Number of linear layers (``len(layers) - 1``).
    activation : type
        Activation class that is instantiated after each hidden linear layer.
    layers : torch.nn.Sequential
        Sub-modules named ``layer_<i>`` / ``activation_<i>``.
    """

    def __init__(self, layers):
        super(DNN, self).__init__()

        self.depth = len(layers) - 1
        self.activation = torch.nn.Tanh

        # Hidden blocks: Linear followed by the activation, registered in order
        modules = OrderedDict()
        hidden_pairs = zip(layers[:-2], layers[1:-1])
        for idx, (n_in, n_out) in enumerate(hidden_pairs):
            modules['layer_%d' % idx] = torch.nn.Linear(n_in, n_out)
            modules['activation_%d' % idx] = self.activation()

        # Output layer: linear only, no activation afterwards
        modules['layer_%d' % (self.depth - 1)] = torch.nn.Linear(layers[-2], layers[-1])

        self.layers = torch.nn.Sequential(modules)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return the result."""
        return self.layers(x)

In [18]:
def test_set(max_space=2, obs=1, param=1, mean=0, std=0):
    """Sample the curve ``param / (2*pi) * sin(2*pi*t)`` on an even grid.

    Parameters
    ----------
    max_space : right end of the grid; the grid starts at 0.
    obs : number of grid points.
    param : scalar multiplying the curve.
    mean, std : mean and standard deviation of the Gaussian noise that is
        added to the curve (the default ``std=0`` adds zeros).

    Returns
    -------
    (t, noisy) : two NumPy arrays of length ``obs`` -- the grid and the
        noise-perturbed curve values.
    """
    grid = np.linspace(0, max_space, obs)
    scale = param / (2 * np.pi)
    clean = scale * np.sin(2 * np.pi * grid)
    noisy = clean + np.random.normal(mean, std, len(grid))
    return grid, noisy


def data(max_space=2, obs=1, param=1, mean=0, std=0.1):
    """Build a single-batch DataLoader of noisy samples of the sine curve.

    The curve ``param / (2*pi) * sin(2*pi*t)`` is evaluated on ``obs`` evenly
    spaced points in ``[0, max_space]`` and perturbed with Gaussian noise of
    the given ``mean`` and ``std``.

    Returns
    -------
    torch.utils.data.DataLoader
        Yields ``(x, y)`` float32 tensors of shape ``(obs, 1)``; the batch
        size equals ``obs``, so the whole data set comes out as one batch.
    """
    grid = np.linspace(0, max_space, obs)
    noisy = (param / (2 * np.pi)) * np.sin(2 * np.pi * grid) + np.random.normal(mean, std, len(grid))

    # Column vectors in float32
    inputs = torch.tensor(grid).float().reshape(-1, 1)
    targets = torch.tensor(noisy).float().reshape(-1, 1)

    return DataLoader(TensorDataset(inputs, targets), batch_size=obs)

In [19]:
# Number of observations used throughout the notebook
nobs = 2

# Noise-free evaluation grid and targets (test_set defaults to std=0)
t, y = test_set(obs=nobs)

# Widths: 1 input -> one hidden layer of 10 units -> 1 output
layers = [1, *([10] * 1), 1]
model = DNN(layers)

# Mean-reduced squared error
loss = torch.nn.MSELoss(reduction='mean')

In [20]:
from backpack import backpack, extend
from backpack.extensions import DiagHessian, DiagGGNExact

# Wrap the model and the loss with BackPACK's extend() so that the extensions
# requested below can be computed during the backward pass.
model_ = extend(model, use_converter=True)
lossfunc_ = extend(loss)

# Loss of the extended model on the (t, y) pair produced by test_set
loss_ = lossfunc_(
    model_(Variable(torch.tensor(t).float().reshape(-1,1),requires_grad=True)),
    torch.tensor(y).float().reshape(-1,1),
)

# Both extensions are requested; only the exact GGN diagonal is printed below.
with backpack(DiagHessian(), DiagGGNExact()):
    loss_.backward()

# The value (not the shape) of each parameter's GGN diagonal is printed, so the
# label no longer claims to show `.shape`.
for name, param in model_.named_parameters():
    print(name)
    print(".diag_ggn_exact:   ", param.diag_ggn_exact)


layers.layer_0.weight
.diag_ggn_exact.shape:    tensor([[9.6760e-07],
        [1.8531e-02],
        [6.1228e-03],
        [1.4710e-04],
        [2.2455e-01],
        [6.6965e-02],
        [1.6396e-01],
        [2.0822e-04],
        [5.3360e-04],
        [2.2814e-01]])
layers.layer_0.bias
.diag_ggn_exact.shape:    tensor([0.0003, 0.0100, 0.0158, 0.0004, 0.1111, 0.0459, 0.0921, 0.0045, 0.0192,
        0.1093])
layers.layer_1.weight
.diag_ggn_exact.shape:    tensor([[1.5238, 0.5709, 0.9521, 0.7752, 0.1495, 0.2799, 0.4226, 0.8920, 1.1312,
         0.4282]])
layers.layer_1.bias
.diag_ggn_exact.shape:    tensor([2.])


In [21]:
from laplace import Laplace

# Laplace approximation over the last layer only, with a diagonal Hessian structure
la = Laplace(model, 'regression', subset_of_weights='last_layer', hessian_structure='diag')

# data() returns a DataLoader whose batch size equals obs, i.e. a single batch.
# Its targets carry noise (std=0.1 by default), unlike the (t, y) from test_set.
dta = data(obs = nobs)

la.fit(dta)

# Hessian held by the Laplace object after fitting
print(la.H)

# NOTE: this rebinds `y` (previously the NumPy targets from test_set) to the
# tensor targets of the DataLoader batch; later cells read this new `y`.
x,y = next(iter(dta))

#la.model.forward_with_features(x)

# Calling the fitted object returns two values; presumably the predictive mean
# and variance at x -- TODO confirm against the laplace documentation.
fm, varl = la(x)

print(varl)

tensor([1.5238, 0.5709, 0.9521, 0.7752, 0.1495, 0.2799, 0.4226, 0.8920, 1.1312,
        0.4282, 2.0000])
tensor([[[1.3187]],

        [[3.1874]]])


In [22]:
# Captured module outputs, keyed by the label passed to forw_getActivation
forw_activation = {}
def forw_getActivation(name):
    """Return a forward hook that stores the hooked module's output.

    The hook writes a detached copy of the output into the module-level
    ``forw_activation`` dict under the key ``name``.
    """
    def _capture(module, inputs, outputs):
        # Signature required by register_forward_hook: (module, input, output)
        forw_activation[name] = outputs.detach()
    return _capture

# Index 1 of the Sequential is the module DNN registers as `activation_0`, i.e.
# the Tanh after the first linear layer. With a single hidden layer this is the
# input to the last linear layer. The handle is kept so the hook can be removed.
h1 = model.layers[1].register_forward_hook(forw_getActivation('layers.activation_0'))

In [23]:
# Build the network input as a float32 column that requires grad.
# torch.as_tensor(...).detach() accepts both the NumPy grid from test_set and a
# tensor left over from a previous run of this cell, so re-running does not
# trigger the "copy construct from a tensor" UserWarning that torch.tensor() emits.
t = torch.as_tensor(t).detach().float().reshape(-1, 1).requires_grad_(True)
y_ = model(t)

# The forward pass above has filled forw_activation; the hook is no longer needed
h1.remove()

# `y` may already be a tensor (it is rebound from the DataLoader batch earlier),
# hence as_tensor instead of torch.tensor here as well.
Loss = loss(y_, torch.as_tensor(y).detach().float().reshape(-1, 1))

# First derivative of the loss w.r.t. the network output, keeping the graph so
# that it can be differentiated once more
df_f = grad(Loss, y_, create_graph=True)[0]

# Second derivative of the loss w.r.t. the network output
ddf_ff = grad(df_f, y_, torch.ones_like(df_f))[0]

  Loss = loss(y_,torch.tensor(y).float().reshape(-1,1))


In [24]:
# Weight and bias of the final linear layer
wt, bias = model.layers[-1].weight, model.layers[-1].bias

# Append the (single) bias as an extra column after the weight row
param_MAP = torch.cat((wt, bias.reshape(1, 1)), 1)

# Number of last-layer parameters as a plain int. The previous `.shape` gave a
# torch.Size, which never compares equal to an int such as len(prior_precision).
nparam = param_MAP.numel()

In [25]:
# Derivative of the output w.r.t. the last-layer parameters: the captured
# hidden activations, plus a column of ones for the bias
df_theta = torch.cat(
    (forw_activation['layers.activation_0'], torch.ones_like(ddf_ff)),
    dim=1,
)

# Diagonal Hessian: sum over observations of J * d2L/df2 * J, scaled by nobs/2
H = (nobs / 2) * (df_theta * ddf_ff * df_theta).sum(dim=0)

print(H)

# Compare with the Hessian computed by the Laplace object
print(np.linalg.norm(H - la.H, ord=2))
print(np.allclose(H, la.H))

tensor([1.5238, 0.5709, 0.9521, 0.7752, 0.1495, 0.2799, 0.4226, 0.8920, 1.1312,
        0.4282, 2.0000])
2.8429605e-07
True


In [11]:
# Network output and the features feeding the last layer
f = y_
phi = forw_activation['layers.activation_0']

bsize, output_size = phi.shape[0], f.shape[-1]

# One identity matrix per batch element: (batch, output, output)
identity = torch.eye(output_size, device=x.device)[None].repeat(bsize, 1, 1)

# Weight part of the Jacobian, flattened to (batch, output, output * features)
Js = torch.einsum('kp,kij->kijp', phi, identity).reshape(bsize, output_size, -1)

# Bias part appended along the parameter axis
Js = torch.cat([Js, identity], dim=2)

In [12]:
def sigma_noise():
    """Return the module-level ``_sigma_noise`` value.

    ``_H_factor`` squares this value and divides by it when scaling the Hessian.
    """
    return _sigma_noise

def _H_factor():
    """Return ``1 / sigma_noise()**2 / temperature``.

    Reads the module-level ``temperature`` and the value returned by
    ``sigma_noise()``.
    """
    noise = sigma_noise()
    inverse_noise_variance = 1 / (noise * noise)
    return inverse_noise_variance / temperature

def prior_precision_diag(prior_precision,n_params):
    """Expand a prior precision into one precision value per parameter.

    Parameters
    ----------
    prior_precision : torch.Tensor
        Either a single precision (one element, or a 0-d tensor) that is
        broadcast to every parameter, or a 1-D tensor with exactly
        ``n_params`` entries that is returned unchanged. Per-layer priors
        are not supported here.
    n_params : int or tuple / torch.Size
        Number of parameters. A shape such as ``tensor.shape`` is accepted and
        reduced to its total number of elements.

    Returns
    -------
    prior_precision_diag : torch.Tensor
        1-D tensor of length ``n_params`` on the same device as
        ``prior_precision``.

    Raises
    ------
    ValueError
        If ``prior_precision`` has neither one nor ``n_params`` entries.
    """
    # Callers in this notebook pass `tensor.shape`; collapse it to an int so
    # that the length comparison below can actually succeed.
    if isinstance(n_params, (tuple, list)):
        total = 1
        for dim in n_params:
            total *= int(dim)
        n_params = total
    n_params = int(n_params)

    prior_precision = torch.as_tensor(prior_precision)
    if prior_precision.ndim == 0:
        # len() is undefined for 0-d tensors; treat them as a scalar prior
        prior_precision = prior_precision.reshape(1)

    if len(prior_precision) == 1:  # scalar
        # Allocate on the prior's own device: multiplying a 1-D CPU tensor with
        # a tensor on another device raises.
        return prior_precision * torch.ones(n_params, device=prior_precision.device)

    if len(prior_precision) == n_params:  # diagonal
        return prior_precision

    # Previously this case fell through and silently returned None.
    raise ValueError(
        'Mismatch of prior and model: expected a scalar or a diagonal prior '
        'with %d entries, got %d entries.' % (n_params, len(prior_precision))
    )

# Module-level values read by sigma_noise() and _H_factor(), plus a scalar
# (one-element) prior precision. All three are integer tensors with value 1.
_sigma_noise=torch.tensor([1])
temperature=torch.tensor([1])
prior_precision=torch.tensor([1])

# NOTE(review): this rebinds the name `prior_precision_diag` from the function
# defined above to the tensor it returns. Later cells read the name as a
# tensor, and the function stays shadowed until its `def` is executed again.
prior_precision_diag = prior_precision_diag(prior_precision,nparam)

In [13]:
# Diagonal posterior precision: scaled Hessian diagonal plus the prior precision.
# (`post_presicion` is a misspelling of "precision", kept as the variable name.)
post_presicion = _H_factor() * H + prior_precision_diag


# A diagonal precision is inverted element-wise
post_variance = 1 / post_presicion


# J diag(post_variance) J^T for every batch element: (batch, output, output)
functional_var = torch.einsum('ncp,p,nkp->nck', Js, post_variance, Js)

print(functional_var.flatten())

# Distance to `varl`, the second value returned by the Laplace object's call
print(np.linalg.norm((functional_var.flatten()-varl.flatten()),ord=2))

tensor([1.6785, 2.7301])
0.0


## Last-layer full Hessian


In [14]:
# NOTE: rebinds `model` to a newly constructed DNN, so its parameters differ
# from the network used in the diagonal-Hessian cells.
model = DNN(layers)

# Same last-layer Laplace setup as before, but with a full Hessian structure
la1 = Laplace(model, 'regression', subset_of_weights='last_layer', hessian_structure='full')

# New DataLoader, so the noisy targets are drawn again
dta = data(obs = nobs)

la1.fit(dta)

print(la1.H)

# Rebinds x and y to the single batch of the new DataLoader
x,y = next(iter(dta))

#la.model.forward_with_features(x)

# Presumably the predictive mean and variance at x -- TODO confirm against the
# laplace documentation.
fm, varl = la1(x)

print(varl)

tensor([[ 1.2130, -0.7153, -0.8653, -0.4236,  0.4071,  0.9955, -0.3935, -0.6093,
         -1.3308, -1.0323,  1.4968],
        [-0.7153,  0.4655,  0.5691,  0.4405, -0.4495, -0.6220,  0.1000,  0.5000,
          0.7534,  0.6137, -0.8008],
        [-0.8653,  0.5691,  0.6966,  0.5589, -0.5724, -0.7573,  0.1029,  0.6242,
          0.9073,  0.7432, -0.9576],
        [-0.4236,  0.4405,  0.5589,  0.9796, -1.0555, -0.5002, -0.4386,  0.8266,
          0.3284,  0.3825, -0.1659],
        [ 0.4071, -0.4495, -0.5724, -1.0555,  1.1397,  0.5017,  0.5005, -0.8786,
         -0.2970, -0.3707,  0.1105],
        [ 0.9955, -0.6220, -0.7573, -0.5002,  0.5017,  0.8450, -0.2172, -0.6127,
         -1.0671, -0.8512,  1.1629],
        [-0.3935,  0.1000,  0.1029, -0.4386,  0.5005, -0.2172,  0.5265, -0.2274,
          0.5261,  0.3196, -0.7326],
        [-0.6093,  0.5000,  0.6242,  0.8266, -0.8786, -0.6127, -0.2274,  0.7591,
          0.5679,  0.5348, -0.4885],
        [-1.3308,  0.7534,  0.9073,  0.3284, -0.2970, -1

In [15]:
from torch.distributions.multivariate_normal import _precision_to_scale_tril


# Start from an empty store of captured module outputs for this section
forw_activation = {}
def forw_getActivation(name):
    """Create a forward hook that records the module output under ``name``.

    The recorded tensor is detached and placed in the module-level
    ``forw_activation`` dict.
    """
    def _record(module, args, result):
        # Positional signature expected by register_forward_hook
        forw_activation[name] = result.detach()
    return _record

# Capture the output of the module at index 1 (registered by DNN as
# `activation_0`) of the current model.
h1 = model.layers[1].register_forward_hook(forw_getActivation('layers.activation_0'))

# as_tensor + detach accepts a NumPy grid as well as a tensor left over from an
# earlier cell, and avoids the "copy construct from a tensor" UserWarning that
# torch.tensor(<tensor>) emits.
t = torch.as_tensor(t).detach().float().reshape(-1, 1).requires_grad_(True)
y_ = model(t)

# Activations are stored; remove the hook again
h1.remove()

Loss = loss(y_, torch.as_tensor(y).detach().float().reshape(-1, 1))

# First and second derivative of the loss w.r.t. the network output
df_f = grad(Loss, y_, create_graph=True)[0]

ddf_ff = grad(df_f, y_, torch.ones_like(df_f))[0]

# Last-layer weight row with the bias appended as an extra column
wt, bias = model.layers[-1].weight, model.layers[-1].bias

param_MAP = torch.cat((wt, bias.reshape(1, 1)), 1)

# Plain int parameter count (a torch.Size never compares equal to an int)
nparam = param_MAP.numel()

# Derivative of the output w.r.t. the last-layer parameters: features plus a
# column of ones for the bias
df_theta = torch.cat((forw_activation['layers.activation_0'], torch.ones_like(ddf_ff)), 1)

# Full Hessian: sum_b J_b^T (d2L/df2)_b J_b, scaled by nobs/2.
# The per-observation factor ddf_ff is now included, as in the diagonal
# computation earlier in the notebook. For the mean-reduced MSE it equals
# 2/nobs, so leaving it out was only correct when nobs == 2.
H = (nobs/2)*torch.einsum('bc,b,bd->cd', df_theta, ddf_ff.reshape(-1), df_theta)

print(H)

print(np.linalg.norm((H-la1.H),ord=2))


f, phi = y_,forw_activation['layers.activation_0']

bsize = phi.shape[0]
output_size = f.shape[-1]

# calculate Jacobians using the feature vector 'phi'
identity = torch.eye(output_size, device=x.device).unsqueeze(0).tile(bsize, 1, 1)
# Jacobians are batch x output x params
Js = torch.einsum('kp,kij->kijp', phi, identity).reshape(bsize, output_size, -1)
Js = torch.cat([Js, identity], dim=2)


# Posterior precision: scaled Hessian plus the diagonal prior precision.
# `prior_precision_diag` is read here as the tensor bound in the prior cell.
post_presicion = _H_factor() * H + torch.diag(prior_precision_diag)

# Triangular factor of the precision's inverse; the covariance is then formed
# as factor @ factor.T
post_scale = _precision_to_scale_tril(post_presicion)

post_cov = post_scale @ post_scale.T

# J post_cov J^T for every batch element: (batch, output, output)
functional_var = torch.einsum('ncp,pq,nkq->nck', Js, post_cov, Js)

print(functional_var.flatten())

print(np.linalg.norm((functional_var.flatten()-varl.flatten()),ord=2))

tensor([[ 1.2130, -0.7153, -0.8653, -0.4236,  0.4071,  0.9955, -0.3935, -0.6093,
         -1.3308, -1.0323,  1.4968],
        [-0.7153,  0.4655,  0.5691,  0.4405, -0.4495, -0.6220,  0.1000,  0.5000,
          0.7534,  0.6137, -0.8008],
        [-0.8653,  0.5691,  0.6966,  0.5589, -0.5724, -0.7573,  0.1029,  0.6242,
          0.9073,  0.7432, -0.9576],
        [-0.4236,  0.4405,  0.5589,  0.9796, -1.0555, -0.5002, -0.4386,  0.8266,
          0.3284,  0.3825, -0.1659],
        [ 0.4071, -0.4495, -0.5724, -1.0555,  1.1397,  0.5017,  0.5005, -0.8786,
         -0.2970, -0.3707,  0.1105],
        [ 0.9955, -0.6220, -0.7573, -0.5002,  0.5017,  0.8450, -0.2172, -0.6127,
         -1.0671, -0.8512,  1.1629],
        [-0.3935,  0.1000,  0.1029, -0.4386,  0.5005, -0.2172,  0.5265, -0.2274,
          0.5261,  0.3196, -0.7326],
        [-0.6093,  0.5000,  0.6242,  0.8266, -0.8786, -0.6127, -0.2274,  0.7591,
          0.5679,  0.5348, -0.4885],
        [-1.3308,  0.7534,  0.9073,  0.3284, -0.2970, -1

  t = Variable(torch.tensor(t).float().reshape(-1,1),requires_grad=True)
  Loss = loss(y_,torch.tensor(y).float().reshape(-1,1))
