In [1]:
import torch

import pyro.distributions.transforms as T
import pyro.distributions as dist
from torch import nn
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Seed the global torch RNG so the synthetic data (and any later random draw
# made in the same kernel session) is reproducible under Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Dimensionality of each observation.
d = 1

# Bimodal toy data: 64 unit-variance draws shifted to -3 followed by 64
# shifted to +3, concatenated along the first axis -> shape (128, d).
x = torch.cat([torch.randn(64, d) - 3, torch.randn(64, d) + 3])

In [3]:
# Build a single affine autoregressive transform over d dimensions.
# NOTE(review): `aa` is not referenced by any later cell visible here; it looks
# like a leftover experiment. Confirm before removing.
aa = T.affine_autoregressive(d)



In [4]:
# Base density: a unit normal with one component per data dimension.
base_dist = dist.Normal(torch.zeros(d), torch.ones(d))

# Number of transforms chained together to form the flow.
num_layers = 8
transform = [T.affine_autoregressive(d) for _ in range(num_layers)]

# Distribution obtained by pushing the base density through every transform.
flow_dist = dist.TransformedDistribution(base_dist, transform)

# Gather the transforms that are nn.Modules so that their parameters can be
# passed to an optimizer as a single collection.
transform_modules = nn.ModuleList(
    [layer for layer in transform if isinstance(layer, nn.Module)]
)



In [5]:

def count_parameters(model):
    """Return the total number of scalar entries in the trainable parameters of ``model``.

    Only parameters yielded by ``model.parameters()`` whose ``requires_grad``
    flag is set contribute to the total; frozen parameters are ignored.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Trainable-parameter total over all flow layers (the recorded run shows 144).
count_parameters(transform_modules)

144

In [6]:
# Fit the flow by maximum likelihood: minimise the mean negative log-density
# that the flow assigns to the data x.
# NOTE(review): the recorded output shows the loss flat at ~2.6234 from step 500
# onward, and the run ended in a KeyboardInterrupt before step 2000 was logged.
# With d = 1 each affine autoregressive layer presumably has no earlier
# dimension to condition on, so the flow may be unable to represent the
# two-mode data. Verify before relying on this fit.
steps = 2048
optimizer = torch.optim.Adam(transform_modules.parameters(), lr=1e-2)

for step in range(steps + 1):  # steps + 1 iterations: step indices 0..steps inclusive
    optimizer.zero_grad()
    loss = -flow_dist.log_prob(x).mean()
    loss.backward()
    optimizer.step()
    # Drop values cached by the transforms; the parameters have just changed.
    flow_dist.clear_cache()

    # Report progress only on every 500th step.
    if step % 500 != 0:
        continue
    print('step: {}, loss: {}'.format(step, loss.item()))



step: 0, loss: 3.166220188140869
step: 500, loss: 2.6234347820281982
step: 1000, loss: 2.623434543609619
step: 1500, loss: 2.623434543609619


KeyboardInterrupt: 

In [None]:
# Draw 1000 samples from the flow with autograd disabled, squeeze away
# size-1 dimensions, and plot a kernel density estimate of the draws. The title
# reports the empirical mean and standard deviation of the sample.
with torch.no_grad():
    sample = flow_dist.sample((1000,)).squeeze().numpy()
    sns.kdeplot(sample)
    plt.title(f"Mean: {sample.mean()}, STD: {sample.std()}")

In [7]:
import torch

def gradient(y, x, grad_outputs=None):
    """Return the vector-Jacobian product of ``y`` with respect to ``x``.

    ``grad_outputs`` is the vector contracted against dy/dx; when it is omitted
    a tensor of ones shaped like ``y`` is used. The gradient is computed with
    ``create_graph=True`` so that the result can itself be differentiated.
    """
    weights = torch.ones_like(y) if grad_outputs is None else grad_outputs
    (grad,) = torch.autograd.grad(y, [x], grad_outputs=weights, create_graph=True)
    return grad

def jacobian(y, x):
    """Build the Jacobian dy/dx one row at a time.

    Row ``i`` is the vector-Jacobian product of ``y`` with respect to ``x`` for
    a one-hot ``grad_outputs`` selecting ``y[i]``, i.e. ``[1, 0, ..., 0]``,
    ``[0, 1, 0, ..., 0]``, ..., ``[0, ..., 0, 1]``.

    The rows are stacked instead of being written into a buffer created with
    ``torch.zeros``, so the result keeps the dtype and device of the gradients
    rather than being coerced to the default floating-point type.

    Args:
        y: Output tensor, indexed along its first dimension (expected 1-D).
        x: Input tensor that ``y`` was computed from (expected 1-D).

    Returns:
        Tensor of shape ``(y.shape[0], x.shape[0])``.

    Note:
        A later cell runs ``from torch.autograd.functional import jacobian``,
        which rebinds this name to the library function.
    """
    n_out = y.shape[0]
    if n_out == 0:
        # torch.stack rejects an empty list; return an empty Jacobian instead.
        return x.new_zeros(0, x.shape[0])

    rows = []
    for i in range(n_out):
        one_hot = torch.zeros_like(y)
        one_hot[i] = 1
        rows.append(gradient(y, x, grad_outputs=one_hot))
    return torch.stack(rows)

In [75]:
# Two-layer network (2 -> 2 -> 2) with a LeakyReLU between the linear maps.
# Despite its name, this module is not a single linear layer.
linear = nn.Sequential(
    nn.Linear(2, 2),
    nn.LeakyReLU(),
    nn.Linear(2, 2),
)

In [76]:
# Switch on gradient tracking for the data tensor by mutating it in place.
# NOTE(review): no later cell visible here reads the module-level `x` again.
# Confirm this is still needed.
x.requires_grad = True

In [77]:
from torch.autograd.functional import jacobian
def exp_reducer(x):
    """Apply the module-level ``linear`` network to ``x`` and return its output.

    Despite the name, no exponential or reduction is applied here.
    """
    output = linear(x)
    return output


In [78]:
# `jacobian` is torch.autograd.functional.jacobian here (imported in the
# preceding cell), not the hand-written helper defined earlier.
# The recorded result is a 2x2x2x2 tensor in which every block pairing an
# output row with a different input row is zero.
jacobian(exp_reducer, torch.randn(2, 2))

tensor([[[[ 0.0003, -0.0007],
          [ 0.0000,  0.0000]],

         [[-0.0009,  0.0008],
          [ 0.0000,  0.0000]]],


        [[[ 0.0000,  0.0000],
          [-0.0107, -0.0085]],

         [[ 0.0000,  0.0000],
          [-0.0280, -0.0187]]]])

In [79]:
from torch import autograd

In [80]:
def batch_jacobian(func, x, create_graph=False):
    """Compute one Jacobian of ``func`` per row of a batched input.

    ``x`` has shape (Batch, Length). The outputs are summed over the batch
    axis before differentiation, so a single Jacobian call covers all rows.
    This matches the per-sample Jacobians only when each output row of
    ``func`` depends solely on the corresponding input row.

    Args:
        func: Callable mapping a (Batch, Length) tensor to a batched output.
        x: Input tensor of shape (Batch, Length).
        create_graph: Forwarded to ``autograd.functional.jacobian``.

    Returns:
        Tensor with the batch axis first: (Batch, Out, Length).
    """
    def summed_over_batch(inputs):
        return func(inputs).sum(dim=0)

    stacked = autograd.functional.jacobian(
        summed_over_batch, x, create_graph=create_graph, vectorize=True
    )
    # The Jacobian comes back as (Out, Batch, Length); move the batch axis to the front.
    return stacked.permute(1, 0, 2)

In [81]:
# Per-sample Jacobians of the network for a random batch of two 2-element inputs.
J = batch_jacobian(exp_reducer, torch.rand(2, 2))

In [82]:
# Inspect the layout of the result; the recorded output is torch.Size([2, 2, 2]).
J.shape

torch.Size([2, 2, 2])

In [83]:
# Determinant of each 2x2 Jacobian in the batch (the recorded output has one value per sample).
torch.linalg.det(J)

tensor([-3.9905e-05, -3.9905e-05])