In [1]:
%cd ../

/scratch/km817/iREC


In [88]:
import numpy as np
import pandas as pd
np.random.seed(0)
#!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data" --no-check-certificate
#!curl -O "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 
data = pd.read_csv('winequality-red.csv', header=1, delimiter=';').values
data.shape

(1598, 12)

In [3]:
import torch
import pyro
from torch import nn
import pyro.distributions as dist
from pyro.infer import HMC, MCMC, SVI, NUTS, TraceMeanField_ELBO
from pyro import poutine
from sklearn.datasets import load_boston
import numpy as np
import torch.nn.functional as F
from tqdm.notebook import trange
from rec.utils import kl_estimate_with_mc

In [152]:
x_ = data[:, :-1]
y_ = data[:, -1]

In [153]:
test_splits_idxs = []
for d in range(x_.shape[-1]):
    sorted_x = np.argsort(x_[:,d], axis=-1)
    total_points = sorted_x.shape[0]
    lower_third = total_points // 3
    upper_third = total_points * 2 // 3
    test_index = sorted_x[lower_third: upper_third]
    test_splits_idxs.append(test_index)

In [154]:
test_splits_x, test_splits_y = [], []
train_splits_x, train_splits_y = [], []
for d in range(x_.shape[-1]):
    a = np.arange(x_.shape[0])
    test_index = test_splits_idxs[d]
    train_index = np.delete(a, test_index, axis=0)
    x_train = x_[train_index]
    y_train = y_[train_index]
    x_test = x_[test_index][:]
    y_test = y_[test_index][:]
    x_m = x_train.mean(0)
    x_s = x_train.std(0)
    x_train = (x_train - x_m) / x_s
    x_test = (x_test - x_m) / x_s
    test_splits_x.append(x_test)
    test_splits_y.append(y_test)
    train_splits_x.append(x_train)
    train_splits_y.append(y_train)

In [164]:
D_in = x_train.shape[1]
x_train = torch.FloatTensor(np.array(train_splits_x))
y_train = torch.FloatTensor(np.array(train_splits_y))
x_test= torch.FloatTensor(np.array(test_splits_x))
y_test = torch.FloatTensor(np.array(test_splits_y))

In [181]:
def regression_model(x, y=None, weight_samples=None, in_size=D_in, num_nodes=10, out_size=1, ELBO_BETA=1.):
    # sample vector of weights for regression
    total_weights = (in_size + 1) * num_nodes + (num_nodes + 1) * num_nodes + (num_nodes + 1) * out_size
    # sample params
    with poutine.scale(scale=ELBO_BETA):
        params = pyro.sample("params", dist.Normal(torch.zeros(total_weights + 1), 1.).to_event(1))
    weights, rho = params[:-1], params[-1]

    idx = 0
    fc1_weights = weights[idx: idx + in_size * num_nodes].reshape(num_nodes, in_size)
    idx += in_size * num_nodes
    fc1_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
    idx += num_nodes

    fc2_weights = weights[idx: idx + num_nodes * num_nodes].reshape(num_nodes, num_nodes)
    idx += num_nodes * num_nodes
    fc2_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
    idx += num_nodes

    fc3_weights = weights[idx: idx + num_nodes * out_size].reshape(out_size, num_nodes)
    idx += num_nodes * out_size
    fc3_bias = weights[idx: idx + out_size].reshape(out_size)
    idx += out_size

    assert idx == total_weights, "Something wrong with number of weights!"

    # compute forward pass
    batch_shape = x.shape[0]
    x = torch.einsum("ij, kj -> ki", fc1_weights, x) + fc1_bias[None].repeat(batch_shape, 1)
    x = torch.tanh(x)

    x = torch.einsum("ij, kj -> ki", fc2_weights, x) + fc2_bias[None].repeat(batch_shape, 1)
    x = torch.tanh(x)

    x = torch.einsum("ij, kj -> ki", fc3_weights, x) + fc3_bias[None].repeat(batch_shape, 1)
    mu = x.squeeze()

    with pyro.plate("data", x.shape[0]):
        obs = pyro.sample("obs", dist.Normal(mu, F.softplus(rho)), obs=y)
    return mu


def KDE_guide(x, y=None, weight_samples=None, in_size=D_in, num_nodes=10, out_size=1, ELBO_BETA=None):
    total_weights = (in_size + 1) * num_nodes + (num_nodes + 1) * num_nodes + (num_nodes + 1) * out_size
    iso_noise = pyro.param("iso_noise", torch.tensor(1e-3), constraint=dist.constraints.positive)
    assignment = dist.Categorical(probs=torch.ones(weight_samples.shape[0])).sample()

    # sample assigmnent
    with poutine.scale(scale=ELBO_BETA):
        params = pyro.sample("params", dist.Normal(weight_samples[assignment], iso_noise).to_event(1))

    weights, rho = params[:-1], params[-1]
    idx = 0
    fc1_weights = weights[idx: idx + in_size * num_nodes].reshape(num_nodes, in_size)
    idx += in_size * num_nodes
    fc1_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
    idx += num_nodes

    fc2_weights = weights[idx: idx + num_nodes * num_nodes].reshape(num_nodes, num_nodes)
    idx += num_nodes * num_nodes
    fc2_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
    idx += num_nodes

    fc3_weights = weights[idx: idx + num_nodes * out_size].reshape(out_size, num_nodes)
    idx += num_nodes * out_size
    fc3_bias = weights[idx: idx + out_size].reshape(out_size)
    idx += out_size

    assert idx == total_weights, "Something wrong with number of weights!"

    # compute forward pass
    batch_shape = x.shape[0]
    x = torch.einsum("ij, kj -> ki", fc1_weights, x) + fc1_bias[None].repeat(batch_shape, 1)
    x = torch.relu(x)

    x = torch.einsum("ij, kj -> ki", fc2_weights, x) + fc2_bias[None].repeat(batch_shape, 1)
    x = torch.relu(x)

    x = torch.einsum("ij, kj -> ki", fc3_weights, x) + fc3_bias[None].repeat(batch_shape, 1)
    mu = x.squeeze()


def make_empirical_gmm(samples, num_nodes, x_test):
    rho_noise = samples['params'][:, -1]
    noise = F.softplus(rho_noise)
    preds_dict = Predictive(regression_model, samples, return_sites=['_RETURN'])(x_test, None, num_nodes=num_nodes)
    preds = preds_dict['_RETURN']
    mix = dist.Categorical(torch.ones(preds.shape[0]))
    comp = dist.Normal(loc=preds.squeeze().permute(1, 0), scale=noise)
    gmm = dist.MixtureSameFamily(mix, comp)
    return gmm


In [166]:
class deterministic_regression_model(nn.Module):
    def __init__(self, params, input_size=1, num_nodes=10, output_size=1):
        super(deterministic_regression_model, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.activation = nn.Tanh()
        weights, rho = params[:-1], params[-1]

        idx = 0
        self.fc1_weights = weights[idx: idx + in_size * num_nodes].reshape(num_nodes, in_size)
        idx += in_size * num_nodes
        self.fc1_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
        idx += num_nodes

        self.fc2_weights = weights[idx: idx + num_nodes * num_nodes].reshape(num_nodes, num_nodes)
        idx += num_nodes * num_nodes
        self.fc2_bias = weights[idx: idx + num_nodes].reshape(num_nodes)
        idx += num_nodes

        self.fc3_weights = weights[idx: idx + num_nodes * out_size].reshape(out_size, num_nodes)
        idx += num_nodes * out_size
        self.fc3_bias = weights[idx: idx + out_size].reshape(out_size)
        idx += out_size
        
        self.weights = weights
        self.rho = rho
        self.params = params

        # compute forward pass
    
    def forward(self, x):
        batch_shape = x.shape[0]
        x = torch.einsum("ij, kj -> ki", self.fc1_weights, x) + self.fc1_bias[None].repeat(batch_shape, 1)
        x = torch.tanh(x)

        x = torch.einsum("ij, kj -> ki", self.fc2_weights, x) + self.fc2_bias[None].repeat(batch_shape, 1)
        x = torch.tanh(x)

        x = torch.einsum("ij, kj -> ki", self.fc3_weights, x) + self.fc3_bias[None].repeat(batch_shape, 1)
        x = x.squeeze()
        
        return x
    
    def weight_prior_lp(self):
        return dist.Normal(loc=0., scale=1.).log_prob(self.params).sum()
    
    def data_likelihood(self, x, y):
        likelihood = D.Normal(loc=self.forward(x),
                              scale=F.softplus(self.rho))
        return likelihood.log_prob(y).sum()
    
    def joint_log_prob(self, x, y):
        return self.data_likelihood(x, y) + self.weight_prior_lp(x, y)

In [183]:
pyro.set_rng_seed(10)
ELBO_BETA = 1.

in_size = x_train.shape[-1]
out_size = 1
num_nodes = 10

# run HMC
kernel = HMC(regression_model, step_size=0.001, num_steps=5, target_accept_prob=0.8)
nuts_kernel = NUTS(regression_model, step_size=0.1, target_accept_prob=0.5, max_tree_depth=5)
mcmc = MCMC(kernel, num_samples=1000, warmup_steps=1000, num_chains=1)
mcmc.run(x_train[0], y_train[0], ELBO_BETA=ELBO_BETA, num_nodes=num_nodes, in_size=D_in)

Sample: 100%|██████████| 2000/2000 [00:03, 507.66it/s, step size=1.01e-01, acc. prob=0.922]


In [184]:
full_samples = mcmc.get_samples(100)
from pyro.infer import Predictive
pred = Predictive(regression_model, full_samples, return_sites=['obs', '_RETURN'])(x_test[0], None, 
                                                                        num_nodes=num_nodes, in_size=D_in)
HMC_RMSE = ((pred['_RETURN'].mean(0) - y_test[0]) ** 2).mean().sqrt()

In [185]:
optimizer = pyro.optim.Adam({"lr": 5e-2})

# train KDE
svi = SVI(regression_model, KDE_guide, optimizer, loss=TraceMeanField_ELBO())

num_iterations = 5000
pyro.clear_param_store()
pbar = trange(num_iterations)
losses = []
for j in pbar:
    # calculate the loss and take a gradient step
    loss = svi.step(x_train[0], y_train[0], full_samples['params'], ELBO_BETA=ELBO_BETA, num_nodes=num_nodes, in_size=D_in)
    losses.append(loss)
    pbar.set_description("[iteration %04d] loss: %.4f" % (j + 1, loss / len(x_train)))

kde_noise = pyro.param("iso_noise")
flattened_params = full_samples['params']
kde_mix = dist.Categorical(probs=torch.ones(flattened_params.shape[0]))
kde_comps = dist.MultivariateNormal(loc=flattened_params,
                                    covariance_matrix=kde_noise * torch.eye(flattened_params.shape[-1]))
kde = dist.MixtureSameFamily(kde_mix, kde_comps)
prior = dist.MultivariateNormal(loc=torch.ones_like(flattened_params[0]),
                                covariance_matrix=torch.eye(flattened_params[0].shape[-1]))
kl_kde_prior = kl_estimate_with_mc(kde, prior)
kde_sample = kde.sample((50,))
kde_samples = {"params" : kde_sample}
kde_pred = Predictive(regression_model, kde_samples, return_sites=['obs', '_RETURN'])(x_test[0], None, 
                                                                        num_nodes=num_nodes, in_size=D_in)
KDE_RMSE = ((kde_pred['_RETURN'].mean(0) - y_test[0]) ** 2).mean().sqrt()

  0%|          | 0/5000 [00:00<?, ?it/s]

In [186]:
kde_sample = kde.sample((500,))
kde_samples = {"params" : kde_sample}
kde_pred = Predictive(regression_model, kde_samples, return_sites=['obs', '_RETURN'])(x_test[0], None, 
                                                                        num_nodes=num_nodes, in_size=D_in)
KDE_RMSE = ((kde_pred['_RETURN'].mean(0) - y_test[0]) ** 2).mean().sqrt()

In [187]:
# train Factored Gaussian approx
from pyro.infer.autoguide import AutoDiagonalNormal
guide = AutoDiagonalNormal(regression_model)
svi = SVI(regression_model, guide, optimizer, loss=TraceMeanField_ELBO())
num_iterations = 5000
pyro.clear_param_store()
pbar = trange(num_iterations)
losses = []
for j in pbar:
    # calculate the loss and take a gradient step
    loss = svi.step(x_train[0], y_train[0], ELBO_BETA=ELBO_BETA, num_nodes=num_nodes, in_size=D_in)
    losses.append(loss)
    pbar.set_description("[iteration %04d] loss: %.4f" % (j + 1, loss / len(x_train)))
guide.requires_grad_(False)

params = []
for name, value in pyro.get_param_store().items():
    params.append(pyro.param(name))

means, stds = params
variational_posterior = dist.MultivariateNormal(loc=means, covariance_matrix=torch.diag(stds ** 2))
variational_sample = variational_posterior.sample((50,))
variational_samples = {"params" : variational_sample}
kl_var_prior = kl_estimate_with_mc(variational_posterior, prior)
var_pred = Predictive(regression_model, variational_samples, return_sites=['obs', '_RETURN'])(x_test[0], None, 
                                                                        num_nodes=num_nodes, in_size=D_in)
VAR_RMSE = ((var_pred['_RETURN'].mean(0) - y_test[0]) ** 2).mean().sqrt()

  0%|          | 0/5000 [00:00<?, ?it/s]

In [188]:

means, stds = params
variational_posterior = dist.MultivariateNormal(loc=means, covariance_matrix=torch.diag(stds ** 2))
variational_sample = variational_posterior.sample((500,))
variational_samples = {"params" : variational_sample}
kl_var_prior = kl_estimate_with_mc(variational_posterior, prior)
var_pred = Predictive(regression_model, variational_samples, return_sites=['obs', '_RETURN'])(x_test[0], None, 
                                                                        num_nodes=num_nodes, in_size=D_in)
VAR_RMSE = ((var_pred['_RETURN'].mean(0) - y_test[0]) ** 2).mean().sqrt()

In [189]:
hmc_gmm = make_empirical_gmm(full_samples, num_nodes, x_test[0])
kde_gmm = make_empirical_gmm(kde_samples, num_nodes, x_test[0])
var_gmm = make_empirical_gmm(variational_samples, num_nodes, x_test[0])
print(f"The final KLs are: KDE {kl_kde_prior}, VAR {kl_var_prior}\n"
      f"The final RMSE are: HMC {HMC_RMSE}, KDE {KDE_RMSE}, VAR {VAR_RMSE}\n"
      f"The final LLs are: HMC {hmc_gmm.log_prob(y_test[0]).sum()}, KDE {kde_gmm.log_prob(y_test[0]).sum()}, VAR {var_gmm.log_prob(y_test[0]).sum()}.")

The final KLs are: KDE 476.938232421875, VAR 344.455322265625
The final RMSE are: HMC 0.6507837772369385, KDE 0.68092280626297, VAR 0.6157970428466797
The final LLs are: HMC -523.5782470703125, KDE -641.4970703125, VAR -500.2248840332031.


In [190]:
print(means.shape[0])

242


# Compress weights

In [38]:
#lets compress some samples
#### sample weights with compression algorithm
from tqdm.notebook import trange
from rec.beamsearch.Coders.Encoder_Empirical import Encoder
from rec.beamsearch.distributions.CodingSampler import CodingSampler
from rec.beamsearch.distributions.EmpiricalMixturePosterior import EmpiricalMixturePosterior
from rec.beamsearch.samplers.GreedySampling_BNNs import GreedySampler
import pyro.distributions as dist