In [51]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd

General case:
- Choose a prior for $Z$: $p(Z)$.
- Choose an observation model: $p_\theta(X|Z)$
- Choose a variational posterior: $q_{\gamma}(\mathbf{z} | \mathbf{x})$

- Choose a missing model: $p_{\phi}(\mathbf{S} | \mathbf{X^o, X^m})$


The ELBO in the MNAR case is

$$ E_{(\mathbf{z}_1, \mathbf{x}_1^m)...(\mathbf{z}_K, \mathbf{x}_K^m)} \left[ \log \frac{1}{K} \sum_{k=1}^K \frac{p_{\phi}(\mathbf{s} | \mathbf{x}^o, \mathbf{x}_k^m) p_{\theta}(\mathbf{x}^o | \mathbf{z}_k) p(\mathbf{z}_k)}{q_{\gamma}(\mathbf{z} | \mathbf{x}^o)} \right]$$

### Classic case
The model we are building has a Gaussian prior and a Gaussian observation model (also the decoder ($z \rightarrow x$) ),

$$ p(\mathbf{z}) = \mathcal{N}(\mathbf{z} | \mathbf{0}, \mathbf{I})$$

$$ p_\theta(\mathbf{x} | \mathbf{z}) = \mathcal{N}(\mathbf{x} | \mathbf{\mu}_{\theta}(\mathbf{z}), \sigma^2\mathbf{I})$$

$$ p_\theta(\mathbf{x}) = \int p_\theta(\mathbf{x} | \mathbf{z})p(\mathbf{z}) d\mathbf{z}$$

where $\mathbf{\mu}_{\theta}(\mathbf{z}): \mathbb{R}^d \rightarrow \mathbb{R}^p $ in general is a deep neural net, but in this case is a linear mapping, $\mathbf{\mu} = \mathbf{Wz + b}$.

The variational posterior (also the encoder ($x \rightarrow z$) ) is also Gaussian

$$q_{\gamma}(\mathbf{z} | \mathbf{x}) = \mathcal{N}(\mathbf{z} | \mu_{\gamma}(\mathbf{x}), \sigma_{\gamma}(\mathbf{x})^2 \mathbf{I})$$

If the missing process is *missing at random*, it is ignorable and the ELBO becomes, as described in [the MIWAE paper](https://arxiv.org/abs/1812.02633)

$$ E_{\mathbf{z}_1...\mathbf{z}_K} \left[ \log \frac{1}{K}\sum_{k=1}^K \frac{p_{\theta}(\mathbf{x^o} | \mathbf{z}_k)p(\mathbf{z}_k)}{q_{\gamma}(\mathbf{z}_k | \mathbf{x^o})} \right] $$

When the missing process is MNAR it is non-ignorable and we need to include the missing model. In this example we include the missing model as a logistic regression in each feature dimension

$$ p_{\phi}(\mathbf{s} | \mathbf{x^o, x^m}) = \text{Bern}(\mathbf{s} | \pi_{\phi}(\mathbf{x^o, x^m}))$$

$$ \pi_{\phi, j}(x_j) = \frac{1}{1 + e^{-\text{logits}_j}} $$

$$ \text{logits}_j = W_j (x_j - b_j) $$

The ELBO in the MNAR case becomes

$$ E_{(\mathbf{z}_1, \mathbf{x}_1^m)...(\mathbf{z}_K, \mathbf{x}_K^m)} \left[ \log \frac{1}{K} \sum_{k=1}^K \frac{p_{\phi}(\mathbf{s} | \mathbf{x}^o, \mathbf{x}_k^m) p_{\theta}(\mathbf{x}^o | \mathbf{z}_k) p(\mathbf{z}_k)}{q_{\gamma}(\mathbf{z} | \mathbf{x}^o)} \right]$$

with $ z \sim q_{\gamma}(z|x^o), x^m\sim p_\theta(x^m|z)$

### Constant to define

 - $K$ = $n_{\text{samples}}$ the number of sample to estimate the expectation
 - $n_{\text{latent}}$ the dimension of the latent space where $z$ lives


### Load data
Here we use the white-wine dataset from the UCI database

In [52]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
data = np.array(pd.read_csv(url, low_memory=False, sep=';'))
# ---- drop the classification attribute
data = data[:, :-1]

### Settings

In [53]:
N, D = data.shape
n_latent = D - 1
n_hidden = 128
n_samples = 20
max_iter = 30000
batch_size = 16

### Standardize data

In [54]:
# ---- standardize data
data = data - np.mean(data, axis=0)
data = data / np.std(data, axis=0)

# ---- random permutation
p = np.random.permutation(N)
data = data[p, :]

# ---- we use the full dataset for training here, but you can make a train-val split
Xtrain = data.copy()
Xval = Xtrain.copy()

### Introduce missing 
Here we denote
- Xnan: data matrix with np.nan as the missing entries
- Xz: data matrix with 0 as the missing entries
- S: missing mask 

The missing process depends on the missing data itself:
- in half the features, set the feature value to missing when it is higher than the feature mean

In [55]:
# ---- introduce missing process
Xnan = Xtrain.copy()
Xz = Xtrain.copy()

mean = np.mean(Xnan[:, :int(D / 2)], axis=0)
ix_larger_than_mean = Xnan[:, :int(D / 2)] > mean

Xnan[:, :int(D / 2)][ix_larger_than_mean] = np.nan
Xz[:, :int(D / 2)][ix_larger_than_mean] = 0

S = np.array(~np.isnan(Xnan), dtype=np.float32)

In [72]:
class notMIWAE(nn.Module):
    #Only Gaussian and Bern for the moment
    def __init__(self, input_size = 10, n_latent = 20, n_samples = 10):
        super(notMIWAE, self).__init__()

        self.n_input = input_size
        self.n_latent = n_latent
        self.n_samples = n_samples

        self.encoder_mu = nn.Linear(in_features=input_size, out_features=n_latent)
        self.encoder_logsigma = nn.Linear(in_features=input_size, out_features=n_latent)
        
        self.decoder_mu = nn.Linear(in_features=n_latent, out_features=input_size)
        self.decoder_logsigma = nn.Linear(in_features=n_latent, out_features=input_size)

        # Missing mechanism
        self.logits = nn.Linear(in_features=input_size, out_features=input_size)

        self.sigma = torch.ones(n_latent)
        

    def elbo(self, x, s):
        
        z_mu = self.encoder_mu(x)
        z_sigma = torch.exp(self.encoder_logsigma(x))

        law_z_given_x= torch.distributions.normal.Normal(loc = z_mu, scale = z_sigma)

        z_samples = law_z_given_x.sample((self.n_samples,1)).squeeze()

        #check size probably need to transpose
        print(z_samples.shape)
        log_prob_z_given_x = law_z_given_x.log_prob(z_samples)
        print( log_prob_z_given_x.shape)
        law_z = torch.distributions.normal.Normal(loc = 0., scale = 1.)

        log_prob_z = law_z.log_prob(z_samples)



        x_mu = self.decoder_mu(z_samples)
        x_sigma = torch.exp(self.decoder_logsigma(z_samples))

        law_x_given_z = torch.distributions.normal.Normal(loc = x_mu, scale = x_sigma)

        x_samples  = law_x_given_z.sample((self.n_samples,1)).squeeze().transpose(0,1)

        log_prob_x_given_z = law_x_given_z.log_prob(x_samples)

        
        # Missing mechanism
        # We recreate the x_sample using the real x we know (x_o) and the x_samples we created from z (x_m).
        mixed_x_samples = x_samples * s.unsqueeze(-1) + (x*(1-s)).unsqueeze(-1)
        logits = self.logits(mixed_x_samples)

        law_s_given_x = torch.distributions.bernoulli.Bernoulli(logits=logits)

        log_prob_s_given_x = law_s_given_x.log_prob(mixed_x_samples)
        
        
        log_sum_w = torch.logsumexp(log_prob_s_given_x + log_prob_x_given_z + log_prob_z - log_prob_z_given_x, dim = 1) # dim = ?
        log_mean_w = log_sum_w - torch.log(self.n_samples)

        return log_mean_w.sum()



In [73]:
N, p = Xtrain.shape

X = torch.Tensor(Xz)
S = torch.Tensor(S)
batch_size = 100
epochs = 5
model = notMIWAE(input_size=p)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for _ in range(epochs):
    for i in range(0,N,batch_size):
        X_batch = X[i:(i+batch_size)]
        S_batch = S[i:(i+batch_size)]

        elbo = model.elbo(X_batch,S_batch)
        optimizer.zero_grad()
        elbo.backward()
        optimizer.step()
    print(elbo)
        
        


torch.Size([10, 100, 20])
torch.Size([10, 100, 20])


RuntimeError: The size of tensor a (100) must match the size of tensor b (11) at non-singleton dimension 2

In [81]:
law_z_given_x= torch.distributions.normal.Normal(loc = torch.zeros((3,5)), scale = torch.ones((3,5)))

z_samples = law_z_given_x.sample((1,1)).squeeze()

#check size probably need to transpose
print(z_samples)
log_prob_z_given_x = law_z_given_x.log_prob(z_samples)


tensor([[ 0.7964,  0.9837, -0.1394,  0.5177, -0.8972],
        [ 2.6495, -0.2887,  0.0622, -0.1621,  0.2745],
        [ 0.0130,  0.6976,  1.3022,  0.1899, -0.8932]])


In [82]:
print( log_prob_z_given_x)

tensor([[-1.2360, -1.4028, -0.9287, -1.0529, -1.3214],
        [-4.4289, -0.9606, -0.9209, -0.9321, -0.9566],
        [-0.9190, -1.1623, -1.7668, -0.9370, -1.3179]])


In [84]:
a = np.array([ 0.7964,  0.9837, -0.1394,  0.5177, -0.8972])
-0.7964**2 / 2 - 0.5 * np.log(2 * np.pi) 

-1.2360650132046727