# Session 2: Variational Deep Embedding (VaDE)

### 0: Mount your Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive




---



### Getting Started

Reusing the VAE class defined in last week's practical session.

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, x_dim, h_dim1, h_dim2, z_dim):
        super(VAE, self).__init__()
        
        # define your variables here
        # encoder part
        self.fc1 = nn.Linear(x_dim, h_dim1)
        self.fc2 = nn.Linear(h_dim1, h_dim2)
        self.fc31 = nn.Linear(h_dim2, z_dim)
        self.fc32 = nn.Linear(h_dim2, z_dim)
        # decoder part
        self.fc4 = nn.Linear(z_dim, h_dim2)
        self.fc5 = nn.Linear(h_dim2, h_dim1)
        self.fc6 = nn.Linear(h_dim1, x_dim)
        
    def encoder(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        return self.fc31(h), self.fc32(h) # mu, log_var
    
    def sampling(self, mu, log_var):
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu) # return z sample
        
    def decoder(self, z):
        h = F.relu(self.fc4(z))
        h = F.relu(self.fc5(h))
        return F.sigmoid(self.fc6(h)) 
    
    def forward(self, x):
        mu, log_var = self.encoder(x.view(-1, 784))
        z = self.sampling(mu, log_var)
        return self.decoder(z), mu, log_var


Upload the `vae_model_object.pt` model object provided under the directory 'Session_02/Utils/vae_model_object.pt' to your Google Drive and replace its link for the variable `path_to_model` below.

In [4]:
device = 'cuda'

path_to_model = "/content/gdrive/MyDrive/AISC/vae_model_object.pt"  # to be replaced by the user
pretrained_vae = torch.load(path_to_model)

### Task 1: Pretraining and setting up GMM

The dataloader will depend on the data you were working with last week. I recommend starting with MNIST and then experimenting once you have the initial pipeline running.

In [5]:
from torchvision import datasets, transforms

# Function to load MNIST
# You may decide to extend this to include a text string as a parameter and gather 
# the appropriate dataset
def get_dataset(batch_size=50):
  
  # MNIST Dataset
  train_dataset = datasets.MNIST(root='./mnist_data/', train=True, transform=transforms.ToTensor(), download=True)
  test_dataset = datasets.MNIST(root='./mnist_data/', train=False, transform=transforms.ToTensor(), download=False)

  # Data Loader (Input Pipeline)
  train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

  return train_loader, test_loader


Let get started with a pretrain function which will initiate the variabled we need for VaDE:

In [6]:
from sklearn.mixture import GaussianMixture

class VaDE(nn.Module):
  def __init__(self, num_clusters = 2, device = 'cuda', vae_model = None):
    super(VaDE, self).__init__()

    self.num_clusters = num_clusters
    self.device = device

    # define some parameters for VaDE 
    self.pi = None
    self.mu_c = None
    self.log_c = None

    self.vae = vae_model
    self.vae.to(self.device)
    self.optimizer = torch.optim.Adam(vae_model.parameters(), lr=0.001)

  def pretrain(self, train_data):
    # Define the following steps within this function:
    # 1 - Use your pretrained VAE to gather encodings (z) for training set
    # 2 - Train a GMM with an array of z's 
    # 3 - set self.pi, self.mu_c, and self.log_c here using parameters from GMM

    Z = []
    dataloader = train_data

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(self.device)

            z_mean, z_log = self.vae.encoder(x.view(-1, 784))
            Z.append(z_mean)

    Z = torch.cat(Z, 0).detach().cpu().numpy()

    gmm = GaussianMixture(n_components=self.num_clusters, covariance_type='diag')
    pre = gmm.fit(Z)

    self.pi_ = torch.from_numpy(gmm.weights_).cuda().float()
    self.mu_c = torch.from_numpy(gmm.means_).cuda().float()
    self.log_c = torch.log(torch.from_numpy(gmm.covariances_).cuda().float())

    return 


**The above code will be the starting point for our VaDE class. Get comfortable with it before proceeding further!**

### Task 2: Define the ELBO loss function

There are two approaches that you may use for the implementation of the VaDE loss function:


1.   **Recommendation**: Start with the original Tensorflow implementation of VaDE loss function available here and translate it to PyTorch: https://github.com/slim1017/VaDE/blob/master/VaDE.py 
2.   **Advanced**: Start with the paper and theoretical background behind VaDE and implement the loss function from scratch


I don't mind which approach you take above. Either way you will learn how the loss function operates. If you are unfamiliar with PyTorch, the recommended approach is more suitable

In [7]:
import numpy as np

def gaussian_pdf(mu_c, log_c, z):
  g = []
  for i in range(mu_c.shape[0]):
    m = -0.5 * (torch.sum(np.log(float(np.pi*2)) + 
                          log_c[i:i+1] + (z - mu_c[i:i+1]).pow(2)/torch.exp(log_c[i:i+1]),1))
    g.append(m.view(-1, 1))
  return torch.cat(g, 1)
  
def loss_function(recon_x, x, mu_z, log_z, pi, mu_c, log_c, num_clusters=2):
  """
  Definitions of parameters:
  - recon_x: output from decoder (ie reconstructed x)
  - x: original input
  - mu_z: mean component of latent space returned from encoder
  - log_z: log variance component of latent space returned from encoder
  - pi, mu_c, log_c: all parameters learned in our GMM
  - num_clusters: number of clusters defined in our model
  """

  # Compute VaDE loss
  # 1 - compute cross entropy
  ce = F.binary_cross_entropy(recon_x, x)

  # Sample as you would do in VAE
  z = torch.randn_like(mu_z) * torch.exp(log_z / 2) + mu_z  

  # 2 - Choose a sample x from Gaussian distribution (N in paper) for *each* cluster
  g = gaussian_pdf(mu_c, log_c, z)

  # 3 - Implement q(c|x) here 
  y_c = torch.exp(torch.log(pi.unsqueeze(0)) + g) + 1e-10
  y_c = y_c/(y_c.sum(1).view(-1,1)) 
  #print(y_c)

  # 4 - Putting it all together
  # Here we are following equation 12 and plugging all the mathematical functions in 
  # When implementing this, flatten out log_c, mu_c, log_z and mu_z
  # It will make combining them with batch log_z much easier
  inter_c = torch.sum(y_c * torch.sum(log_c.unsqueeze(0) + torch.exp(log_z.unsqueeze(1) - log_c.unsqueeze(0)) +
                                                    (mu_z.unsqueeze(1) - mu_c.unsqueeze(0)).pow(2) / torch.exp(log_c.unsqueeze(0)), 1),1)

  pc1 = torch.sum(y_c * torch.log(pi.unsqueeze(0) / (y_c)), 1)
  pc2 = torch.sum(1 + log_z,1)
  #print(torch.mean(inter_c), torch.mean(pc1), torch.mean(pc2))
  
  loss = ce + (0.5 * torch.mean(inter_c)) - torch.mean(pc1) - (0.5 * torch.mean(pc2))
  #print(loss)
  return loss


### Task 3: Train VaDE

Now we will be reusing some code you created last week for train and test functions...

In [8]:
from sklearn.mixture import GaussianMixture
from torch.distributions.log_normal import LogNormal

class VaDE(nn.Module):
  def __init__(self, num_clusters = 2, device = 'cuda', vae_model = None):
    super(VaDE, self).__init__()

    self.num_clusters = num_clusters
    self.device = device
    self.data_type = 'sigmoid'

    # get the size of the latent space from pretrained VAE
    hidden_dim = vae_model.fc31.out_features

    # define some parameters for VaDE
    # requires_grad makes sure these parameters update during training
    self.pi_ = nn.Parameter(torch.FloatTensor(num_clusters,).fill_(1)/num_clusters,requires_grad=True)
    self.mu_c = nn.Parameter(torch.FloatTensor(num_clusters,hidden_dim).fill_(0),requires_grad=True)
    self.log_c = nn.Parameter(torch.FloatTensor(num_clusters,hidden_dim).fill_(0),requires_grad=True)

    # Save the loaded VAE for later
    self.vae = vae_model
    self.vae.to(self.device)

    # Define the optimizer for training later
    self.optimizer = torch.optim.Adam(vae_model.parameters(), lr=0.001)

  def pretrain(self, train_data):
    # Define the following steps within this function:
    # 1 - Use your pretrained VAE to gather encodings (z) for training set
    # 2 - Train a GMM with an array of z's 
    # 3 - set self.pi, self.mu_c, and self.log_c here using parameters from GMM

    Z = []
    dataloader = train_data

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(self.device)

            z_mean, z_log = self.vae.encoder(x.view(-1, 784))
            Z.append(z_mean)

    Z = torch.cat(Z, 0).detach().cpu().numpy()

    gmm = GaussianMixture(n_components=self.num_clusters, covariance_type='diag')
    pre = gmm.fit(Z)

    self.pi_.data = torch.from_numpy(gmm.weights_).cuda().float()
    self.mu_c.data = torch.from_numpy(gmm.means_).cuda().float()
    self.log_c.data = torch.log(torch.from_numpy(gmm.covariances_).cuda().float())

    return 

  def train(self, epoch, train_loader):
    self.vae.to(self.device)
    self.vae.train()

    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(self.device)  
        data = data.view(-1, 784)  
        self.optimizer.zero_grad()
        
        # Call your model and loss functions here
        recon_x, mu, log_var = self.vae(data)
        loss = loss_function(recon_x, data, mu, log_var, self.pi_, self.mu_c, self.log_c, self.num_clusters)
        
        loss.backward()
        train_loss += loss.item()
        self.optimizer.step()
        
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item() / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss / len(train_loader.dataset)))
  
  def test(self, test_loader):
    self.vae.to(self.device)
    self.vae.eval()

    test_loss= 0
    C =  []
    with torch.no_grad():
        for data, _ in test_loader:
            data = data.to(self.device)
            recon, mu, log_var = self.vae(data)
            
            #Gather cluster label - similar to loss function
            z = torch.randn_like(mu) * torch.exp(log_var / 2) + mu
            g = gaussian_pdf(self.mu_c, self.log_c, z)
            y_c = torch.exp(torch.log(self.pi_.unsqueeze(0)) + g)

            y_c = y_c.detach().cpu().numpy()
            C.extend(np.argmax(y_c,axis=1))

    return C
 

Start by calling the pretrain function

In [9]:
# Note: you must use the same number of clusters as hidden units in latent space
# This is one of the conditions in VaDE
vade_model = VaDE(num_clusters = 2, vae_model = pretrained_vae, device='cuda')
train_data, test_data = get_dataset()

# Pretrain our VaDE architecture and gather GMM parameters
vade_model.pretrain(train_data)
print("Pretrained model.")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Pretrained model.


Now we call the train function which will subsequently call your loss function. **If there are errors in the loss function, they will pop up here**

In [10]:
# Now train the entire VaDE architecture
num_epochs = 20
for epoch in range(1, num_epochs + 1):
    vade_model.train(epoch, train_data)

clusters = vade_model.test(test_data)



====> Epoch: 1 Average loss: 0.0125
====> Epoch: 2 Average loss: 0.0083
====> Epoch: 3 Average loss: 0.0083
====> Epoch: 4 Average loss: 0.0083
====> Epoch: 5 Average loss: 0.0083
====> Epoch: 6 Average loss: 0.0082
====> Epoch: 7 Average loss: 0.0082
====> Epoch: 8 Average loss: 0.0082
====> Epoch: 9 Average loss: 0.0082
====> Epoch: 10 Average loss: 0.0082
====> Epoch: 11 Average loss: 0.0082
====> Epoch: 12 Average loss: 0.0082
====> Epoch: 13 Average loss: 0.0083
====> Epoch: 14 Average loss: 0.0082
====> Epoch: 15 Average loss: 0.0082
====> Epoch: 16 Average loss: 0.0082
====> Epoch: 17 Average loss: 0.0082
====> Epoch: 18 Average loss: 0.0082
====> Epoch: 19 Average loss: 0.0082
====> Epoch: 20 Average loss: 0.0082


In [11]:
print(clusters)

[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [12]:
torch.save(vade_model.state_dict(), "/content/gdrive/MyDrive/AISC/vade_model.pt")

### Optional Extra: Visualize your clusters

If you manage to complete your model and have something  I encourage you to:


1.   **Evaluation Metrics:** Use the built-in scipy functions to validate the clusters you have created. Refer to the lecture material for some metrics you may use.
2. Vizualize the clusters you have created. You may choose to use the Embedded Projector to do this

