In [1]:
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

from RAVDESS_dataset_util import *
from EmoClassCNN import *

torch.set_default_dtype(torch.float64)

In [2]:
folder_path = '/home/studenti/ballerini/datasets/RAVDESS_frames'

In [3]:
NUM_CLASSES = len(emocat)
IMG_SIZE = 100
BATCH_SIZE = 8
DEFAULT_Z_DIM = 50

face_dataset = FaceEmotionDataset(root_dir=folder_path,
                                    transform=transforms.Compose([
                                        Rescale(IMG_SIZE), 
                                        CenterCrop(IMG_SIZE), 
                                        ToTensor()
                                    ]))        

trainingset_len = len(face_dataset) // 100 * 10
testset_len = len(face_dataset) - trainingset_len

print('training set size: ', trainingset_len)
print('test set size: ', testset_len)

train_set, test_set = torch.utils.data.random_split(face_dataset, 
                                                    [trainingset_len, testset_len], 
                                                    generator=torch.Generator().manual_seed(42)
                                                   )

trainset_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                        shuffle=True, num_workers=4)

testset_loader = DataLoader(test_set, batch_size=BATCH_SIZE,
                        shuffle=True, num_workers=4)

dataset_loader = (trainset_loader, testset_loader)

training set size:  720
test set size:  6480


In [4]:
def emotion_rating_conversion(cat):
    ratings = torch.zeros(NUM_CLASSES)
    ratings[cat] = 1
    return ratings
    
#torch.argmax(emotion_rating_conversion(3))

In [5]:
# helper functions
class Swish(nn.Module):
    """https://arxiv.org/abs/1710.05941"""
    def forward(self, x):
        return x * torch.sigmoid(x)

def swish(x):
    return x * torch.sigmoid(x)

In [6]:
class ProductOfExperts(nn.Module):
    """
    Return parameters for product of independent experts.
    See https://arxiv.org/pdf/1410.7827.pdf for equations.

    @param loc: M x D for M experts
    @param scale: M x D for M experts
    """
    def forward(self, loc, scale, eps=1e-8):
        scale = scale + eps # numerical constant for stability
        # precision of i-th Gaussian expert (T = 1/sigma^2)
        T = 1. / scale
        product_loc = torch.sum(loc * T, dim=0) / torch.sum(T, dim=0)
        product_scale = 1. / torch.sum(T, dim=0)
        return product_loc, product_scale

In [7]:
class ImageEncoder(nn.Module):
    """
    define the PyTorch module that parametrizes q(z|image).
    This goes from images to the latent z
    
    This is the standard DCGAN architecture.

    @param z_dim: integer
                  size of the tensor representing the latent random variable z
    """
    def __init__(self, z_dim):
        super(ImageEncoder, self).__init__()
        #torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, 
        #                padding=0, dilation=1, groups=1, bias=True)
        # H_out = floor( (H_in + 2*padding - dilation(kernel_size-1) -1) / stride    +1)
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 4, 2, 1, bias=False),
            Swish(),
            nn.Conv2d(32, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            Swish(),
            nn.Conv2d(64, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            Swish(),
            nn.Conv2d(128, 256, 4, 1, 0, bias=False),
            nn.BatchNorm2d(256),
            Swish())
        
        # Here, we define two layers, one to give z_loc and one to give z_scale
        self.z_loc_layer = nn.Sequential(
            nn.Linear(256 * 9 * 9, 512), # it's 256 * 9 * 9 if input is 100x100.
            Swish(),
            nn.Dropout(p=0.1),
            nn.Linear(512, z_dim))
        
        self.z_scale_layer = nn.Sequential(
            nn.Linear(256 * 9 * 9, 512), # it's 256 * 9 * 9 if input is 100x100.
            Swish(),
            nn.Dropout(p=0.1),
            nn.Linear(512, z_dim))
        self.z_dim = z_dim

    def forward(self, image):
        hidden = self.features(image)
        print(hidden.shape)
        image = image.view(-1, 256 * 9 * 9) # it's 256 * 9 * 9 if input is 100x100.
        z_loc = self.z_loc_layer(hidden)
        z_scale = torch.exp(self.z_scale_layer(hidden)) #add exp so it's always positive
        return z_loc, z_scale
    
class ImageDecoder(nn.Module):
    """
    define the PyTorch module that parametrizes p(image|z).
    This goes from the latent z to the images
    
    This is the standard DCGAN architecture.

    @param z_dim: integer
                  size of the tensor representing the latent random variable z
    """
    def __init__(self, z_dim):
        super(ImageDecoder, self).__init__()
        self.upsample = nn.Sequential(
            nn.Linear(z_dim, 256 * 9 * 9),  # it's 256 * 9 * 9 if input is 100x100.
            Swish())
        
        self.hallucinate = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 4, 1, 0, bias=False),
            nn.BatchNorm2d(128),
            Swish(),
            nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            Swish(),
            nn.ConvTranspose2d(64, 32, 4, 2, 1, bias=False),
            nn.BatchNorm2d(32),
            Swish(),
            nn.ConvTranspose2d(32, 3, 4, 2, 1, bias=False))

    def forward(self, z):
        # the input will be a vector of size |z_dim|
        z = self.upsample(z)
        z = z.view(-1, 256, 9, 9) # it's 256 * 9 * 9 if input is 100x100.
        # but if 100x100, the output image size is 96x96
        image = self.hallucinate(z) # this is the image
        return image  # NOTE: no sigmoid here. See train.py

In [8]:
class EmotionEncoder(nn.Module):
    """
    define the PyTorch module that parametrizes q(z|emotion category).
    This goes from ratings to the latent z

    @param z_dim: integer
                  size of the tensor representing the latent random variable z
    """
    def __init__(self, z_dim):
        super(EmotionEncoder, self).__init__()
        self.net = nn.Linear(NUM_CLASSES, 512)
        
        self.z_loc_layer = nn.Sequential(
            nn.Linear(512, 512),
            Swish(),
            nn.Linear(512, z_dim))
        
        self.z_scale_layer = nn.Sequential(
            nn.Linear(512, 512),
            Swish(),
            nn.Linear(512, z_dim))
        self.z_dim = z_dim

    def forward(self, emocat):
        hidden = self.net(emocat)
        z_loc = self.z_loc_layer(hidden)
        z_scale = torch.exp(self.z_scale_layer(hidden))
        return z_loc, z_scale


class EmotionDecoder(nn.Module):
    """
    define the PyTorch module that parametrizes p(emotion category|z).
    This goes from the latent z to the ratings

    @param z_dim: integer
                  size of the tensor representing the latent random variable z
    """
    def __init__(self, z_dim):
        super(EmotionDecoder, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim, 512),
            Swish())
        
        self.emotion_loc_layer = nn.Sequential(
            nn.Linear(512, 512),
            Swish(),
            nn.Linear(512, len(emocat)))
        
        self.emotion_scale_layer = nn.Sequential(
            nn.Linear(512, 512),
            Swish(),
            nn.Linear(512, NUM_CLASSES))

    def forward(self, z):
        #batch_size = z.size(0)
        hidden = self.net(z)
        emotion_loc = self.emotion_loc_layer(hidden)
        emotion_scale = torch.exp(self.emotion_scale_layer(hidden))
        # rating is going to be a |emotions| * 9 levels
        #rating = h.view(batch_size, EMOTION_VAR_DIM, 9)
        return emotion_loc, emotion_scale  # NOTE: no softmax here. See train.py

In [9]:
class MVAE(nn.Module):
    """
    This class encapsulates the parameters (neural networks), models & guides needed to train a
    multimodal variational auto-encoder.
    Modified from https://github.com/mhw32/multimodal-vae-public
    Multimodal Variational Autoencoder.

    @param z_dim: integer
                  size of the tensor representing the latent random variable z
                  
    Currently all the neural network dimensions are hard-coded; 
    in a future version will make them be inputs into the constructor
    """
    def __init__(self, z_dim, use_cuda=True):
        super(MVAE, self).__init__()
        self.z_dim = z_dim
        self.image_encoder = ImageEncoder(z_dim)
        self.image_decoder = ImageDecoder(z_dim)
        self.emotion_encoder = EmotionEncoder(z_dim)
        self.emotion_decoder =EmotionDecoder(z_dim)
        
        self.use_cuda = use_cuda
        # relative weights of losses in the different modalities
        self.LAMBDA_IMAGES = 1.0
        self.LAMBDA_RATINGS = 50.0
        self.LAMBDA_OUTCOMES = 100.0
        
        # using GPUs for faster training of the networks
        if self.use_cuda:
            self.cuda()
            
    def model(self, images=None, emotions=None, annealing_beta=1.0):
        # register this pytorch module and all of its sub-modules with pyro
        pyro.module("mvae", self)
        
        batch_size = 0
        if images is not None:
            batch_size = images.size(0)
        elif emotions is not None:
            batch_size = emotions.size(0)
        
        with pyro.plate("data"):      
            
            # sample the latent z from the (constant) prior, z ~ Normal(0,I)
            z_prior_mean  = torch.zeros(size=[BATCH_SIZE, self.z_dim])
            z_prior_scale = torch.exp(torch.zeros(size=[BATCH_SIZE, self.z_dim]))                
            
            # sample from prior (value will be sampled by guide when computing the ELBO)
            with poutine.scale(scale=annealing_beta):
                z = pyro.sample("z", dist.Normal(z_loc, z_scale))

            # decode the latent code z (image decoder)
            img_loc = self.image_decoder.forward(z)
            
            # score against actual images
            if images is not None:
                with poutine.scale(scale=self.LAMBDA_IMAGES):
                    pyro.sample("obs_img", dist.Bernoulli(img_loc), obs=images)
            
            # decode the latent code z (emotion decoder)
            emotion_loc, emotion_scale = self.emotion_decoder.forward(z)
            if categories is not None:
                with poutine.scale(scale=self.LAMBDA_RATINGS):
                    pyro.sample("obs_emotion", 
                                dist.Normal(emotions_loc, emotions_scale), 
                                obs=emotion_rating_conversion(emotions))

            # return the loc so we can visualize it later
            return img_loc, emotion_loc
        
    def guide(self, images=None, emotions=None, annealing_beta=1.0):
        # register this pytorch module and all of its sub-modules with pyro
        pyro.module("mvae", self)
        
        batch_size = 0
        if images is not None:
            batch_size = images.size(0)
        elif emotions is not None:
            batch_size = emotions.size(0)
            
        with pyro.plate("data"):
            # use the encoder to get the parameters used to define q(z|x)
                        
            # initialize the prior expert.
            # we initalize an additional dimension, along which we concatenate all the 
            #   different experts.
            # self.experts() then combines the information from these different modalities
            #   by multiplying the gaussians together
            
            z_loc = torch.zeros(torch.Size((1, batch_size, self.z_dim))) + 0.5
            z_scale = torch.ones(torch.Size((1, batch_size, self.z_dim))) * 0.1
            if self.use_cuda:
                z_loc, z_scale = z_loc.cuda(), z_scale.cuda()
                
            if images is not None:
                image_z_loc, image_z_scale = self.image_encoder.forward(images)
                z_loc = torch.cat((z_loc, image_z_loc.unsqueeze(0)), dim=0)
                z_scale = torch.cat((z_scale, image_z_scale.unsqueeze(0)), dim=0)
            
            if emotions is not None:
                emotion_z_loc, emotion_z_scale = self.emotion_encoder.forward(emotions)
                z_loc = torch.cat((z_loc, rating_z_loc.unsqueeze(0)), dim=0)
                z_scale = torch.cat((z_scale, rating_z_scale.unsqueeze(0)), dim=0)
            
            z_loc, z_scale = self.experts(z_loc, z_scale)
            # sample the latent z
            with poutine.scale(scale=annealing_beta):
                pyro.sample("latent", dist.Normal(z_loc, z_scale))
                
                
    def forward(self, image=None, emotion=None):
        z_loc, z_scale  = self.infer(image, emotion)
        z = pyro.sample("latent", dist.Normal(z_loc, z_scale).independent(1))
        # reconstruct inputs based on that gaussian
        image_recon = self.image_decoder(z)
        rating_recon = self.emotion_decoder(z)
        return image_recon, rating_recon, z_loc, z_scale
    
    
    def infer(self, images=None, emotions=None):
        batch_size = 0
        if images is not None:
            batch_size = images.size(0)
        elif emotions is not None:
            batch_size = emotions.size(0)
            
        # initialize the prior expert
        # we initalize an additional dimension, along which we concatenate all the 
        #   different experts.
        # self.experts() then combines the information from these different modalities
        #   by multiplying the gaussians together
        z_loc = torch.zeros(torch.Size((1, BATCH_SIZE, self.z_dim))) + 0.5
        z_scale = torch.ones(torch.Size((1, BATCH_SIXE, self.z_dim))) * 0.1
        if self.use_cuda:
            z_loc, z_scale = z_loc.cuda(), z_scale.cuda()

        if images is not None:
            image_z_loc, image_z_scale = self.image_encoder.forward(images)
            z_loc = torch.cat((z_loc, image_z_loc.unsqueeze(0)), dim=0)
            z_scale = torch.cat((z_scale, image_z_scale.unsqueeze(0)), dim=0)

        if emotions is not None:
            emotion_z_loc, emotion_z_scale = self.emotion_encoder.forward(emotions)
            z_loc = torch.cat((z_loc, emotion_z_loc.unsqueeze(0)), dim=0)
            z_scale = torch.cat((z_scale, emotion_z_scale.unsqueeze(0)), dim=0)

        z_loc, z_scale = self.experts(z_loc, z_scale)
        return z_loc, z_scale

    
    # define a helper function for reconstructing images
    def reconstruct_img(self, images):
        # encode image x
        z_loc, z_scale = self.image_encoder(images)
        # sample in latent space
        z = dist.Normal(z_loc, z_scale).sample()
        # decode the image (note we don't sample in image space)
        img_loc = self.image_decoder.forward(z)
        return img_loc

    
    # define a helper function for reconstructing images without sampling
    def reconstruct_img_nosample(self, images):
        # encode image x
        z_loc, z_scale = self.image_encoder(images)
        ## sample in latent space
        #z = dist.Normal(z_loc, z_scale).sample()
        # decode the image (note we don't sample in image space)
        img_loc = self.image_decoder.forward(z_loc)
        return img_loc

In [10]:
pyro.clear_param_store()

class Args:
    learning_rate = 5e-6
    num_epochs = 2 #500
    z_dim = DEFAULT_Z_DIM
    seed = 30
    cuda = False
    
args = Args()

# setup the VAE
mvae = MVAE(z_dim=args.z_dim, use_cuda=args.cuda)

# setup the optimizer
adam_args = {"lr": args.learning_rate}
optimizer = Adam(adam_args)

# setup the inference algorithm
svi = SVI(mvae.model, mvae.guide, optimizer, loss=Trace_ELBO())

In [11]:
import time
from tqdm import tqdm

train_elbo = []
trainingTimes = [time.time()]
# training loop
for epoch in range(args.num_epochs):
    # initialize loss accumulator
    epoch_loss = 0.
    # do a training epoch over each mini-batch returned
    # by the data loader
    for batch_num, sample in tqdm(enumerate(trainset_loader)):
        faces, emotions = sample['image'], sample['cat']
        
        # if on GPU put mini-batch into CUDA memory
        if args.cuda:
            faces, ratings, outcomes = faces.cuda(), ratings.cuda(), outcomes.cuda()
        
        # do ELBO gradient and accumulate loss
        #print("Batch: ", batch_num, "out of", len(train_loader))
        epoch_loss += svi.step(images=faces, emotions=emotions)
        epoch_loss += svi.step(images=faces, emotions=None)
        epoch_loss += svi.step(images=None, emotions=emotions)
        epoch_loss += svi.step(images=None, emotions=None)

    # report training diagnostics
    normalizer_train = len(trainset_loader)
    total_epoch_loss_train = epoch_loss / normalizer_train
    train_elbo.append(total_epoch_loss_train)
    
    # report training diagnostics
    trainingTimes.append(time.time())
    epoch_time = trainingTimes[-1] - trainingTimes[-2]
    print("[epoch %03d]  time: %.2f, average training loss: %.4f" % (epoch, epoch_time, total_epoch_loss_train))
    #if ((epoch+1) % 50 == 0):
        #pyro.get_param_store().save('trained_models/checkpoints/tutorial_mvae_pretrained_' + str(epoch) + '.save')
        

0it [00:01, ?it/s]

torch.Size([8, 256, 9, 9])





RuntimeError: shape '[-1, 20736]' is invalid for input of size 240000
                                      Trace Shapes:                    
                                       Param Sites:                    
             mvae$$$image_encoder.features.0.weight  32   3     4     4
             mvae$$$image_encoder.features.2.weight  64  32     4     4
             mvae$$$image_encoder.features.3.weight                  64
               mvae$$$image_encoder.features.3.bias                  64
             mvae$$$image_encoder.features.5.weight 128  64     4     4
             mvae$$$image_encoder.features.6.weight                 128
               mvae$$$image_encoder.features.6.bias                 128
             mvae$$$image_encoder.features.8.weight 256 128     4     4
             mvae$$$image_encoder.features.9.weight                 256
               mvae$$$image_encoder.features.9.bias                 256
          mvae$$$image_encoder.z_loc_layer.0.weight           512 20736
            mvae$$$image_encoder.z_loc_layer.0.bias                 512
          mvae$$$image_encoder.z_loc_layer.3.weight            50   512
            mvae$$$image_encoder.z_loc_layer.3.bias                  50
        mvae$$$image_encoder.z_scale_layer.0.weight           512 20736
          mvae$$$image_encoder.z_scale_layer.0.bias                 512
        mvae$$$image_encoder.z_scale_layer.3.weight            50   512
          mvae$$$image_encoder.z_scale_layer.3.bias                  50
             mvae$$$image_decoder.upsample.0.weight         20736    50
               mvae$$$image_decoder.upsample.0.bias               20736
          mvae$$$image_decoder.hallucinate.0.weight 256 128     4     4
          mvae$$$image_decoder.hallucinate.1.weight                 128
            mvae$$$image_decoder.hallucinate.1.bias                 128
          mvae$$$image_decoder.hallucinate.3.weight 128  64     4     4
          mvae$$$image_decoder.hallucinate.4.weight                  64
            mvae$$$image_decoder.hallucinate.4.bias                  64
          mvae$$$image_decoder.hallucinate.6.weight  64  32     4     4
          mvae$$$image_decoder.hallucinate.7.weight                  32
            mvae$$$image_decoder.hallucinate.7.bias                  32
          mvae$$$image_decoder.hallucinate.9.weight  32   3     4     4
                  mvae$$$emotion_encoder.net.weight           512     8
                    mvae$$$emotion_encoder.net.bias                 512
        mvae$$$emotion_encoder.z_loc_layer.0.weight           512   512
          mvae$$$emotion_encoder.z_loc_layer.0.bias                 512
        mvae$$$emotion_encoder.z_loc_layer.2.weight            50   512
          mvae$$$emotion_encoder.z_loc_layer.2.bias                  50
      mvae$$$emotion_encoder.z_scale_layer.0.weight           512   512
        mvae$$$emotion_encoder.z_scale_layer.0.bias                 512
      mvae$$$emotion_encoder.z_scale_layer.2.weight            50   512
        mvae$$$emotion_encoder.z_scale_layer.2.bias                  50
                mvae$$$emotion_decoder.net.0.weight           512    50
                  mvae$$$emotion_decoder.net.0.bias                 512
  mvae$$$emotion_decoder.emotion_loc_layer.0.weight           512   512
    mvae$$$emotion_decoder.emotion_loc_layer.0.bias                 512
  mvae$$$emotion_decoder.emotion_loc_layer.2.weight             8   512
    mvae$$$emotion_decoder.emotion_loc_layer.2.bias                   8
mvae$$$emotion_decoder.emotion_scale_layer.0.weight           512   512
  mvae$$$emotion_decoder.emotion_scale_layer.0.bias                 512
mvae$$$emotion_decoder.emotion_scale_layer.2.weight             8   512
  mvae$$$emotion_decoder.emotion_scale_layer.2.bias                   8
                                      Sample Sites:                    