# An attempt to extend VAE by Liang et al (2018) with non-binarized data

In [3]:
import numpy as np
import numpy.matlib
import pandas as pd
from tqdm.auto import tqdm
import math
import copy
from sklearn.metrics import mean_squared_error

import utils
import models

import tensorflow as tf
print(tf.__version__)

2.2.0


In [4]:
fold_1 = pd.read_csv('fold_1.csv', index_col = 0)
fold_2 = pd.read_csv('fold_2.csv', index_col = 0)
fold_3 = pd.read_csv('fold_3.csv', index_col = 0)
fold_4 = pd.read_csv('fold_4.csv', index_col = 0)
fold_5 = pd.read_csv('fold_5.csv', index_col = 0)

train_data = pd.concat([fold_1, fold_2, fold_3, fold_4])
test_data = fold_5

print(train_data.shape)
print(test_data.shape)

(941562, 3)
(235390, 3)


In [5]:
train_data.head()

Unnamed: 0,row,col,Prediction
15455,0,9,5
720365,0,604,5
1111258,0,920,3
1002582,0,791,3
1155420,0,981,2


# Represent data as user vectors containing ratings

x_u should be a vector of length 1000 containing observed ratings and zeroes at non-observed indices. I.e. want the 10000x1000 observations matrix with zero imputing

In [6]:
train_dataset = np.zeros([10000,1000])

for i in tqdm(np.arange(train_data.shape[0])):
    row_ind = train_data.iloc[i,0]
    col_ind = train_data.iloc[i,1]
    pred = train_data.iloc[i,2]
    train_dataset[row_ind, col_ind] = pred

HBox(children=(FloatProgress(value=0.0, max=941562.0), HTML(value='')))




In [71]:
test_dataset = np.zeros([10000,1000])

for i in tqdm(np.arange(test_data.shape[0])):
    row_ind = test_data.iloc[i,0]
    col_ind = test_data.iloc[i,1]
    pred = test_data.iloc[i,2]
    test_dataset[row_ind, col_ind] = pred

HBox(children=(FloatProgress(value=0.0, max=235390.0), HTML(value='')))




In [72]:
print(train_dataset.shape, test_dataset.shape)
print(np.count_nonzero(train_dataset)/np.size(train_dataset))
print(np.count_nonzero(test_dataset)/np.size(test_dataset))

(10000, 1000) (10000, 1000)
0.0941562
0.023539


# Build Variational Autoencoder model in Keras

Use architecture as in paper with one hidden layer

Trying output layer with 1000 neurons (for 1000 movies) and no activation

(In paper they use softmax to model probability of clicks for each item but we want to predict a rating, i.e.
a regression problem)

Mainly following tutorial here: https://www.tensorflow.org/tutorials/generative/cvae


Current errors:
- should test only on observed values in test set, not on imputed zeroes


In [97]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer

In [98]:
class MultiVAE(tf.keras.Model):
    def __init__(self, latent_dim):
        super(MultiVAE, self).__init__()
        self.latent_dim = latent_dim
        
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(1000,)),
            tf.keras.layers.Dense(units = 600, activation = "tanh"),
            tf.keras.layers.Dense(units = latent_dim + latent_dim)
        ])
        
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
            tf.keras.layers.Dense(units = 600, activation = "tanh"),
            tf.keras.layers.Dense(units = 1000)
        ])
    
    @tf.function
    def sample(self, eps = None):
        if eps is None:
            eps = tf.random.normal(shape=(1, self.latent_dim)) ## This is 100 in TF tutorial? Why?
        return self.decode(eps) 
    
    def encode(self, x):
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar
    
    def reparameterize(self, mean, logvar):
        eps = tf.random.normal(shape = mean.shape)
        return eps * tf.exp(logvar * .5) + mean
    
    def decode(self, z):
        output = self.decoder(z)
        return output

In [125]:
optimizer = tf.keras.optimizers.Adam(1e-4)

def log_normal_pdf(sample, mean, logvar, raxis=1):
    log2pi = tf.math.log(2. * np.pi)
    return tf.reduce_sum(
        -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi), axis=raxis)


# ELBO term as combination of mse and regularized KL-term
def compute_loss(model, x, beta):
    mean, logvar = model.encode(x)
    z = model.reparameterize(mean, logvar)
    x_pred = model.decode(z)
    x_pred = tf.cast(x_pred, dtype = tf.float64)
    
    mse = tf.math.reduce_sum((x - x_pred)**2)/tf.cast(tf.size((x - x_pred)**2), dtype = tf.float64) # tf.keras.losses.MSE was giving strange results
    #mse = np.mean((x - x_pred)**2)
    kld = tf.reduce_mean(-0.5 * tf.reduce_sum(1 + logvar - mean**2 - tf.exp(logvar), axis=1), axis = 0)
    kld = tf.cast(kld, dtype = tf.float64)
    return mse + beta*kld

@tf.function
def train_step(model, x, optimizer, beta):
    """
    Executes one training step and returns the loss.
    This function computes the loss and gradients, and uses the latter to
    update the model's parameters.
    """
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x, beta)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    

In [121]:
latent_dim = 100 # as in paper

model = MultiVAE(latent_dim)

In [122]:
def generate_prediction(model, test_user):
    mean, logvar = model.encode(test_user)
    z = model.reparameterize(mean, logvar)
    predictions = model.sample(z)
    return predictions

In [130]:
# Training without beta/KL annealing

def train_model(model, train_dataset, epochs, beta):
    '''
    Train this after determining beta through annealing
    '''
    for epoch in range(1, epochs + 1):
        for train_x in train_dataset:
            train_step(model, train_x, optimizer, beta)
            
# Training with beta/KL annealing
# See section 2.2.2 in paper
def train_model_annealing(model, train_dataset, validation_dataset, epochs, anneal_beta = 1):
    '''
    Pick beta for which the optimal rmse is reached on the validation set and train again with this anneal_beta
    Trains model and returns array of losses and rmse metric
    '''
    losses = np.array([])
    rmse = np.array([])
    beta = 0
    for epoch in range(1, epochs + 1):
        for train_x in train_dataset:
            train_step(model, train_x, optimizer, beta)
        
        epoch_loss = np.array([])
        epoch_mse = np.array([])
        ## ERROR IN THIS LOOP
        ## Should only test on observed values
        for validation_x in validation_dataset:
            epoch_loss = np.append(epoch_loss, compute_loss(model, validation_x, beta))
            predictions = generate_prediction(model, validation_x)
            epoch_mse = np.append(epoch_mse, mean_squared_error(validation_x, predictions))
        
        losses = np.append(losses, np.mean(epoch_loss))
        rmse = np.append(rmse, np.sqrt(np.mean(epoch_mse)))
        
        # Monotonically increase beta from 0 to anneal_beta
        beta = (epoch / epochs) * anneal_beta
    
    return losses, rmse
    

# Train model

In [133]:
# Note: need to convert to tensor before training

tf_train = tf.convert_to_tensor(train_dataset[np.newaxis,...])
tf_test = tf.convert_to_tensor(test_dataset[np.newaxis,...])

In [135]:
train_model_annealing(model, tf_train, tf_test, epochs = 50)







(array([ 0.53690468,  1.04528553,  1.5474204 ,  2.04460394,  2.53800978,
         3.02720174,  3.51200765,  3.99158465,  4.46678131,  4.93513123,
         5.39812809,  5.85494154,  6.30487348,  6.74731756,  7.18409595,
         7.61418885,  8.03754725,  8.45428805,  8.86396897,  9.26755031,
         9.66327738, 10.05107731, 10.43106905, 10.80238853, 11.16417875,
        11.51655099, 11.85896285, 12.19134453, 12.51309526, 12.82417238,
        13.12579452, 13.41625782, 13.69662183, 13.96667426, 14.22598532,
        14.47524586, 14.71421529, 14.94293638, 15.16187555, 15.37183358,
        15.57232708, 15.76452056, 15.94803547, 16.12338095, 16.29014647,
        16.44925392, 16.60053497, 16.74315131, 16.87915908, 17.00718397]),
 array([0.73291583, 0.7313737 , 0.73026982, 0.72895068, 0.72807288,
        0.72644675, 0.72526554, 0.72393495, 0.72242   , 0.7212948 ,
        0.71968825, 0.71860484, 0.71747353, 0.71606266, 0.71474266,
        0.71332228, 0.71204709, 0.71053558, 0.70960702, 0.708385

In [136]:
generate_prediction(model, tf.convert_to_tensor(train_dataset[np.newaxis,0,:]))

<tf.Tensor: shape=(1, 1000), dtype=float32, numpy=
array([[-4.94102128e-02, -2.88863247e-03,  1.78602472e-01,
        -3.41955572e-01, -3.26657653e-01,  8.76727998e-02,
         2.27693636e-02, -1.61580592e-01, -9.41136628e-02,
        -7.49104545e-02, -5.57398722e-02,  3.57925057e-01,
         2.88915128e-01, -5.22286057e-01, -8.59896690e-02,
        -2.21987754e-01,  2.48619348e-01,  1.66729555e-01,
        -4.78276312e-01, -2.48326346e-01, -4.67828631e-01,
        -1.08001694e-01, -2.65863717e-01, -1.20846629e-01,
         5.29498100e-01, -2.80808568e-01, -1.03622481e-01,
        -3.56401294e-01,  2.71313965e-01,  1.60622954e-01,
         5.04061460e-01,  2.15388700e-01, -2.39580393e-01,
         1.21649802e-02,  5.27485490e-01,  1.01203909e-02,
         4.34294283e-01,  3.30967695e-01,  9.44079012e-02,
        -2.26312160e-01,  1.67598769e-01,  2.74378210e-02,
         2.55852014e-01, -3.50164980e-01,  3.66278917e-01,
        -3.29786152e-01,  2.49628156e-01,  4.24291909e-01,
     