# Variational Autoencoder (VAE)

In [1]:
# Preprocessing
import pprint
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Deep Learning
from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras import backend as K
from keras.losses import mse, binary_crossentropy

# Saving Model
import os
from pickle import dump, load

# Making predictions
from sklearn.neighbors import NearestNeighbors

# Not used in current notebook
# from keras.utils import plot_model
# from keras.datasets import mnist
# import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the song catalogue into a pandas DataFrame.
# The path is relative, so the CSV has to be in the notebook's working directory.
songs = pd.read_csv("song_list5.csv")

In [3]:
# Preview the first rows to check which columns were loaded
songs.head()

Unnamed: 0,songid,artist,track,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,5X4Qm0rVLcZeeO4tSDmBg3,Jack Bruce,Running Thro' Our Hands,0.456,0.255,9.0,-15.805,1.0,0.048,0.946,0.17,0.951,0.0532,116.424,253067.0,4.0
1,1pNpt53PZPet9dvJN3RKGr,Prefuse 73,Parachute Panador,0.535,0.806,7.0,-10.289,1.0,0.0642,0.00436,0.0191,0.457,0.376,90.089,63733.0,4.0
2,3oxz2oCzAWdPzA6In2zA5u,Pasion Vega,La Gata Bajo La Lluvia,0.294,0.482,5.0,-6.406,1.0,0.043,0.463,0.0,0.335,0.204,166.693,255280.0,3.0
3,05JGVUwt7XJk5FPqH0Wsch,Jonny Lang,Walking Away,0.563,0.631,0.0,-5.144,1.0,0.0324,0.0635,8e-06,0.163,0.54,115.657,254827.0,4.0
4,3xdgCFMTn6ut8fZYxfAuR0,Skye,All the Promises,0.358,0.611,2.0,-9.752,0.0,0.0454,0.515,0.000468,0.149,0.164,171.596,256933.0,4.0


In [4]:
# The twelve audio-feature columns fed to the model, in input order.
# The id, artist, track and time_signature columns are deliberately left out.
feature_columns = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# Pull those columns out of the DataFrame as a plain 2-D NumPy array
features = songs[feature_columns].to_numpy()

In [5]:
# Sanity-check the feature matrix: print its (rows, columns) shape,
# then pretty-print the first row of raw (unscaled) values.
print(features.shape)
pprint.pprint(features[0])

(49985, 12)
array([ 4.56000e-01,  2.55000e-01,  9.00000e+00, -1.58050e+01,
        1.00000e+00,  4.80000e-02,  9.46000e-01,  1.70000e-01,
        9.51000e-01,  5.32000e-02,  1.16424e+02,  2.53067e+05])


In [6]:
# Create the scaler; it is not fitted until fit/fit_transform is called on it
scaler = StandardScaler()

In [7]:
# Fit the scaler on the feature matrix and transform it in one step.
# Note: the scaler is fitted on every row of `features`; no rows are held out.
x_train = scaler.fit_transform(features)

In [8]:
# Save the scaler to a new file
# dump(scaler, open('scaler.pkl', 'wb'))

In [9]:
# Check the scaled data: the shape should match `features`, and the first row
# should now hold standardized values instead of the raw ones.
print(x_train.shape)
pprint.pprint(x_train[0])

(49985, 12)
array([-0.47695143, -1.43616823,  1.03467011, -1.32564479,  0.70711739,
       -0.34028656,  1.91669133, -0.14321669,  3.87500921, -1.70714343,
       -0.17066299,  0.03404049])


In [10]:
# reparameterization method for lambda layer
# check this link out for research
# https://stats.stackexchange.com/questions/199605/how-does-the-reparameterization-trick-for-vaes-work-and-why-is-it-important
def sampling(args):
    """Draw a latent sample using the reparameterization trick.

    Args:
        args: a pair ``(z_mean, z_log_sigma)`` of backend tensors. The second
            tensor is halved before being exponentiated, i.e. it is used as
            a log-variance term.

    Returns:
        ``z_mean + exp(0.5 * z_log_sigma) * epsilon``, where ``epsilon`` is
        random normal noise with one value per (batch row, latent dimension).
    """
    mean, log_var = args

    # Batch size is only known at run time, the latent width is static
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    epsilon = K.random_normal(shape=noise_shape)

    # Scale the noise and shift it by the mean: z = mean + scale * epsilon
    scale = K.exp(0.5 * log_var)
    return mean + scale * epsilon

## Build the model

In [11]:
# Encoder: 12 input features -> 512 -> 256 -> 128 -> two 2-unit latent heads
encoder_input = Input(shape=(12,), name='Encoder_Input')

# Hidden stack of ReLU layers, each narrower than the one before it
encoder_dense1 = Dense(
    units=512, activation='relu', name='Encoder_Dense1'
)(encoder_input)
encoder_dense2 = Dense(
    units=256, activation='relu', name='Encoder_Dense2'
)(encoder_dense1)
encoder_dense3 = Dense(
    units=128, activation='relu', name='Encoder_Dense3'
)(encoder_dense2)

# Two parallel linear heads on the last hidden layer; together they are the
# inputs the reparameterization step needs.
z_mean = Dense(units=2, name='z_mean')(encoder_dense3)
z_log_sigma = Dense(units=2, name='z_log_sigma')(encoder_dense3)

# Wrap `sampling` in a Lambda layer so the random draw is part of the graph
z = Lambda(sampling, output_shape=(2,), name='z')([z_mean, z_log_sigma])

# The encoder model returns all three tensors: [z_mean, z_log_sigma, z]
encoder = Model(
    inputs=encoder_input,
    outputs=[z_mean, z_log_sigma, z],
    name='Encoder',
)
encoder.summary()
# plot_model(encoder, to_file='vae_mlp_encoder.png', show_shapes=True)

Model: "Encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Input (InputLayer)      (None, 12)           0                                            
__________________________________________________________________________________________________
Encoder_Dense1 (Dense)          (None, 512)          6656        Encoder_Input[0][0]              
__________________________________________________________________________________________________
Encoder_Dense2 (Dense)          (None, 256)          131328      Encoder_Dense1[0][0]             
__________________________________________________________________________________________________
Encoder_Dense3 (Dense)          (None, 128)          32896       Encoder_Dense2[0][0]             
____________________________________________________________________________________________

In [12]:
# Decoder: 2-D latent point -> 128 -> 256 -> 512 -> 12 reconstructed features
latent_inputs = Input(shape=(2,), name='z_sampling')

# Hidden stack of ReLU layers, each wider than the one before it
decoder_dense1 = Dense(
    units=128, activation='relu', name='Decoder_Dense1'
)(latent_inputs)
decoder_dense2 = Dense(
    units=256, activation='relu', name='Decoder_Dense2'
)(decoder_dense1)
decoder_dense3 = Dense(
    units=512, activation='relu', name='Decoder_Dense3'
)(decoder_dense2)

# Linear output layer (no activation), one unit per input feature
decoder_output = Dense(units=12, name='Decoder_Output')(decoder_dense3)

# Wrap the layers into a standalone decoder model
decoder = Model(inputs=latent_inputs, outputs=decoder_output, name='Decoder')
decoder.summary()
# plot_model(decoder, to_file='vae_mlp_decoder.png', show_shapes=True)

Model: "Decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
z_sampling (InputLayer)      (None, 2)                 0         
_________________________________________________________________
Decoder_Dense1 (Dense)       (None, 128)               384       
_________________________________________________________________
Decoder_Dense2 (Dense)       (None, 256)               33024     
_________________________________________________________________
Decoder_Dense3 (Dense)       (None, 512)               131584    
_________________________________________________________________
Decoder_Output (Dense)       (None, 12)                6156      
Total params: 171,148
Trainable params: 171,148
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Chain encoder and decoder end to end.
# The encoder returns [z_mean, z_log_sigma, z]; only the sampled z (index 2)
# is passed on to the decoder.
encoder_outputs = encoder(encoder_input)
outputs = decoder(encoder_outputs[2])

vae = Model(inputs=encoder_input, outputs=outputs, name='VAE_Model')
vae.summary()

Model: "VAE_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_Input (InputLayer)   (None, 12)                0         
_________________________________________________________________
Encoder (Model)              [(None, 2), (None, 2), (N 171396    
_________________________________________________________________
Decoder (Model)              (None, 12)                171148    
Total params: 342,544
Trainable params: 342,544
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Reconstruction loss: mse() averages the squared error over the last axis
# (the 12 features), so multiplying by 12 rescales it to a per-sample sum.
reconstruction_loss = mse(encoder_input, outputs) * 12

# KL loss: -0.5 * sum(1 + z_log_sigma - z_mean^2 - exp(z_log_sigma)),
# summed over the latent axis.
kl_terms = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma)
kl_loss = K.sum(kl_terms, axis=-1) * -0.5

# Total loss: batch mean of reconstruction + KL
vae_loss = K.mean(reconstruction_loss + kl_loss)

# The loss is attached to the model directly, so compile() only needs the
# optimizer and fit() needs no target array.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.summary()
# plot_model(vae,
#             to_file='vae_mlp.png',
#             show_shapes=True)

Model: "VAE_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_Input (InputLayer)   (None, 12)                0         
_________________________________________________________________
Encoder (Model)              [(None, 2), (None, 2), (N 171396    
_________________________________________________________________
Decoder (Model)              (None, 12)                171148    
Total params: 342,544
Trainable params: 342,544
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the VAE on the scaled features. Only inputs are passed: the loss was
# attached with vae.add_loss(), so no target array is needed.
# NOTE: the saved output below lists 50 epochs, so it was produced by an
# earlier run with a different `epochs` value than the 3 set here.
vae.fit(x_train,
        epochs=3, # Set down to 3 for Demo Purposes
        batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x18637476c88>

In [16]:
# Current Best Loss 8.1934
# dump(encoder, open('VAE_Encoder.pkl', 'wb'))

# Making Predictions

In [17]:
# Load the pickled encoder from disk.
# NOTE: the dump() call that would write 'VAE_Encoder.pkl' is commented out in
# this notebook, so this cell depends on a file saved by an earlier session and
# does not pick up the encoder trained above.
# SECURITY: unpickling can execute arbitrary code; only load a file you created
# yourself or otherwise trust.
# The `with` block closes the file handle as soon as loading finishes.
with open('VAE_Encoder.pkl', 'rb') as encoder_file:
    encoder_test = load(encoder_file)

In [18]:
# Run every scaled song through the loaded encoder to get its latent features.
# NOTE(review): presumably encoder_test has the same three outputs as `encoder`
# ([z_mean, z_log_sigma, z]), which would make preds[0] the z_mean array - confirm.
preds = encoder_test.predict(x_train)
preds[0]

array([[-0.03490391, -2.8897223 ],
       [-0.2080349 ,  0.02317297],
       [ 0.30692878,  0.01386828],
       ...,
       [-1.6854535 , -0.79184806],
       [-0.77382445,  2.0584621 ],
       [ 2.3800442 , -0.77051663]], dtype=float32)

In [19]:
# Fit a nearest-neighbour index on the 2-D latent coordinates in preds[0].
# One extra neighbour is requested on top of n_neighbors - presumably because
# the index is later queried with the same points it was fitted on, so each
# song's own row comes back among its matches.
n_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=(n_neighbors+1), algorithm='ball_tree').fit(preds[0])

In [20]:
# Look up the neighbours of every song in one call.
# Row i of `indices` holds the row positions of song i's nearest neighbours,
# and row i of `distances` the matching distances in latent space.
distances, indices = nbrs.kneighbors(preds[0])

In [21]:
# Show the songs matched to the first song (row 0); the positional indices
# from kneighbors are used to select rows of the original DataFrame.
songs.iloc[indices[0]]

Unnamed: 0,songid,artist,track,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,5X4Qm0rVLcZeeO4tSDmBg3,Jack Bruce,Running Thro' Our Hands,0.456,0.255,9.0,-15.805,1.0,0.048,0.946,0.17,0.951,0.0532,116.424,253067.0,4.0
48250,4lcMjDKPJdsXGeP97mnbkJ,Bright Eyes,The Biggest Lie,0.386,0.102,0.0,-19.3,1.0,0.0626,0.894,0.000327,0.891,0.325,152.483,168067.0,4.0
49381,0mTPxzmCSRpjPxJLqyLEQA,Mose Allison,How Much Truth (Live),0.519,0.1,1.0,-18.189,1.0,0.0676,0.944,0.000229,0.717,0.239,137.507,170640.0,4.0
2477,7EWWSCdsJXQWvk1lRMgO4y,Kris Kristofferson,The Captive,0.519,0.135,2.0,-16.851,1.0,0.0353,0.812,1.4e-05,0.686,0.312,109.331,195013.0,4.0
39968,1Yj4O4nnZPcNuPL2tmOxUZ,The Irish Tenors,Galway Bay,0.284,0.291,10.0,-14.479,1.0,0.0511,0.858,2.8e-05,0.828,0.237,89.506,175733.0,4.0
31110,5AwcHzpib6rajOz4bm1Ytn,Slapp Happy,Small Hands Of Stone,0.52,0.239,10.0,-13.753,1.0,0.0348,0.879,3e-06,0.681,0.0736,110.381,203080.0,3.0
