In [27]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import pandas as pd

from keras import backend as K

from keras.layers import Input, Dense, Lambda, Layer, Add, Multiply
from keras.models import Model, Sequential
from utils import get_most_popular_diagnoses, TrainValTensorBoard
from sklearn.model_selection import train_test_split

In [28]:
def nll(y_true, y_pred):
    """Bernoulli negative log-likelihood, summed over the last axis.

    ``keras.losses.binary_crossentropy`` averages over the last axis;
    here the element-wise backend cross-entropy is summed instead, so the
    reconstruction term is a total over features rather than a mean.

    Args:
        y_true: target tensor.
        y_pred: predicted probabilities, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced by summation.
    """
    elementwise_bce = K.binary_crossentropy(y_true, y_pred)
    return K.sum(elementwise_bce, axis=-1)


class KLDivergenceLayer(Layer):
    """Pass-through layer that registers a KL penalty on the model loss.

    Called on the pair ``[mu, log_var]``, it returns that pair unchanged.
    As a side effect it adds, via ``add_loss``, the batch mean of
    KL( N(mu, exp(log_var)) || N(0, I) ) computed in closed form.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): the consumer of ``is_placeholder`` is not visible in
        # this file; the flag is kept exactly as the original set it.
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):
        """Add the KL term to the layer's losses and return ``inputs`` as-is."""
        mu, log_var = inputs

        # Per latent unit: 1 + log(sigma^2) - mu^2 - sigma^2
        per_unit = 1 + log_var - K.square(mu) - K.exp(log_var)
        # Sum over the last (latent) axis -> one KL value per sample.
        kl_per_sample = -0.5 * K.sum(per_unit, axis=-1)

        self.add_loss(K.mean(kl_per_sample), inputs=inputs)

        return inputs

In [32]:
# --- Network sizes ---------------------------------------------------------
original_dim = 355       # width of the model input and of the decoder output
intermediate_dim = 256   # units in the hidden ReLU layer (encoder and decoder)
latent_dim = 10          # size of the latent code (z_mu / z_log_var heads)

# --- Training schedule (passed to vae.fit) ---------------------------------
epochs = 70
batch_size = 64

# --- Sampling --------------------------------------------------------------
epsilon_std = 1.0        # stddev of the Gaussian noise fed to the sampler

In [48]:
# Decoder: latent vector -> hidden ReLU layer -> per-feature sigmoid output.
decoder = Sequential()
decoder.add(Dense(intermediate_dim, input_dim=latent_dim, activation='relu'))
decoder.add(Dense(original_dim, activation='sigmoid'))

# Encoder: one shared hidden layer feeding two linear heads, one for the
# latent mean and one for the latent log-variance.
x = Input(shape=(original_dim,))
h = Dense(intermediate_dim, activation='relu')(x)
z_mu = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

# Route both heads through the KL layer: values pass through unchanged, but
# the KL penalty becomes part of the model's loss. The names are rebound on
# purpose so everything downstream is connected to that layer.
z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])
z_sigma = Lambda(lambda log_var: K.exp(0.5 * log_var))(z_log_var)

# Reparameterisation: z = mu + sigma * eps, where eps is drawn by the backend
# with one row per row of x. It is wired in as a tensor-backed Input.
eps = Input(tensor=K.random_normal(shape=(K.shape(x)[0], latent_dim),
                                   stddev=epsilon_std))
z_eps = Multiply()([z_sigma, eps])
z = Add()([z_mu, z_eps])

x_pred = decoder(z)

# Reconstruction loss is nll; the KL term is contributed by KLDivergenceLayer.
vae = Model(inputs=[x, eps], outputs=x_pred)
vae.compile(loss=nll, optimizer='rmsprop')

In [49]:
# Load the training feature matrix from disk.
# It is fed straight to the model, so it must have original_dim columns
# to match the Input layer.
X = np.load('./data/topics_train_ngramm.npy')

In [50]:
# Hold out part of X for validation during training. With no test_size or
# train_size given, scikit-learn defaults to a 75% / 25% split.
# A fixed random_state makes the partition identical on every run; without
# it the rows used for validation change each time the cell is executed.
X_train, X_test = train_test_split(X, random_state=42)

In [40]:
# Train the VAE as an autoencoder on the topic features: each matrix is used
# as both the input and the reconstruction target.
vae.fit(
    X_train,
    X_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    verbose=0,  # no per-epoch console output
    validation_data=(X_test, X_test),
    # NOTE(review): TrainValTensorBoard comes from utils; presumably it logs
    # train/validation metrics under log_dir -- confirm in utils.
    callbacks=[TrainValTensorBoard(log_dir='./topics_vae_logs')],
)

# Encoder model: maps an input row to its latent mean z_mu. The noise input
# eps is not part of this graph.
encoder = Model(x, z_mu)

In [51]:
# Encode the full matrix X (the training rows and the held-out validation
# rows alike) into latent means.
encoded_topics = encoder.predict(X)

In [52]:
# Sanity check: expect (number of rows in X, latent_dim).
encoded_topics.shape

(61976, 10)

In [53]:
# Persist the latent encoding of X to disk.
np.save('./data/train_encoded_topics.npy',encoded_topics)

In [45]:
# Load the separate test-set matrix.
# NOTE(review): presumably built with the same feature layout as X
# (original_dim columns) -- verify, since it is passed to the same encoder.
test = np.load("./data/topics_test_ngramm.npy")

In [46]:
# Encode the test-set rows with the same encoder used for X.
encoded_test = encoder.predict(test)

In [47]:
# Persist the latent encoding of the test set to disk.
np.save('./data/test_encoded_topics.npy',encoded_test)