In [1]:
# Kafka connection settings, read from the notebook's Databricks widgets.
KafkaServerIP = dbutils.widgets.get("ip")
KafkaServerPort = dbutils.widgets.get("port")
srcTopicName = dbutils.widgets.get("topicname")

# The "frameSize" widget is a colon-separated list of integers; the model
# builder below reads its first two entries.
frameSize = list(map(int, dbutils.widgets.get("frameSize").split(":")))

In [2]:
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import Row, SparkSession

from keras.models import model_from_json
from keras.models import Sequential
from keras.layers import Conv2D, Dropout, Merge, Dense
from keras.layers.embeddings import Embedding
from keras.callbacks import TensorBoard
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.layers import advanced_activations
from keras import initializers
from keras.layers.pooling import GlobalMaxPooling2D

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Lambda, Input, Dense, Conv2D, Flatten, Reshape, Conv2DTranspose
from tensorflow.keras.losses import mse, binary_crossentropy, kl_divergence
from tensorflow.keras import optimizers
from tensorflow.keras import backend as K

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()


import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
"""
This spark session isn't needed in DATABRICKS,
uncomment the next line if you do not wish to
execute the notebook in Databricks
"""
#spark = (SparkSession.builder.master("local[8]").config("spark.driver.cores", 8).appName("KerasStream").getOrCreate() )

# Reuse the SparkContext behind the existing `spark` session.
sc = spark.sparkContext

# Streaming context that cuts the stream into micro-batches every 3 seconds.
ssc = StreamingContext(sc, batchDuration=3)

# Direct (receiver-less) Kafka stream over the configured topic and broker.
kafkaStream = KafkaUtils.createDirectStream(
    ssc,
    [f'{srcTopicName}'],
    {"metadata.broker.list": f'{KafkaServerIP}:{KafkaServerPort}'},
)

def getSparkSessionInstance(sparkConf):
    """Return the module-wide SparkSession, creating it on first use.

    The session is cached in this module's globals under the key
    ``sparkSessionSingletonInstance``; later calls return the cached object
    and ignore ``sparkConf``.

    Args:
        sparkConf: Spark configuration used to build the session the first
            time the function is called.

    Returns:
        The cached (or newly created) SparkSession.
    """
    module_scope = globals()
    cache_key = "sparkSessionSingletonInstance"
    try:
        return module_scope[cache_key]
    except KeyError:
        builder = SparkSession.builder.config(conf=sparkConf)
        module_scope[cache_key] = builder.getOrCreate()
        return module_scope[cache_key]

In [4]:
# draws a latent sample from the encoder's (mean, log-variance) outputs
def latent(args):
    """Sample a latent vector with the reparameterization trick.

    Args:
        args: A pair ``(z_mean, z_log_var)`` of backend tensors; the second
            is used as a log-variance (it is halved before exponentiation).

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is
        standard-normal noise with the same (batch, dim) extent as ``z_mean``.
    """
    z_mean, z_log_var = args
    n_samples = K.shape(z_mean)[0]      # dynamic batch size
    n_latent = K.int_shape(z_mean)[1]   # static latent width
    noise = K.random_normal(shape=(n_samples, n_latent))
    std_dev = K.exp(0.5 * z_log_var)
    return z_mean + std_dev * noise


# computes the total VAE loss: reconstruction term plus KL divergence
def total_loss(true, pred):
    """Return the mean of (scaled reconstruction loss + KL divergence).

    Args:
        true: Target tensor.
        pred: Reconstructed tensor produced by the model; indexed as
            (batch, height, width, ...) to obtain the per-frame pixel count.

    Returns:
        A scalar backend tensor.

    Notes:
        ``mu`` and ``sigma`` are not parameters; they are looked up in module
        scope when the loss is traced. NOTE(review): in this notebook they are
        only bound as locals inside ``vae_model``, so they must be exposed at
        module level (or this loss rebuilt as a closure) before it can run.
    """
    # Scale factor height * width, taken from the prediction rather than from
    # a training array that does not exist in this notebook.
    pred_dims = K.shape(pred)
    frame_pixels = K.cast(pred_dims[1] * pred_dims[2], K.dtype(pred))
    reconstruction_loss = binary_crossentropy(K.flatten(true), K.flatten(pred)) * frame_pixels

    # KL divergence to a standard normal. `sigma` is a log-variance (the
    # sampler uses exp(0.5 * sigma) as the standard deviation), so the variance
    # term is exp(sigma), not exp(sigma) squared.
    kl_loss = -0.5 * K.sum(1 + sigma - K.square(mu) - K.exp(sigma), axis=-1)

    return K.mean(reconstruction_loss + kl_loss)

# root-mean-square error along axis 1
def RMSE(v1, v2):
    """Return the root-mean-square difference of ``v1`` and ``v2`` over axis 1.

    Args:
        v1: Array-like supporting ``-`` with ``v2``; must have at least 2 axes.
        v2: Array-like broadcastable against ``v1``.

    Returns:
        The result of ``sqrt(mean((v1 - v2)^2, axis=1))``.
    """
    residual = v1 - v2
    mean_squared = np.mean(residual ** 2, axis=1)
    return np.sqrt(mean_squared)


In [5]:
def vae_model():
    """Build and compile the convolutional variational autoencoder.

    The input shape is ``(30, frameSize[0], frameSize[1])`` (``frameSize`` is
    read from module scope). The encoder ends in two dense heads, ``mu`` and
    ``sigma``, from which ``latent`` draws a 2-dimensional sample. The decoder
    expands a latent vector with a stride-2 transposed convolution followed by
    a sigmoid convolution that has as many filters as the input has channels.

    The loss is defined here as a closure because it needs the ``mu`` and
    ``sigma`` tensors, which only exist inside this function.

    Returns:
        The compiled end-to-end Keras ``Model`` named ``vae_model``
        (Adam optimizer, learning rate 0.001).
    """
    input_shape = (30, frameSize[0], frameSize[1])
    latent_dim = 2

    # Encoder model
    encoder_inputs = Input(shape=input_shape, name='encoder_input')

    # NOTE(review): the result of this first convolution is overwritten on the
    # next statement, so only the 16-filter layer is part of the model. It is
    # deliberately NOT chained here: two stride-2 stages would turn the
    # 30-sized axis into 30 -> 15 -> 8 -> 16 -> 32 on the way back, so the
    # reconstruction would no longer match the input. Confirm the intended
    # depth before changing this.
    encoder_conv = Conv2D(filters=8, kernel_size=3, strides=2,
                          padding='same', activation='relu')(encoder_inputs)
    encoder_conv = Conv2D(filters=16, kernel_size=3, strides=2,
                          padding='same', activation='relu')(encoder_inputs)
    encoder = Flatten()(encoder_conv)

    mu = Dense(latent_dim, name='mu')(encoder)
    sigma = Dense(latent_dim, name='sigma')(encoder)
    latent_space = Lambda(latent, output_shape=(latent_dim,))([mu, sigma])
    encoder = Model(encoder_inputs, latent_space, name='encoder')

    # Shape of the last encoder feature map; the decoder starts from it.
    conv_shape = K.int_shape(encoder_conv)

    # Decoder model
    inputs = Input(shape=(latent_dim,))

    decoder = Dense(conv_shape[1] * conv_shape[2] * conv_shape[3], activation='relu')(inputs)
    decoder = Reshape((conv_shape[1], conv_shape[2], conv_shape[3]))(decoder)
    # NOTE(review): same pattern as in the encoder - the 16-filter transposed
    # convolution is discarded and only the 8-filter one is used.
    decoder_conv = Conv2DTranspose(filters=16, kernel_size=3, strides=2,
                                   padding='same', activation='relu')(decoder)
    decoder_conv = Conv2DTranspose(filters=8, kernel_size=3, strides=2,
                                   padding='same', activation='relu')(decoder)
    # The output needs one filter per input channel. Use the model's own input
    # shape instead of a training array that is not defined in this notebook.
    decoder_conv = Conv2DTranspose(filters=input_shape[-1], kernel_size=3,
                                   padding='same', activation='sigmoid')(decoder_conv)

    decoder = Model(inputs, decoder_conv, name='decoder')

    # Combine the encoder and decoder to define the full model.
    outputs = decoder(encoder(encoder_inputs))
    vae = Model(encoder_inputs, outputs, name='vae_model')

    # Number of spatial positions per frame; scales the reconstruction term.
    frame_pixels = input_shape[0] * input_shape[1]

    def vae_loss(true, pred):
        """Scaled binary cross-entropy plus KL divergence to a unit normal."""
        reconstruction_loss = binary_crossentropy(K.flatten(true), K.flatten(pred)) * frame_pixels
        # `sigma` is a log-variance (the sampler uses exp(0.5 * sigma)), so the
        # variance term of the KL divergence is exp(sigma).
        kl_loss = -0.5 * K.sum(1 + sigma - K.square(mu) - K.exp(sigma), axis=-1)
        return K.mean(reconstruction_loss + kl_loss)

    opt = optimizers.Adam(learning_rate=0.001)
    vae.compile(optimizer=opt, loss=vae_loss)
    return vae

In [6]:
# Build the VAE once on the driver and serialise its architecture to JSON.
core_model = vae_model()
model_json = core_model.to_json()

# Keep a copy of the architecture in the working directory.
with open("model.json", mode="w") as f:
    f.write(model_json)

# Broadcast the JSON so code running in tasks can rebuild the model from it.
broadcast_model = sc.broadcast(model_json)

def learn_infer(data):
    """Rebuild the model from the broadcast JSON and load its weights.

    Args:
        data: One stream record. NOTE(review): currently unused.

    Returns:
        None - the function has no return statement, so every row produced by
        the caller carries ``None``. NOTE(review): this looks unfinished; the
        loaded model is never applied to ``data``. The model is also rebuilt
        and the weights re-read from ``./myWeights.h5`` on every call.
    """
    architecture = broadcast_model.value
    restored = model_from_json(architecture)
    restored.load_weights("./myWeights.h5")
    

# Keep only the second element of each stream record
# (presumably the message value of a (key, value) pair - confirm).
instances = kafkaStream.map(lambda record: record[1])

def myMap(time, rdd):
    """Process one micro-batch: run ``learn_infer`` per record and show the result.

    Args:
        time: Batch time passed by ``foreachRDD`` (unused).
        rdd: RDD holding the records of the current micro-batch.

    Returns:
        None. For a non-empty batch the resulting DataFrame is printed with
        ``show()``; an empty batch is skipped.
    """
    # Without an explicit schema, createDataFrame infers it from the first row
    # and raises on an empty RDD. Skip idle intervals instead of failing.
    if rdd.isEmpty():
        return

    session = getSparkSessionInstance(rdd.context.getConf())

    rowRdd = rdd.map(lambda w: Row(sen=learn_infer(w)))
    dfrows = session.createDataFrame(rowRdd)
    dfrows.show()



# Register myMap as the per-batch handler, then start the streaming context.
# The handler must be registered before start() is called.
instances.foreachRDD(myMap)
ssc.start()

In [7]:
# End the notebook run and hand this status message back to the caller.
dbutils.notebook.exit("The distributed training has started successfully")