In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, Lambda, LSTM, RepeatVector, TimeDistributed, Flatten, Reshape
from keras.models import Model
from keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the preprocessed dataset (Aruba_17) from disk into a DataFrame
processed_data = pd.read_csv(
    filepath_or_buffer='Processed Data/Aruba_17/processed_data.csv',
)

Find the largest row count that is evenly divisible by 32, given the length of the dataset, since the number of rows needs to be evenly divisible by the batch size.

In [None]:
# Largest row count that is a whole multiple of 32 (the batch size used later).
max_length = len(processed_data) - len(processed_data) % 32

# Work on at most 3200 rows, but never more than max_length, so the row count
# stays a multiple of 32 even when the dataset has fewer than 3200 rows.
# (For datasets with 3200+ rows this keeps exactly the first 3200 rows.)
MAX_ROWS = 3200
processed_data = processed_data.head(min(max_length, MAX_ROWS))

# Extract the relevant columns from the dataset as NumPy arrays
timestamp = processed_data['Timestamp'].values
device_id = processed_data['Device ID'].values
status = processed_data['Status'].values
activity = processed_data['Activity'].values
activity_status = processed_data['Activity Status'].values

Old Normalization code, not used anymore since it resulted in a bunch of negative values.

In [None]:
# X = np.stack((timestamp, device_id, status, activity, activity_status), axis=1)

# # Normalize the data using z-score normalization
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# # Scale the values to be within the range of 0 to 1
# min_max_scaler = MinMaxScaler()
# X = min_max_scaler.fit_transform(X)

Implement Tensorboard to visualize the training process. This is not used in the final version of the code.

In [None]:
# Directory that receives the TensorBoard event files
log_dir = "logs/"

# Keras callback that writes training logs to `log_dir`;
# histogram_freq=1 asks for histograms to be computed every epoch
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

Use `np.stack` to assemble the feature columns into a single array, then normalize it with min-max scaling.

In [None]:
# Assemble the five feature columns side by side into one (n_rows, 5) array
feature_columns = (timestamp, device_id, status, activity, activity_status)
X = np.stack(feature_columns, axis=1)

# Rescale each feature to the [0, 1] range. The fitted `scaler` is kept so the
# scaling can be inverted on generated data later in the notebook.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)

There were 14 specific activity patterns I wanted to recognize, which is why I chose 14 as the number of clusters. However, this code needs to be studied and modified further. Next we have the batch size, validation split, timesteps, and input dimensions.

- #### The input dimensions represent the number of features in the dataset, in this case the number of features is 5.
- #### The batch size is chosen based on the following factors:
  - i) the amount of memory available for training
  - ii) the time it takes to train a single batch
  - iii) the variance in the loss function due to the stochastic nature of the gradient updates
  - A larger batch size can be faster to train and provide a more stable loss function, but they require more memory and may not generalize as well.
  - A smaller batch size may require more time to train and have a noisier loss function, but it can generalize better.
  - Current batch size is 32 based on research online. But this will likely be changed in the future.
- #### Validation Split:
  - The fraction of the dataset used for validation during training. It should be chosen to provide enough validation data to accurately estimate the performance of the model without reducing the amount of data available for training. The current validation split is 0.2, which is typical, and means 20% of the data is used for validation.
- #### Timesteps:
  - This is the number of previous records considered in the model. The value of timesteps should be chosen based on the time dependence of the data. If the data has long-term dependencies, a larger value of timesteps may be required to capture the dependencies. The current value is 128, but I will likely attempt increasing this value, as some patterns in the data have long term time dependence. Having a value of 128 means the previous 128 records are considered to predict the next record. 

In [None]:
# Windowing / training hyperparameters
batch_size = 32           # sequences per batch
validation_split = 0.2    # fraction of rows held out for validation
timesteps = 128           # number of consecutive records per sequence
input_dim = X.shape[1]    # number of features per record (5 in this dataset)

# Assign every record (row of X) to one of 14 KMeans clusters
kmeans = KMeans(random_state=0, n_clusters=14)
clusters = kmeans.fit_predict(X)

The next block of code separates the data into training and testing splits. Afterward, the data is padded to ensure it is divisible by the desired shape. This was done because there was a constant issue of the data not being divisible by the batch size, even though the values it was returning were divisible. This code resolves the issue, and reshapes the data into the desired shape.

In [None]:
def _pad_and_window(features, labels, block_rows, timesteps, input_dim):
    """Zero-pad rows up to a multiple of `block_rows`, then cut them into windows.

    Args:
        features: 2-D array of shape (n_rows, input_dim).
        labels: 1-D array with one label per row of `features`.
        block_rows: the row count is padded up to the next multiple of this.
        timesteps: number of consecutive rows per output window.
        input_dim: number of feature columns.

    Returns:
        (features, labels) reshaped to (-1, timesteps, input_dim) and
        (-1, timesteps) respectively.

    Note:
        Padding rows and padding labels are all zeros. Because np.zeros is
        float, padded label arrays become float, and the padding label 0 is
        indistinguishable from a real cluster id 0.
    """
    remainder = features.shape[0] % block_rows
    if remainder > 0:
        n_pad = block_rows - remainder
        features = np.concatenate([features, np.zeros((n_pad, input_dim))])
        labels = np.concatenate([labels, np.zeros((n_pad,))])
    return (
        features.reshape((-1, timesteps, input_dim)),
        labels.reshape((-1, timesteps)),
    )


# Split the data into training and validation sets.
# shuffle=False keeps the rows in their original order, so the validation set
# is the last `validation_split` fraction of the records.
X_train, X_val, y_train, y_val = train_test_split(X, clusters, test_size=validation_split, shuffle=False)

# Pad each split to a multiple of batch_size * timesteps rows so that, after
# windowing, the number of sequences is a whole multiple of batch_size.
rows_per_block = batch_size * timesteps
X_train, y_train = _pad_and_window(X_train, y_train, rows_per_block, timesteps, input_dim)
X_val, y_val = _pad_and_window(X_val, y_val, rows_per_block, timesteps, input_dim)

The latent_dim is the dimension of the latent space in the VAE model. It represents the number of dimensions in which the data can be compressed while preserving most of its original information. A larger latent dimension can provide a more accurate reconstruction of the input data but may require more training data and computational resources.

The encoding_dim is the dimension of the hidden state in the encoder part of the VAE-RNN model. It represents the number of dimensions in which the data is compressed before being mapped to the latent space. A larger encoding dimension can provide a more accurate compression of the input data but may also require more computational resources.

Typical values of the latent_dim are around 2-4.
Typical values of the encoding_dim are around 32-64.

The optimal values for these hyperparameters will be determined through evaluation of the model once the validation code is implemented properly.

In [None]:
# latent_dim: size of the VAE latent space
# encoding_dim: width of the encoder's final LSTM layer
latent_dim, encoding_dim = 2, 32

The encoder takes input data and maps it to the latent space, which is a lower-dimensioned representation of the input data. 

The decoder is the second part of the model that takes the output of the encoder and maps it back to the original input data space. 

The latent space is a lower-dimensional representation of the input data that is learned by the encoder. The latent space is typically much smaller than the original input data space, which allows the model to compress the input data.

##### LSTM
An LSTM layer is a type of Recurrent Neural Network (RNN) layer that can remember information over long periods. It is used to process sequential data, such as time-series data, and has a cell state that can store information over time.

The LSTM layer in the encoder is used to process the input data and extract relevant features.
The two LSTM layers in the encoder have different return_sequence values because the first LSTM layer returns a sequence of outputs for each time step, while the second LSTM layer returns only the last output of the sequence.

In the encoder:
First LSTM returns a sequence of outputs for each time step, which provides the model with more information about the input data. 
Second LSTM returns only the last output of the sequence, which captures higher-level temporal information.

In the decoder:
The LSTM layer is used to decode the latent representation back into the original input data space.

##### Dense
The Dense layer is a fully connected layer that connects every neuron in one layer to every neuron in the next layer. It transforms the input data into a lower-dimensional representation, which is learned by the model.

The Dense layer in the decoder is used to reconstruct the original input data from the latent space.

##### Reshape
The Reshape layer changes the shape of its input without changing its content. The LSTM layer in the decoder expects a three-dimensional tensor (batch, timesteps, features) as input, so the Reshape layer is used to convert the one-dimensional vector produced by the preceding Dense layer into a three-dimensional tensor.

##### TimeDistributed
This layer applies a layer to each time step of the input sequence independently. This is particularly useful in sequence-to-sequence models, where we want to apply a layer to each time step of the input sequence and obtain a corresponding output sequence.

The TimeDistributed layer is used in the decoder to apply the Dense layer to each time step of the LSTM output sequence. The purpose is to generate a reconstructed sequence that is of the same length as the original input sequence, and where each time step is predicted independently based on the corresponding hidden state of the LSTM layer.

#### Purpose of the Sampling and LAMBDA layers

The sampling function and the Lambda layer are used for the reparameterization trick, which is a technique used in variational autoencoders (VAEs). VAEs are generative models that learn a low-dimensional representation (latent space) of the input data, which can be used to generate new data that resembles the original data.

In a VAE, the encoder maps the input data to a distribution in the latent space, which is usually a multivariate Gaussian distribution with a mean vector and a diagonal covariance matrix. The mean and the log-variance of the Gaussian distribution are outputs of the encoder. The z_mean and z_log_var in the code below represent the mean and log-variance of the distribution, respectively.

However, the gradient of the stochastic gradient descent (SGD) algorithm cannot be propagated through random nodes like the sampling process. Therefore, we use the reparameterization trick, which allows us to sample from the distribution using a deterministic transformation of a random noise vector.

The sampling function takes z_mean and z_log_var as inputs, and returns a sample from the corresponding Gaussian distribution using a random noise vector generated by K.random_normal. The Lambda layer wraps the sampling function, so that it can be used as a Keras layer. The z output of the Lambda layer is the sampled vector in the latent space, which is used as input to the decoder.

In [None]:

# ==================== ENCODER ====================
# The batch dimension is fixed to batch_size via batch_shape.
inputs = Input(batch_shape=(batch_size, timesteps, input_dim), name='encoder_input')

# First LSTM emits one output per time step; second LSTM keeps only the last one.
encoder_sequence = LSTM(encoding_dim*2, return_sequences=True)(inputs)
encoder_summary = LSTM(encoding_dim, return_sequences=False)(encoder_sequence)

# Two parallel Dense heads parameterize the latent Gaussian:
#   z_mean    - mean of the latent distribution
#   z_log_var - log-variance (not the variance itself) of the latent distribution
z_mean = Dense(latent_dim, name='z_mean')(encoder_summary)
z_log_var = Dense(latent_dim, name='z_log_var')(encoder_summary)

def sampling(args):
    """Draw a latent sample using the reparameterization trick.

    Args:
        args: pair ``(z_mean, z_log_var)`` of tensors holding the mean and
            log-variance of the latent Gaussian.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is
        standard-normal noise with the same (batch, dim) shape as ``z_mean``.
    """
    mean, log_var = args
    # Batch size is read dynamically; the latent width is read from the static shape.
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    noise = K.random_normal(shape=noise_shape)
    # exp(0.5 * log_var) converts log-variance to standard deviation.
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * noise

# Sample z from the latent distribution described by (z_mean, z_log_var)
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# ================= LATENT SPACE ==================
# Stand-alone input so the decoder can also be driven directly by latent vectors
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')

# ==================== DECODER ====================
# Expand the latent vector to one encoding_dim-wide vector per time step
x = Dense(timesteps * encoding_dim, activation='relu')(latent_inputs)
x = Reshape((timesteps, encoding_dim))(x)
# LSTM layer in the decoder is used to reconstruct the original sequence.
# (No input_shape argument: in the functional API the shape is taken from the
# tensor the layer is called on.)
x = LSTM(encoding_dim, return_sequences=True)(x)
# Sigmoid keeps every reconstructed feature in [0, 1]. This matches the
# MinMax-scaled inputs and is what binary cross-entropy expects when it is
# given probabilities rather than logits.
x = TimeDistributed(Dense(input_dim, activation='sigmoid'))(x)
# LSTM layer in the decoder is used to reconstruct the original sequence

Define the VAE model by combining the encoder and decoder models you previously defined. The VAE model takes in the same input as the encoder model (i.e., inputs) and outputs the same sequence as the decoder model (i.e., outputs).

The loss function is composed of two parts: the reconstruction loss and the KL divergence loss. The reconstruction loss measures the difference between the original input and the output of the VAE model, which is the reconstructed input. In this code, binary cross-entropy is used as the reconstruction loss. The KL divergence loss measures the difference between the latent space distribution and a prior distribution, which is usually a normal distribution. The KL divergence loss encourages the VAE to learn a compact representation of the input data. The vae_loss variable is the sum of the reconstruction loss and the KL divergence loss.

The loss function of VAE has two parts: the reconstruction loss and the KL divergence loss.
The reconstruction loss measures the difference between the original input and the output of the VAE model.
Binary cross-entropy is used as the reconstruction loss in the code.
The KL divergence loss measures the difference between the latent space distribution and a prior distribution, usually a normal distribution.
The KL divergence loss encourages the VAE to learn a compact representation of the input data.
The vae_loss variable is the sum of the reconstruction loss and the KL divergence loss.

In [None]:
# Assemble the encoder, the decoder, and the end-to-end VAE
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
decoder = Model(latent_inputs, x, name='decoder')
outputs = decoder(encoder(inputs)[2])  # index 2 is the sampled latent vector z
vae = Model(inputs, outputs, name='vae')

# Reconstruction term: mean binary cross-entropy over all elements, rescaled by
# the number of elements per sequence (timesteps * input_dim).
reconstruction_loss = K.mean(binary_crossentropy(K.flatten(inputs), K.flatten(outputs)))
reconstruction_loss *= timesteps * input_dim

# KL divergence between N(z_mean, exp(z_log_var)) and the standard-normal prior.
# The closed form is a SUM over the latent dimensions (not a mean), which gives
# one KL value per sequence in the batch.
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5

# Average over the batch so that a single scalar loss is attached to the model
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
# No `loss=` argument: the only training objective is the one added via add_loss
vae.compile(optimizer='adam')
# vae.summary()

Fit the VAE model to the training data

In [None]:

num_epochs = 100

# The VAE's loss is attached with add_loss and is computed from the inputs
# alone, so no targets are supplied. The validation targets are None for the
# same reason (the KMeans labels in y_val are not a reconstruction target).
history = vae.fit(
    X_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_val, None),
    callbacks=[tensorboard_callback],
)

# Save a diagram of the model architecture
plot_model(vae, to_file='model.png', show_shapes=True)

# Use the encoder to generate embeddings for each sequence:
# this model maps an input sequence to the mean of its latent distribution
encoder_model = Model(inputs, z_mean)


Generate embeddings for each sequence in the training data using the encoder model. These embeddings capture the most important features of the input sequences and can be used for various downstream tasks, such as clustering, classification, or visualization. The resulting cluster assignments can provide insights into the structure of the data and can be used for further analysis or modeling.

In [None]:

# Embed every training sequence as the mean of its latent distribution
X_embedded = encoder_model.predict(x=X_train, batch_size=batch_size)
# Potentially change from the encoder_model to the vae_model, it will be slower but will be more expressive and representative of the data.

# Cluster the embeddings. This re-fits the existing `kmeans` object on the
# embeddings, replacing its earlier fit on the raw feature rows.
y_pred = kmeans.fit_predict(X=X_embedded)

In [None]:
# Generate a fake dataset using the VAE model
# NOTE(review): n_samples is the number of generated *sequences*; each sequence
# contributes `timesteps` rows, so the output has n_samples * timesteps rows.
# Confirm this is the intended size.
n_samples = len(processed_data)

# Sample latent vectors from the standard-normal prior (the distribution the KL
# term regularizes the encoder towards) and decode them. Feeding input-shaped
# noise through the encoder would not sample the learned latent space.
latent_samples = np.random.normal(size=(n_samples, latent_dim))
predicted_values = decoder.predict(latent_samples, batch_size=batch_size)

# Flatten (n_samples, timesteps, input_dim) into one record per row,
# then undo the normalization
predicted_values = np.reshape(predicted_values, (-1, input_dim))
predicted_values = scaler.inverse_transform(predicted_values)
# Round each of the values in the array to the nearest integer
predicted_values = np.rint(predicted_values)

# Keep the predictions available as a labelled DataFrame
predicted_data = pd.DataFrame(predicted_values, columns=['Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status'])
# predicted_data['Cluster'] = y_pred.reshape(-1)

# Save the predictions as comma-separated rows without a header. Iterating over
# the array directly writes the same text as looping over DataFrame.iterrows(),
# without the per-row Series construction.
with open('Predictions/Aruba_17_prediction.txt', 'w') as file:
    file.writelines(','.join(map(str, row)) + '\n' for row in predicted_values)

In [None]:
# Training vs. validation loss per epoch, with axis labels, grid and legend
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training loss')
ax.plot(history.history['val_loss'], label='Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.grid()
ax.legend()
# Rules of thumb for reading the curves:
# - Validation loss staying above training loss without improving: underfitting.
# - Validation loss decreasing and then increasing again: overfitting.
# - Both losses decreasing and stabilizing at a specific point: a good fit.

In [None]:
# # Plot the model
# from keras.utils import plot_model

# # Display the layers, number of layers, number of nodes etc
# plot_model(vae, to_file='vae.png', show_shapes=True, show_layer_names=True)

# # Load the image and display it
# img = plt.imread('vae.png')
# plt.figure(figsize=(16, 12))
# plt.imshow(img)
# plt.axis('off')
# plt.show()
