In [7]:
# Import the necessary libraries
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.layers import Input, Dense, Lambda, LSTM, TimeDistributed, Reshape, Bidirectional, Masking
from keras.models import Model 
from keras.utils import pad_sequences
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
import matplotlib.pyplot as plt
from sliding_window import read_data, segment_data_by_day, sliding_window
# from custom_penalty import custom_penalty
from custom_layers import CustomPenaltyLayer
FILE_PATH = 'Processed Data/Aruba_17/processed_data.csv'

data_df = read_data(FILE_PATH)
daily_segments = segment_data_by_day(data_df)
daily_segments = daily_segments[:10]
windows = sliding_window(daily_segments)

# Prepare the data
windows = np.asarray([window.to_numpy() for window in windows])

# Split the data into training and testing sets
batch_size = 32
validation_split = 0.2
timesteps = max([window.shape[0] for window in windows])
input_dim = windows[0].shape[1]

  windows = np.asarray([window.to_numpy() for window in windows])


In [8]:
# Create a new train-test split using the windows
window_train, window_val = train_test_split(list(windows), test_size=validation_split, shuffle=False)

# Normalize the data using minMaxScaler
scaler = MinMaxScaler()
window_train = [scaler.fit_transform(window) for window in window_train]
window_val = [scaler.transform(window) for window in window_val]

max_length = max([window.shape[0] for window in windows])
window_train_padded = pad_sequences(window_train, maxlen=max_length, dtype='float32', padding='post', value=-1)
window_val_padded = pad_sequences(window_val, maxlen=max_length, dtype='float32', padding='post', value=-1)

# Create tf.data.Dataset from the padded sequences
train_dataset = tf.data.Dataset.from_tensor_slices(window_train_padded).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices(window_val_padded).batch(batch_size)

latent_dim = 2
encoding_dim = 32

# ==================== ENCODER ====================
inputs = Input(shape=(max_length, input_dim), name='encoder_input')
mask = Masking(mask_value=-1.0)(inputs)  # Add masking layer
x = Bidirectional(LSTM(encoding_dim * 2, return_sequences=True))(mask)
x = Bidirectional(LSTM(encoding_dim, return_sequences=False))(x)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)
# z_mean is the mean of the latent space
# z_log_var is the variance of the latent space

def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# ================= LATENT SPACE ==================
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
# ==================== DECODER ====================
x = Dense(timesteps * encoding_dim, activation='relu')(latent_inputs)
x = Reshape((timesteps, encoding_dim))(x)
x = Bidirectional(LSTM(encoding_dim, return_sequences=True, input_shape=(timesteps, encoding_dim)))(x)
x = TimeDistributed(Dense(input_dim))(x)
# LSTM layer in the decoder is used to reconstruct the original sequence

# the VAE model
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
decoder = Model(latent_inputs, x, name='decoder')
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')

# VAE loss function with custom_penalty
# reconstruction_loss = binary_crossentropy(K.flatten(inputs), K.flatten(outputs))
reconstruction_loss = K.mean(K.square(inputs - outputs))
reconstruction_loss *= timesteps * input_dim
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5

# Add the custom penalty to the loss function
penalty_weight = 10.0  # Adjust the weight of the penalty term as needed
penalty_layer = CustomPenaltyLayer(scaler, input_dim)
penalty = penalty_layer(outputs)
penalty *= penalty_weight

vae_loss = K.mean(reconstruction_loss + kl_loss + penalty)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

num_epochs = 5
history = vae.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

# Use the encoder to generate embeddings for each sequence
encoder_model = Model(inputs, z_mean)

# Use the encoder to generate embeddings for each sequence
encoder_model = Model(inputs, z_mean)
X_embedded = encoder_model.predict(window_train, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


ValueError: Data cardinality is ambiguous:
  x sizes: 6653, 8395, 5849, 8491, 6240, 9884, 5444, 6655
Make sure all arrays contain the same number of samples.

In [None]:
# Generate a fake dataset using the VAE model
n_samples = len(windows)

# Sample from the latent space
z_samples = np.random.normal(size=(n_samples, latent_dim))

# Use the decoder to generate the output
predicted_values = decoder.predict(z_samples)
predicted_values = np.reshape(predicted_values, (n_samples, timesteps, input_dim))

# Undo the normalization
predicted_values = np.reshape(predicted_values, (-1, input_dim))
predicted_values = scaler.inverse_transform(predicted_values)

# Round each of the values in the array to the nearest integer
predicted_values = np.rint(predicted_values)

# Create the fake dataset in the original format
fake_dataset = []
for window in predicted_values.reshape((n_samples, timesteps, input_dim)):
    fake_dataset.extend(window)

# Save the fake dataset to a new file 'fake_dataset.txt'
fake_data = pd.DataFrame(fake_dataset, columns=['Date', 'Time', 'Device ID', 'Status', 'Activity', 'Activity Status'])

with open('Aruba_17_prediction.txt', 'w') as file:
    for _, row in fake_data.iterrows():
        file.write(','.join(map(str, row.values)) + '\n')

In [None]:
# Plot the training and validation loss with x and y labels, and a grid
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid()
plt.legend()
# Validation loss > training loss, underfitting
# validation loss > training loss, overfitting, if it decreases and then increases again.
# If they both decreease and stabilize at a specific point, it is an optimal fit.

# Plot the evaluation loss vs the iterations
plt.plot(history.history['loss'], label='Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid()


In [None]:
# # Plot the model
# from keras.utils import plot_model

# # Display the layers, number of layers, number of nodes etc
# plot_model(vae, to_file='vae.png', show_shapes=True, show_layer_names=True)

# # Load the image and display it
# img = plt.imread('vae.png')
# plt.figure(figsize=(16, 12))
# plt.imshow(img)
# plt.axis('off')
# plt.show()
