In [58]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, Lambda, LSTM, RepeatVector, TimeDistributed, Flatten
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
from sklearn.cluster import KMeans

# Load the preprocessed Aruba_17 event log from disk.
MAX_ROWS = 1280000  # upper bound on the number of leading rows kept for modelling

processed_data = pd.read_csv('Processed Data/Aruba_17/processed_data.csv')
# Keep only the first MAX_ROWS rows (head() returns fewer if the file is shorter).
processed_data = processed_data.head(MAX_ROWS)

# Pull each modelled column out as a 1-D NumPy array; these five arrays are
# stacked into the feature matrix in the next cell.
timestamp = processed_data['Timestamp'].values
device_id = processed_data['Device ID'].values
status = processed_data['Status'].values
activity = processed_data['Activity'].values
activity_status = processed_data['Activity Status'].values

In [56]:
# Build the feature matrix: one row per record, one column per attribute.
X = np.column_stack((timestamp, device_id, status, activity, activity_status))

# Rescale every feature with a MinMaxScaler; the fitted scaler is kept so that
# generated samples can be mapped back with inverse_transform later on.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Assign every record to one of 14 KMeans clusters.
kmeans = KMeans(n_clusters=14, random_state=0)
clusters = kmeans.fit_predict(X)

# Windowing / training hyper-parameters.
batch_size = 32
validation_split = 0.2
timesteps = 32  # number of consecutive records per sequence
input_dim = X.shape[1]  # number of features (the five stacked columns)


def _pad_rows_to_multiple(array, multiple):
    """Append all-zero rows so that array.shape[0] becomes a multiple of `multiple`.

    The array is returned unchanged when its length is already a multiple.
    """
    leftover = array.shape[0] % multiple
    if leftover == 0:
        return array
    filler = np.zeros((multiple - leftover,) + array.shape[1:])
    return np.concatenate([array, filler])


# Chronological (unshuffled) train/validation split of records and cluster labels.
X_train, X_val, y_train, y_val = train_test_split(
    X, clusters, test_size=validation_split, shuffle=False
)

# Zero-pad each split so it can be cut into whole sequences AND whole batches:
# the row count must be a multiple of batch_size * timesteps.
rows_per_batch = batch_size * timesteps
X_train = _pad_rows_to_multiple(X_train, rows_per_batch)
y_train = _pad_rows_to_multiple(y_train, rows_per_batch)
X_val = _pad_rows_to_multiple(X_val, rows_per_batch)
y_val = _pad_rows_to_multiple(y_val, rows_per_batch)

# Cut the flat record streams into (n_sequences, timesteps, ...) windows.
X_train = X_train.reshape((-1, timesteps, input_dim))
y_train = y_train.reshape((-1, timesteps))

X_val = X_val.reshape((-1, timesteps, input_dim))
y_val = y_val.reshape((-1, timesteps))

# Number of training sequences.
print(X_train.shape[0])

# Model sizes.
latent_dim = 2     # dimensionality of the latent code z
encoding_dim = 32  # units of the narrower LSTM; the wider one uses twice as many

# Encoder: two stacked LSTMs reduce a (timesteps, input_dim) sequence to one
# vector, from which the mean and log-variance of q(z|x) are predicted.
# batch_shape pins the batch dimension to batch_size for every model built on `inputs`.
inputs = Input(batch_shape=(batch_size, timesteps, input_dim), name='encoder_input')
encoder_sequence = LSTM(2 * encoding_dim, return_sequences=True)(inputs)  # keeps the time axis
encoder_summary = LSTM(encoding_dim, return_sequences=False)(encoder_sequence)  # last state only
z_mean = Dense(latent_dim, name='z_mean')(encoder_summary)
z_log_var = Dense(latent_dim, name='z_log_var')(encoder_summary)

def sampling(args):
    """Draw a latent sample with the reparameterization trick.

    Args:
        args: pair ``(z_mean, z_log_var)`` of backend tensors; the second axis
            of ``z_mean`` must have a static size.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * eps`` where ``eps`` is standard-normal
        noise with one row per batch element and one column per latent dimension.
    """
    mean, log_var = args
    # Batch size is read dynamically; latent width is read from the static shape.
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * K.random_normal(shape=noise_shape)

# Latent sample z drawn from q(z|x) via the reparameterization trick.
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

# Decoder: expand a latent vector back into a (timesteps, input_dim) sequence.
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(encoding_dim, activation='relu')(latent_inputs)
x = RepeatVector(timesteps)(x)  # feed the same vector to every timestep
x = LSTM(encoding_dim, return_sequences=True)(x)
x = LSTM(encoding_dim*2, return_sequences=True)(x)
# The reconstruction loss below is binary cross-entropy against MinMax-scaled
# inputs, so the decoder must emit values in (0, 1): use a sigmoid output
# instead of an unbounded linear one.
outputs = TimeDistributed(Dense(input_dim, activation='sigmoid'))(x)
# Flattened to (batch, timesteps * input_dim); callers reshape as needed.
outputs = Flatten()(outputs)

decoder = Model(latent_inputs, outputs, name='decoder')

# End-to-end VAE: reuse the named sampling layer `z` rather than creating a
# second, anonymous sampling Lambda over the same tensors.
vae_outputs = decoder(z)
vae = Model(inputs, vae_outputs, name='vae')

# Loss = mean binary cross-entropy over all reconstructed values
#        + KL(q(z|x) || N(0, I)) averaged over the latent dimensions.
reconstruction_loss = binary_crossentropy(K.flatten(inputs), K.flatten(vae_outputs))
reconstruction_loss = K.mean(reconstruction_loss)
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.mean(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = reconstruction_loss + kl_loss
# The loss is attached to the model itself, so compile() needs no `loss`
# argument and fit() needs no targets.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# Train the VAE. `batch_size` is deliberately NOT reassigned here: the encoder
# input was built with a fixed batch dimension equal to `batch_size`, so fit()
# and predict() must keep using that same value.
num_epochs = 100
# The model is trained purely through add_loss(), so it takes no targets;
# validation therefore passes None instead of the unrelated KMeans labels.
history = vae.fit(X_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_val, None))

# Deterministic embedding model: maps each sequence to the mean of q(z|x).
encoder_model = Model(inputs, z_mean)
print('The shape of the encoder model is: ', encoder_model.layers[0].input_shape)
X_embedded = encoder_model.predict(X_train, batch_size=batch_size)

# Cluster the sequence embeddings into 14 groups (one label per sequence).
classifier = KMeans(n_clusters=14, random_state=0)
y_pred = classifier.fit_predict(X_embedded)

# Generate a fake dataset using the VAE model
# n_samples = len(processed_data)
# print(n_samples)
# noise = np.random.normal(size=(n_samples, 5 - latent_dim))
# noise = np.concatenate([noise, np.zeros((n_samples, latent_dim))], axis=-1)
# # reshape noise to have the correct shape
# noise = noise.reshape((int(noise.shape[0]/timesteps), timesteps, input_dim))
# predicted_values = vae.predict(noise, batch_size=batch_size)

# predicted_values = predicted_values.reshape((predicted_values.shape[0] * predicted_values.shape[1], predicted_values.shape[2]))

# # undo the normalization
# predicted_values = scaler.inverse_transform(predicted_values)

# # Round each of the values in the array to the nearest integer
# predicted_values = np.rint(predicted_values)

# # Assign cluster labels to each of the predicted values
# y_pred = kmeans.predict(predicted_values)


160
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
The shape of the encoder model is:  [(32, 32, 5)]


In [None]:

# Generate a synthetic dataset with the trained VAE and label it by cluster.
# One sequence is generated per training sequence; X_train was padded to a
# whole number of batches, which the fixed-batch encoder below relies on.
n_samples = len(X_train)
print(processed_data.shape)

# Sample latent codes from the standard-normal prior and decode them. Feeding
# random noise through the encoder (vae.predict) is not how a VAE generates
# data: generation only uses the decoder.
latent_samples = np.random.normal(size=(n_samples, latent_dim))
decoded = decoder.predict(latent_samples, batch_size=batch_size)
# The decoder output is flattened; restore (n_samples, timesteps, input_dim).
generated_sequences = np.reshape(decoded, (n_samples, timesteps, input_dim))

# Assign cluster labels. The encoder expects normalised 3-D sequences, so it is
# applied BEFORE the normalisation is undone. It yields one embedding (hence
# one label) per sequence, which is then repeated for each of its timesteps.
sequence_labels = classifier.predict(encoder_model.predict(generated_sequences, batch_size=batch_size))
y_pred = np.repeat(sequence_labels, timesteps)

# Undo the normalisation on the flat (n_samples * timesteps, input_dim) rows.
predicted_values = scaler.inverse_transform(np.reshape(generated_sequences, (-1, input_dim)))

# Round each of the values in the array to the nearest integer.
predicted_values = np.rint(predicted_values)

# Create a new DataFrame with the desired column names and values.
predicted_data = pd.DataFrame.from_records(predicted_values, columns=['Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status'])
predicted_data['Cluster'] = y_pred
# Keep at most `timesteps` randomly chosen rows per distinct Timestamp;
# min() avoids the ValueError sample() raises when a group is smaller.
predicted_data = predicted_data.groupby('Timestamp', group_keys=False).apply(lambda x: x.sample(min(len(x), timesteps))).reset_index(drop=True)
predicted_data.to_csv('Predictions/Aruba_17_prediction.csv', index=False)


In [57]:

# Generate a synthetic dataset with the trained VAE.
# NOTE(review): this produces len(processed_data) *sequences*, i.e.
# n_samples * timesteps output rows — confirm that is the intended size.
n_samples = len(processed_data)

# Sample latent codes from the standard-normal prior and decode them. Feeding
# random noise through the encoder (vae.predict) is not how a VAE generates
# data: generation only uses the decoder.
latent_samples = np.random.normal(size=(n_samples, latent_dim))
decoded = decoder.predict(latent_samples, batch_size=batch_size)

# The decoder output is flattened per sequence; view it as one record per row
# so the scaler (fitted on input_dim columns) can undo the normalisation.
predicted_rows = scaler.inverse_transform(np.reshape(decoded, (-1, input_dim)))

# Round each of the values in the array to the nearest integer.
predicted_rows = np.rint(predicted_rows)

# Keep the sequence-shaped view available under the original name.
predicted_values = np.reshape(predicted_rows, (n_samples, timesteps, input_dim))

# Save the generated records, one row per timestep.
predicted_data = pd.DataFrame(predicted_rows, columns=['Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status'])
predicted_data.to_csv('Predictions/Aruba_17_prediction.csv', index=False)

