In [29]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, Lambda, LSTM, RepeatVector, TimeDistributed, Flatten, Reshape
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the pre-processed Aruba_17 event log from disk.
processed_data = pd.read_csv('Processed Data/Aruba_17/processed_data.csv')

# Largest row count that is a whole multiple of 32.
# NOTE(review): max_length is computed but never used in this notebook; the
# frame is truncated to a fixed 3200 rows instead. Confirm which limit is
# actually intended.
max_length = (len(processed_data) // 32) * 32
processed_data = processed_data.head(3200)

# Pull every feature column out of the frame as a plain array.
(
    timestamp,
    device_id,
    status,
    activity,
    activity_status,
) = (
    processed_data[column].values
    for column in ('Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status')
)

In [30]:
# Assemble the five feature arrays into one matrix with one column per feature.
X = np.stack([timestamp, device_id, status, activity, activity_status], axis=-1)

# Step 1: z-score every column (zero mean, unit variance).
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Step 2: rescale every column into the [0, 1] range.
# Both fitted scalers are kept: undoing the normalisation later requires
# min_max_scaler.inverse_transform first, then scaler.inverse_transform.
min_max_scaler = MinMaxScaler().fit(X)
X = min_max_scaler.transform(X)

# Optional: dump the normalised matrix to disk for inspection.
# np.savetxt('Processed Data/Aruba_17/normalized_data.csv', X, delimiter=',')

In [31]:
# Turn the normalised feature matrix X into fixed-length sequences for the VAE.

def _zero_pad_rows(array, multiple):
    """Append all-zero rows so that array.shape[0] becomes a multiple of `multiple`.

    The array is returned unchanged when its length already divides evenly.
    """
    remainder = array.shape[0] % multiple
    if remainder > 0:
        filler = np.zeros((multiple - remainder,) + array.shape[1:])
        array = np.concatenate([array, filler])
    return array


# Label every row with one of 14 KMeans clusters.
kmeans = KMeans(n_clusters=14, random_state=0)
clusters = kmeans.fit_predict(X)

# Shape / split hyper-parameters.
batch_size = 32
validation_split = 0.2
timesteps = 128          # number of consecutive records per sequence
input_dim = X.shape[1]   # number of feature columns in X

# Chronological split: shuffle=False keeps the row order intact.
X_train, X_val, y_train, y_val = train_test_split(
    X, clusters, test_size=validation_split, shuffle=False
)

# The model input is declared with a fixed batch size, so each split is
# zero-padded up to a whole number of (batch_size * timesteps) rows.
rows_per_batch = batch_size * timesteps
X_train, y_train = _zero_pad_rows(X_train, rows_per_batch), _zero_pad_rows(y_train, rows_per_batch)
X_val, y_val = _zero_pad_rows(X_val, rows_per_batch), _zero_pad_rows(y_val, rows_per_batch)

# Cut the flat row streams into (n_sequences, timesteps, ...) windows.
X_train = X_train.reshape(-1, timesteps, input_dim)
y_train = y_train.reshape(-1, timesteps)

X_val = X_val.reshape(-1, timesteps, input_dim)
y_val = y_val.reshape(-1, timesteps)

# Width of the latent code and of the innermost LSTM state.
latent_dim = 2
encoding_dim = 32

# Encoder: two stacked LSTMs followed by two parallel Dense heads that
# produce the mean and log-variance of the latent distribution.
inputs = Input(
    batch_shape=(batch_size, timesteps, input_dim),  # batch size is fixed here
    name='encoder_input',
)
sequence_features = LSTM(encoding_dim * 2, return_sequences=True)(inputs)   # one vector per timestep
x = LSTM(encoding_dim, return_sequences=False)(sequence_features)           # final state only
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

def sampling(args):
    """Reparameterisation step: draw z = mean + exp(0.5 * log_var) * eps.

    Args:
        args: pair ``(z_mean, z_log_var)`` of backend tensors; the noise is
            drawn with shape ``(shape(z_mean)[0], int_shape(z_mean)[1])``.

    Returns:
        A tensor holding one random draw per row of ``z_mean``, where ``eps``
        comes from ``K.random_normal``.
    """
    mean, log_var = args
    n_rows = K.shape(mean)[0]        # dynamic batch size
    n_latent = K.int_shape(mean)[1]  # static latent width
    eps = K.random_normal(shape=(n_rows, n_latent))
    std = K.exp(0.5 * log_var)
    return mean + std * eps

# Latent code: one stochastic draw per input sequence, produced by `sampling`.
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# Decoder: expand the latent vector into a (timesteps, encoding_dim) sequence,
# run it through an LSTM and project every timestep back to input_dim features.
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(timesteps * encoding_dim, activation='relu')(latent_inputs)
x = Reshape((timesteps, encoding_dim))(x)
# No input_shape argument: in the functional API the shape is inferred from
# the incoming tensor.
x = LSTM(encoding_dim, return_sequences=True)(x)
# Sigmoid output: the reconstruction loss is binary_crossentropy with its
# default from_logits=False, which expects predictions in [0, 1], and the
# training data is min-max scaled to that same range. A linear output here
# would feed unbounded values into the cross-entropy.
x = TimeDistributed(Dense(input_dim, activation='sigmoid'))(x)

# Assemble the three models:
#   encoder : sequence -> (z_mean, z_log_var, z)
#   decoder : z        -> reconstructed sequence
#   vae     : sequence -> reconstructed sequence (decoder applied to sampled z)
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
decoder = Model(latent_inputs, x, name='decoder')
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')

# Reconstruction term: binary cross-entropy between the flattened input batch
# and the flattened reconstruction, scaled by the number of values per sequence.
# NOTE(review): binary_crossentropy is called with its default arguments, so
# `outputs` is treated as probabilities; confirm the decoder's last layer is
# bounded to [0, 1].
reconstruction_loss = binary_crossentropy(K.flatten(inputs), K.flatten(outputs)) * (timesteps * input_dim)

# KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over the latent axis.
kl_loss = -0.5 * K.sum(
    1 + z_log_var - K.square(z_mean) - K.exp(z_log_var),
    axis=-1,
)

# Total objective, averaged and attached to the model itself; compile()
# therefore only needs an optimizer.
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
# vae.summary()

num_epochs = 600

# Fit the VAE. Only X_train is passed for training because the loss was
# attached with add_loss().
# NOTE(review): y_val is passed as validation target although the model was
# compiled without a loss function; presumably it is ignored. Confirm.
history = vae.fit(
    X_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
)

# Deterministic embedding model: sequence -> z_mean (no sampling).
encoder_model = Model(inputs=inputs, outputs=z_mean)

# One latent vector per training sequence.
X_embedded = encoder_model.predict(X_train, batch_size=batch_size)

# Group the embeddings into 14 clusters with KMeans (unsupervised, despite the
# variable name) and keep the per-sequence cluster ids.
classifier = KMeans(n_clusters=14, random_state=0)
y_pred = classifier.fit(X_embedded).labels_

# Generate a fake dataset using the VAE model
# n_samples = len(processed_data)
# print(n_samples)
# noise = np.random.normal(size=(n_samples, 5 - latent_dim))
# noise = np.concatenate([noise, np.zeros((n_samples, latent_dim))], axis=-1)
# # reshape noise to have the correct shape
# noise = noise.reshape((int(noise.shape[0]/timesteps), timesteps, input_dim))
# predicted_values = vae.predict(noise, batch_size=batch_size)

# predicted_values = predicted_values.reshape((predicted_values.shape[0] * predicted_values.shape[1], predicted_values.shape[2]))

# # undo the normalization
# predicted_values = scaler.inverse_transform(predicted_values)

# # Round each of the values in the array to the nearest integer
# predicted_values = np.rint(predicted_values)

# # Assign cluster labels to each of the predicted values
# y_pred = kmeans.predict(predicted_values)


Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

In [None]:

# Generate a synthetic dataset with the trained VAE, label every generated
# sequence with a latent-space cluster, and export the result as CSV.

# Number of sequences to generate. len(X_train) is a multiple of batch_size
# (X_train was padded to a multiple of batch_size * timesteps rows), which the
# fixed-batch-size encoder used below requires.
n_samples = len(X_train)
print(processed_data.shape)

# A VAE generates new data by decoding latent codes drawn from its N(0, I)
# prior. Feeding input-shaped noise through the full encoder+decoder only
# reconstructs noise the encoder never saw during training.
noise = np.random.normal(size=(n_samples, latent_dim))
generated_sequences = decoder.predict(noise, batch_size=batch_size)  # (n_samples, timesteps, input_dim)

# Cluster assignment: the encoder expects *normalised* 3-D sequences, so embed
# the decoder output directly (before any inverse scaling) and let the KMeans
# model fitted on the training embeddings pick the nearest cluster.
sequence_labels = classifier.predict(
    encoder_model.predict(generated_sequences, batch_size=batch_size)
)
# There is one label per sequence; repeat it for each of the sequence's rows so
# y_pred lines up with the flattened rows below.
y_pred = np.repeat(sequence_labels, timesteps)

# Flatten to one row per timestep and undo the normalisation in the reverse
# order it was applied: min-max scaling first, then the z-score.
predicted_values = np.reshape(generated_sequences, (n_samples * timesteps, input_dim))
predicted_values = min_max_scaler.inverse_transform(predicted_values)
predicted_values = scaler.inverse_transform(predicted_values)

# Round each of the values in the array to the nearest integer
predicted_values = np.rint(predicted_values)

# Create a new DataFrame with the desired column names and values
predicted_data = pd.DataFrame(
    predicted_values,
    columns=['Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status'],
)
predicted_data['Cluster'] = y_pred

# Keep at most `timesteps` randomly chosen rows per timestamp. The cap uses
# min(len(group), timesteps) because sampling more rows than a group contains
# (without replacement) raises a ValueError.
predicted_data = (
    predicted_data
    .groupby('Timestamp', group_keys=False)
    .apply(lambda group: group.sample(min(len(group), timesteps)))
    .reset_index(drop=True)
)
predicted_data.to_csv('Predictions/Aruba_17_prediction.csv', index=False)


In [32]:

# Generate a synthetic dataset with the trained VAE and write it to a text file.

# Number of sequences to generate.
# NOTE(review): every sequence contributes `timesteps` rows, so the output has
# n_samples * timesteps rows, far more than the source data. Confirm this is
# the intended size.
n_samples = len(processed_data)

# A VAE generates new data by decoding latent codes drawn from its N(0, I)
# prior. Feeding input-shaped noise through the full encoder+decoder only
# reconstructs noise the encoder never saw during training.
noise = np.random.normal(size=(n_samples, latent_dim))
predicted_values = decoder.predict(noise, batch_size=batch_size)  # (n_samples, timesteps, input_dim)

# Flatten to one row per timestep and undo the normalisation in the reverse
# order it was applied: min-max scaling first, then the z-score.
predicted_values = np.reshape(predicted_values, (-1, input_dim))
predicted_values = min_max_scaler.inverse_transform(predicted_values)
predicted_values = scaler.inverse_transform(predicted_values)
# Round each of the values in the array to the nearest integer
predicted_values = np.rint(predicted_values)

# Tabular view of the generated rows with the original column names.
predicted_data = pd.DataFrame(
    predicted_values,
    columns=['Timestamp', 'Device ID', 'Status', 'Activity', 'Activity Status'],
)

# One comma-separated line per generated row, no header. Rows are taken
# straight from the array: this yields the same text as iterating the frame
# with iterrows() but avoids building a Series for every row.
with open('Predictions/Aruba_17_prediction.txt', 'w') as file:
    file.writelines(','.join(map(str, row)) + '\n' for row in predicted_values)



In [None]:
# Training vs. validation loss per epoch, drawn on the current axes.
ax = plt.gca()
ax.plot(history.history['loss'], label='Training loss')
ax.plot(history.history['val_loss'], label='Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.grid()
ax.legend()
# How to read the curves:
# - Both losses are still high or still falling when training stops: underfitting.
# - Training loss keeps falling while validation loss bottoms out and then
#   rises again: overfitting.
# - Both losses decrease and level off close to each other: a good fit.
