In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from keras.layers import Input, Dense, Lambda, LSTM, TimeDistributed, Reshape, Bidirectional
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
import matplotlib.pyplot as plt
from oldslidingWindow import read_data, segment_data_by_day, sliding_window
from PROCESSING import pre_processed_data, model_processing_code, model_post_processing, print_mappings, reconstructed_data, undo_split
from oldCustom_layers import CustomPenaltyLayer
from datetime import datetime
import pytz


In [None]:
# Run configuration: build a date stamp, define the input/output paths, and
# run the project's pre-processing helpers to produce the model-ready CSV.

# Month+day stamp (format '%m%d') taken in the US/Eastern time zone; it is
# embedded in the prediction file names below. Despite its name,
# `current_time` holds a date, not a time of day.
eastern = pytz.timezone('US/Eastern')
current_time = datetime.now(eastern).strftime('%m%d')

# Input/output locations, relative to the notebook's working directory
RAW_DATA = '../Raw Data/Aruba_17/data'
PRE_PROCESSED = '../Processed Data/Aruba_17/pre_processed_data.csv'
FILE_PATH = '../Processed Data/Aruba_17/processed_data.csv'
PREDICTIONS = f'../Predictions/Aruba_17_prediction_{current_time}.txt'
COMPLETE_PREDICTION = f'../Predictions/Aruba_17_completed_prediction_{current_time}.txt'
VAL_INPUT = f'../Predictions/Aruba_{current_time}.txt'
# NOTE(review): both helpers come from the project's PROCESSING module. From
# the argument order they appear to read the first path and write the second
# -- confirm against their implementation.
pre_processed_data(RAW_DATA, PRE_PROCESSED)
mappings = model_processing_code(PRE_PROCESSED, FILE_PATH)
# `mappings` is indexed by the two encoder keys used below; it is reused later
# when the predictions are converted back to the original format.
print_mappings(mappings['device_id_and_status_encoder'], mappings['activity_and_status_encoder'])
print(mappings['activity_and_status_encoder'])

In [None]:

# Load the processed data, keep the first `max_days` daily segments, report a
# few activity statistics, and cut the segments into fixed-size sliding windows.
data_df = read_data(FILE_PATH)
daily_segments = segment_data_by_day(data_df)

# Use the first `max_days` days
max_days = 50
daily_segments = daily_segments[:max_days]
if len(daily_segments) == 0:
    # Fail with a clear message instead of a ZeroDivisionError further down
    raise ValueError(f"No daily segments were found in {FILE_PATH}")

# Collect every activity value across the selected days. The activity value 22
# is ignored because it corresponds to a NaN value.
nan_activity_code = 22
activities = [
    activity
    for segment in daily_segments
    for activity in segment["Activity"]
    if activity != nan_activity_code
]
print(len(activities))
print(len(set(activities)))
# Show only a preview; printing the whole list floods the cell output
print(activities[:20])
print(len(daily_segments))
# print the average amount of activities per day
print(len(activities)/len(daily_segments))

# The window size is the average number of rows per day, truncated to an integer
avg_length = int(sum(len(segment) for segment in daily_segments) / len(daily_segments))
print("The average length per day is",avg_length)
window_size = avg_length
overlap_ratio = 0.2
windows = sliding_window(daily_segments, window_size=window_size, overlap_ratio=overlap_ratio)
print(len(windows))
if len(windows) == 0:
    raise ValueError("sliding_window produced no windows; check window_size and the input data")
# Print the length of a single window (index 0 so a single-window result also works)
print("The length of each window is",len(windows[0]))

# Prepare the data: stack the windows into one array, then separate the first
# column (kept in `date_column` so it can be re-attached to generated windows)
# from the remaining feature columns.
window_labels = []
windows = np.asarray([window.to_numpy() for window in windows])
date_column = windows[:, :, :1]
windows = windows[:, :, 1:]

# Training hyper-parameters and model input dimensions
batch_size = 16
validation_split = 0.3
timesteps = window_size
input_dim = windows[0].shape[1]
print(windows.shape)
print(input_dim)

In [None]:
# Split the windows into training and validation sets, scale them, and build
# the encoder half of the VAE.

# shuffle=False keeps the windows in their original order, so the validation
# set is the trailing `validation_split` fraction of the windows.
window_train, window_val = train_test_split(windows, test_size=validation_split, shuffle=False)

print(window_train.shape)
print(window_val.shape)

# Stack the windows along the time axis to get 2-D (rows, features) arrays,
# which is the layout the scaler is applied to
X_train = np.concatenate(window_train, axis=0)
X_val = np.concatenate(window_val, axis=0)

# Normalize with MinMaxScaler: fitted on the training rows only, then the same
# fitted scaler is applied to the validation rows
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Restore the (n_windows, timesteps, input_dim) layout for the recurrent layers
X_train = X_train.reshape((-1, timesteps, input_dim))
X_val = X_val.reshape((-1, timesteps, input_dim))

# Size of the latent vector, and the base width used for the recurrent layers
latent_dim = 2
encoding_dim = 32

# ==================== ENCODER ====================
# Two stacked bidirectional LSTMs; the second one returns only its final
# output (return_sequences=False), which feeds two parallel Dense heads of
# size `latent_dim`.
inputs = Input(shape=(timesteps, input_dim), name='encoder_input')
x = Bidirectional(LSTM(encoding_dim*2, return_sequences=True))(inputs)
x = Bidirectional(LSTM(encoding_dim*2, return_sequences=False))(x)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)
# z_mean is the mean of the latent Gaussian distribution
# z_log_var is the log of its variance (not the variance itself)

def sampling(args):
    """Draw a latent sample from the mean and log-variance tensors.

    Args:
        args: A pair ``(z_mean, z_log_var)`` of backend tensors.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon``, where ``epsilon`` is
        drawn with ``K.random_normal`` and has one row per row of ``z_mean``
        and one column per latent dimension.
    """
    mean, log_var = args
    # Row count is read dynamically; the latent width is read statically
    n_rows = K.shape(mean)[0]
    n_cols = K.int_shape(mean)[1]
    noise = K.random_normal(shape=(n_rows, n_cols))
    # exp(0.5 * log-variance) is the standard deviation
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * noise

# Draw the latent sample z from (z_mean, z_log_var) through the `sampling` function
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
# ================= LATENT SPACE ==================
# The decoder gets its own Input so it can also be called on its own with
# arbitrary latent vectors
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
# ==================== DECODER ====================
# Expand the latent vector to timesteps * encoding_dim values, reshape them
# into a sequence, run a bidirectional LSTM over it, and project every timestep
# to `input_dim` values. The final Dense has no activation argument.
# NOTE(review): `input_shape` on the LSTM looks redundant here because the
# layer is called on an existing tensor -- confirm before removing.
x = Dense(timesteps * encoding_dim, activation='relu')(latent_inputs)
x = Reshape((timesteps, encoding_dim))(x)
x = Bidirectional(LSTM(encoding_dim*2, return_sequences=True, input_shape=(timesteps, encoding_dim)))(x)
x = TimeDistributed(Dense(input_dim))(x)
# LSTM layer in the decoder is used to reconstruct the original sequence

# the VAE model: the encoder outputs [z_mean, z_log_var, z]; index 2 (the
# sampled z) is fed to the decoder to produce the reconstruction
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
decoder = Model(latent_inputs, x, name='decoder')
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')
# Uncomment to print the model summaries
# encoder.summary()
# decoder.summary()
# vae.summary()

# VAE loss function with custom_penalty
# Reconstruction term: mean squared error over all elements, scaled by the
# number of values per window (timesteps * input_dim)
reconstruction_loss = K.mean(K.square(inputs - outputs))
reconstruction_loss *= timesteps * input_dim
# KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over the latent axis
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5

# Add the custom penalty to the loss function
# NOTE(review): CustomPenaltyLayer is a project layer (oldCustom_layers); what
# it penalizes and the shape it returns are not visible here -- confirm.
penalty_weight = 10.0  # Adjust the weight of the penalty term as needed
penalty_layer = CustomPenaltyLayer(scaler, input_dim, timesteps)
penalty = penalty_layer(outputs)
penalty *= penalty_weight

# The combined loss is attached with add_loss, so compile() is given only an optimizer
vae_loss = K.mean(reconstruction_loss + kl_loss + penalty)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
# vae.summary()

In [None]:
# Train the VAE. Only inputs are passed (no targets, and None as the validation
# target) because the loss was attached to the model with vae.add_loss.
num_epochs = 10
history = vae.fit(X_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_val, None))

# Use the encoder to generate embeddings for each sequence: this model maps the
# inputs to z_mean (not to a sampled z)
encoder_model = Model(inputs, z_mean)
X_embedded = encoder_model.predict(X_train, batch_size=batch_size)

In [None]:
# Plot the per-epoch training and validation loss recorded by vae.fit
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model loss')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Generate a fake dataset using the VAE model (SLIDING WINDOWS): decode random
# latent vectors, undo the scaling, re-attach the original date column, and
# save the result to PREDICTIONS.

# One generated window per original window, so `date_column` lines up with the
# generated windows when they are concatenated below
n_samples = len(windows)

# Sample from the latent space (standard normal, not seeded, so every run
# produces a different dataset)
z_samples = np.random.normal(size=(n_samples, latent_dim))

# Use the decoder to generate the output
predicted_values = decoder.predict(z_samples)

# Undo the normalization; the scaler was fitted on 2-D (rows, features) data,
# so the windows are flattened first
predicted_values = np.reshape(predicted_values, (-1, input_dim))
predicted_values = scaler.inverse_transform(predicted_values)

# Round each of the values in the array to the nearest integer
predicted_values = np.rint(predicted_values)

# Reshape the predicted_values back into the window format and put the date
# column back in front of the generated feature columns
predicted_windows = np.reshape(predicted_values, (n_samples, window_size, input_dim))
predicted_windows_with_date_time = np.concatenate((date_column, predicted_windows), axis=2)

print(len(predicted_windows_with_date_time))

# NOTE(review): reconstructed_data is a project helper (PROCESSING); it is
# given the same window_size and overlap_ratio used to create the windows,
# presumably to merge them back into one sequence -- confirm.
original_data_format = reconstructed_data(predicted_windows_with_date_time, window_size, overlap_ratio)

print(original_data_format)
np.savetxt(PREDICTIONS, original_data_format, fmt='%s', delimiter=',', header='Date,Time,Device_Status,Activity', comments='')

In [None]:
# Convert the saved predictions back to the original format.
# NOTE(review): both helpers come from the project's PROCESSING module. From
# the argument order they appear to read the first path and write the second,
# with `mappings` being the encoders returned by model_processing_code --
# confirm against their implementation.
model_post_processing(PREDICTIONS, COMPLETE_PREDICTION, mappings)
undo_split(COMPLETE_PREDICTION, VAL_INPUT)