In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from keras.layers import Input, Dense, Lambda, LSTM, TimeDistributed, Reshape, Bidirectional
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
from keras import backend as K
import matplotlib.pyplot as plt
from oldslidingWindow import read_data, segment_data_by_day, sliding_window
from oldCustom_layers import CustomPenaltyLayer
from sklearn.ensemble import RandomForestClassifier

FILE_PATH = '../Processed Data/Aruba_17/processed_data.csv'

# Load the processed data and segment it by day (see segment_data_by_day)
data_df = read_data(FILE_PATH)
daily_segments = segment_data_by_day(data_df)
n_segments = len(daily_segments)
print(n_segments)

# The window length is the mean number of rows per daily segment, truncated to an int
total_rows = sum(len(day_segment) for day_segment in daily_segments)
avg_length = int(total_rows / n_segments)
print(avg_length)

# Cut the daily segments into windows of that length with 20% overlap
window_size = avg_length
overlap_ratio = 0.2
windows = sliding_window(daily_segments, window_size=window_size, overlap_ratio=overlap_ratio)
print(len(windows))
# Column positions of the two label fields inside a window row
# (presumably 'Activity' and 'Activity Status' — confirm against the CSV layout)
ACTIVITY_COL = 4
ACTIVITY_STATUS_COL = 5

# Prepare the data: convert every window to a NumPy array and stack them
windows = np.asarray([frame.to_numpy() for frame in windows])

# Each window is labelled with the (activity, activity status) pair of its first row
window_labels = [
    (win[0][ACTIVITY_COL], win[0][ACTIVITY_STATUS_COL])
    for win in windows
]

# Training hyper-parameters and model input dimensions
batch_size = 16
validation_split = 0.3
timesteps = window_size
input_dim = windows[0].shape[1]

220
7816
187


: 

In [None]:
# Hold out the trailing `validation_split` fraction of the windows (order is kept, no shuffling)
window_train, window_val = train_test_split(windows, test_size=validation_split, shuffle=False)


def _scale_windows(window_batch, transform):
    """Flatten windows along the time axis, apply `transform`, and restore the 3-D layout."""
    flat_rows = np.concatenate(window_batch, axis=0)
    return transform(flat_rows).reshape((-1, timesteps, input_dim))


# Min-max normalisation: the scaler is fitted on the training rows only and
# then re-used unchanged for the validation rows
scaler = MinMaxScaler()
X_train = _scale_windows(window_train, scaler.fit_transform)
X_val = _scale_windows(window_val, scaler.transform)

# Size of the latent code and of the LSTM hidden state
latent_dim = 2
encoding_dim = 32

# ==================== ENCODER ====================
inputs = Input(shape=(timesteps, input_dim), name='encoder_input')
# Two stacked bidirectional LSTMs: the first returns the full sequence, the
# second returns only its final output for each window
encoder_sequence = Bidirectional(LSTM(2 * encoding_dim, return_sequences=True))(inputs)
encoder_summary = Bidirectional(LSTM(encoding_dim, return_sequences=False))(encoder_sequence)
# Parameters of the latent Gaussian: z_mean is its mean and z_log_var its
# log-variance (the sampling step exponentiates it)
z_mean = Dense(latent_dim, name='z_mean')(encoder_summary)
z_log_var = Dense(latent_dim, name='z_log_var')(encoder_summary)

def sampling(args):
    """Reparameterization trick: draw z = mean + std * noise.

    Args:
        args: pair ``(z_mean, z_log_var)`` of tensors; the first axis is used
            as the batch size and the second as the latent dimension.

    Returns:
        A tensor of latent samples, ``z_mean + exp(0.5 * z_log_var) * epsilon``
        with ``epsilon`` drawn from ``K.random_normal``.
    """
    mean, log_var = args
    n_rows = K.shape(mean)[0]        # dynamic batch size
    n_latent = K.int_shape(mean)[1]  # static latent dimension
    noise = K.random_normal(shape=(n_rows, n_latent))
    # exp(0.5 * log_var) turns the log-variance into a standard deviation
    std = K.exp(0.5 * log_var)
    return mean + std * noise

# Sample the latent code z from (z_mean, z_log_var) inside the graph
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# ================= LATENT SPACE ==================
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')

# ==================== DECODER ====================
# Expand the latent vector to one encoding_dim-sized feature vector per timestep
decoder_seed = Dense(timesteps * encoding_dim, activation='relu')(latent_inputs)
decoder_steps = Reshape((timesteps, encoding_dim))(decoder_seed)
# The LSTM layer in the decoder is used to reconstruct the original sequence
# NOTE(review): input_shape looks redundant in a functional graph — confirm before dropping it
decoder_lstm = LSTM(encoding_dim, return_sequences=True, input_shape=(timesteps, encoding_dim))
decoder_sequence = Bidirectional(decoder_lstm)(decoder_steps)
# Project every timestep back to input_dim features
decoder_output = TimeDistributed(Dense(input_dim))(decoder_sequence)

# ==================== MODELS ====================
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
decoder = Model(latent_inputs, decoder_output, name='decoder')
# End-to-end VAE: the decoder is fed the sampled z, i.e. the third encoder output
sampled_z = encoder(inputs)[2]
outputs = decoder(sampled_z)
vae = Model(inputs, outputs, name='vae')


# VAE loss function with custom_penalty
# Reconstruction term: mean squared error over the whole batch, rescaled by
# the number of values in one window
reconstruction_loss = K.mean(K.square(inputs - outputs)) * (timesteps * input_dim)

# KL term per sample: -0.5 * sum(1 + log_var - mean^2 - exp(log_var))
kl_terms = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_terms, axis=-1) * -0.5

# Extra penalty computed on the reconstruction by CustomPenaltyLayer
penalty_weight = 10.0  # Adjust the weight of the penalty term as needed
penalty_layer = CustomPenaltyLayer(scaler, input_dim, timesteps)
penalty = penalty_layer(outputs) * penalty_weight

# The combined loss is attached to the model itself, so compile() gets no loss argument
vae_loss = K.mean(reconstruction_loss + kl_loss + penalty)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

In [None]:
num_epochs = 250
# No targets are passed: the loss was attached to the model with add_loss
history = vae.fit(
    X_train,
    validation_data=(X_val, None),
    batch_size=batch_size,
    epochs=num_epochs,
)

# Use the encoder's z_mean output as the embedding of each training sequence
encoder_model = Model(inputs=inputs, outputs=z_mean)
X_embedded = encoder_model.predict(X_train, batch_size=batch_size)

In [None]:
# Generate a fake dataset using the VAE model: one synthetic window per real window
n_samples = len(windows)

# Sample from the latent space
z_samples = np.random.normal(size=(n_samples, latent_dim))

# Use the decoder to generate the output and flatten it to one row per
# timestep, the 2-D layout the scaler works on
predicted_values = decoder.predict(z_samples)
predicted_values = np.reshape(predicted_values, (-1, input_dim))

# Undo the normalization, then round each value to the nearest integer
predicted_values = np.rint(scaler.inverse_transform(predicted_values))

# Create the fake dataset in the original format: a flat list of rows, window after window
fake_dataset = list(predicted_values)
fake_data = pd.DataFrame(fake_dataset, columns=['Date', 'Time', 'Device ID', 'Status', 'Activity', 'Activity Status'])

# Save the fake dataset as header-less, comma-separated rows.
# The rows are read straight from the frame's underlying array: this writes the
# same text as iterating with iterrows() but skips building a Series per row.
with open('../Predictions/Aruba_17_prediction_OLD.txt', 'w') as file:
    file.writelines(','.join(map(str, row)) + '\n' for row in fake_data.to_numpy())

### Validation

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# One (activity, activity status) label pair per window, stacked into an array
y = np.asarray(window_labels)

def _joint_class_ids(y_true, y_pred):
    """Collapse multi-column label rows into integer class ids shared by both arrays."""
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.ndim < 2 and pred_arr.ndim < 2:
        # Plain single-output labels: hand them to the metrics untouched
        return y_true, y_pred

    class_ids = {}

    def encode(labels):
        rows = labels.reshape(len(labels), -1).tolist()
        # Every distinct row (e.g. an (activity, status) pair) gets the next free id;
        # the dict is shared so equal rows in y_true and y_pred map to the same id
        return np.array([class_ids.setdefault(tuple(row), len(class_ids)) for row in rows], dtype=int)

    return encode(true_arr), encode(pred_arr)


def evaluate_model(y_true, y_pred):
    """Compute weighted F1, precision, recall and balanced accuracy.

    The metrics used here only accept single-output targets. When the labels
    have several columns per sample, each row is treated as one joint class
    and replaced by an integer id before scoring; 1-D labels are passed
    through unchanged.

    Args:
        y_true: ground-truth labels, shape (n_samples,) or (n_samples, n_outputs).
        y_pred: predicted labels with the same layout as ``y_true``.

    Returns:
        Tuple ``(weighted_f1, weighted_precision, weighted_recall, balanced_accuracy)``.
    """
    y_true, y_pred = _joint_class_ids(y_true, y_pred)

    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    weighted_precision = precision_score(y_true, y_pred, average='weighted')
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    return weighted_f1, weighted_precision, weighted_recall, balanced_accuracy

# X_embedded holds one row per *training* window only, while y holds one label
# per window. The VAE split used shuffle=False, so the training windows are the
# leading ones: trim the labels to match, otherwise train_test_split raises on
# inconsistent sample counts.
y_embedded = y[:len(X_embedded)]

# Split the embeddings and labels into training and testing sets
# NOTE: this rebinds X_train / X_val (previously the scaled VAE inputs) to
# embeddings; re-run the normalisation cell before re-training the VAE.
X_train, X_val, y_train, y_val = train_test_split(X_embedded, y_embedded, test_size=0.3, shuffle=False)

# Fixed random_state keeps the forest reproducible between runs
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)

weighted_f1, weighted_precision, weighted_recall, balanced_accuracy = evaluate_model(y_val, y_pred)

print("Weighted F1-score:", weighted_f1)
print("Weighted Precision:", weighted_precision)
print("Weighted Recall:", weighted_recall)
print("Balanced Accuracy:", balanced_accuracy)