In [1]:
import os
import random
import pandas as pd
import numpy as np
import pickle
import h5py
from csv import writer
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../B_Model_Training')
from utils.DataPreparation_v2 import walk_forward, prepare_data

### Load Data

In [2]:
hdf5_file_path = "../../1_Data/datasets.h5"

# List the datasets stored in the HDF5 container. The h5py handle is opened in
# a `with` block so that it is closed again before pandas opens the same file.
with h5py.File(hdf5_file_path, 'r') as h5:
    print(list(h5.keys()))

# NOTE: the 'Stjelja' dataset was not published along with this script. It has
# to be removed (e.g. from the `used_datasets` list in the preparation cell)
# before the notebook can be run on the published data.
dataset = pd.read_hdf(hdf5_file_path, 'Office_A')

# Preview the first two rows of the Office_A dataset.
dataset.head(2)

['Candanedo', 'Home', 'Office_A', 'Office_B', 'Simulated', 'Stjelja']


Unnamed: 0,Day,Time,CO2,Temperature,Humidity,Occupancy
0,0,00:00:00,739.73,26.3,29.73,0
1,0,00:01:00,740.08,26.31,29.74,0


### Prepare training and validation data

In [None]:
# --- Configuration ----------------------------------------------------------
used_datasets = ['Candanedo', 'Office_A', 'Office_B', 'Home', 'Stjelja']
n_days = 8                 # length of one sample, in days
n_train_days = 5           # leading days of a sample used for training; the rest is validation
window_sizes = [15, 30, 60]
samples_per_day = 1440     # rows per day; assumes one-minute resolution in every dataset
selection_fraction = 0.8   # samples are drawn from the first 80% of each dataset only
max_batch_size = 128

assert 0 < n_train_days < n_days, "n_train_days must leave at least one validation day"

# data_samples[<window size> | 'raw'][<dataset name>] -> list of samples.
# The window sizes come from `window_sizes` so the keys and the loop below
# cannot drift apart; 'raw' is added last to keep the key order 15, 30, 60, 'raw'.
data_samples = {size: {name: [] for name in used_datasets} for size in window_sizes}
data_samples['raw'] = {name: [] for name in used_datasets}

print("selecting samples...")

for dataset_name in used_datasets:
    dataset = pd.read_hdf(hdf5_file_path, dataset_name)
    print(dataset_name)
    # Number of whole days inside the selectable leading fraction of the data.
    n_selectable_days = int(len(dataset) / samples_per_day * selection_fraction)
    # Slide an n_days-long span forward one day at a time.
    for i in range(0, n_selectable_days - n_days + 1):
        start, stop = i * samples_per_day, (i + n_days) * samples_per_day
        print(i, "{}:{}".format(start, stop))
        # .iloc makes the positional (row-number) slicing explicit.
        data_samples['raw'][dataset_name].append(dataset.iloc[start:stop])

print("preparing time windows...")

# Fraction of each sample used for training (5 of 8 days = 0.625 by default).
# Deriving it keeps the split consistent if n_days or n_train_days is changed.
split_at = n_train_days / n_days

for dataset_name in used_datasets:
    for raw_sample in data_samples['raw'][dataset_name]:
        co2_values = raw_sample['CO2'].values
        occupancy_values = raw_sample['Occupancy'].values
        for window_size in window_sizes:
            x_train, y_train, x_val, y_val = prepare_data(x=co2_values,
                                                          y=occupancy_values,
                                                          window_size=window_size,
                                                          max_batch_size=max_batch_size,
                                                          splitAt=split_at)

            data_samples[window_size][dataset_name].append((x_train, y_train, x_val, y_val))

### Save and reload

In [4]:
# Persist the prepared samples (all window sizes plus the raw slices) to disk
path = '../../1_Data/'
pickle_file = path + 'data_samples_for_hyperparametertuning.pkl'

with open(pickle_file, 'wb') as out_file:
    pickle.dump(data_samples, out_file)

In [5]:
# Restore the samples from the pickle file written by this notebook.
# `path` is the data directory defined in the save cell.
pickle_file = path + 'data_samples_for_hyperparametertuning.pkl'

with open(pickle_file, 'rb') as in_file:
    data_samples = pickle.load(in_file)

In [6]:
# Top-level keys: the window sizes plus 'raw' for the unwindowed day slices
data_samples.keys()

dict_keys([15, 30, 60, 'raw'])

In [7]:
# Second-level keys: one list of samples per dataset name in `used_datasets`
data_samples[15].keys()

dict_keys(['Candanedo', 'Office_A', 'Office_B', 'Home', 'Stjelja'])