In [None]:
import os
import pandas as pd
import numpy as np
import h5py
import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import sys
sys.path.append("../")
from utils.DataPreparation import prepare_data
from utils.DataPreparation import scale_data

## Read Data

In [None]:
# --- User configuration: both paths must be filled in before running the notebook ---
hdf5_file = None         # path of the input HDF5 dataset
output_file_path = None  # path of the HDF5 file that receives the prepared data

# The time series are opened lazily through dask (read-only, at most 14400 rows
# per partition); the metadata table is read with pandas.
data = dd.read_hdf(hdf5_file, key='data', chunksize=14400, mode='r')
metadata = pd.read_hdf(hdf5_file, key='metadata')

## Prepare Data Samples for Training and Validation

In [None]:
def append_to_h5(path, dataset_name, data_to_append):
    """Append rows to a dataset in an HDF5 file, creating the dataset on first use.

    :param path: path of the HDF5 file; it is opened in append mode ('a').
    :param dataset_name: name of the dataset inside the file.
    :param data_to_append: array whose first axis holds the rows to add. Its
        trailing dimensions define the fixed trailing shape of a newly created
        dataset.

    A new dataset is created chunked and with an unlimited first axis so that
    later calls can grow it. An existing dataset is resized along axis 0 and the
    new rows are written behind the rows that are already stored.
    """
    with h5py.File(path, 'a') as h5_file:
        # Explicit membership test instead of catching KeyError around the whole
        # write path, so unrelated KeyErrors are not mistaken for "dataset missing".
        if dataset_name not in h5_file:
            h5_file.create_dataset(dataset_name,
                                   data=data_to_append,
                                   chunks=True,
                                   maxshape=(None,) + data_to_append.shape[1:])
            return

        dataset = h5_file[dataset_name]
        n_existing = dataset.shape[0]
        n_total = n_existing + data_to_append.shape[0]
        # Grow only the first axis, then fill the newly added rows.
        dataset.resize(n_total, axis=0)
        dataset[n_existing:n_total] = data_to_append

In [None]:
def scale_meta_features(meta_features):
    '''
    Scale the three room metadata features with ``scale_data``.

    The input is left untouched: scaling is done on a float copy, so the
    caller's array is not modified and integer-typed input cannot truncate the
    scaled values.

    :param meta_features: feature array [_volume, infiltration, maxOccupants]
    :return: new float array with the three scaled features in the same order
    '''
    scaled = np.array(meta_features, dtype=float)  # np.array copies by default
    # NOTE(review): the (min_domain, max_domain) pairs are presumably the value
    # ranges of each feature in the dataset - confirm against the data generator.
    scaled[0] = scale_data(scaled[0], min_domain=9.6, max_domain=400)
    scaled[1] = scale_data(scaled[1], min_domain=0.000085, max_domain=0.00085)
    scaled[2] = scale_data(scaled[2], min_domain=1, max_domain=12)
    return scaled

In [None]:
validationRooms = 1000 # number of rooms used for validation
x_train = None  # not used in this cell
x_val = None    # not used in this cell
n_simulations = len(metadata['simID'].unique())
print("data from {} simulations are prepared...".format(n_simulations))

# Simulations with index below this threshold are written to the training
# datasets; the remaining `validationRooms` simulations go to validation.
n_train_simulations = n_simulations - validationRooms

# NOTE: append_to_h5 appends to existing datasets, so running this cell again
# on the same output file adds every sample a second time.
for i in range(0, n_simulations):

    # prepare timeseries
    # NOTE(review): partition i is presumably the data of simulation i (one
    # partition per simulation via the chunksize used when reading) - confirm.
    # Both columns are materialised in a single compute() so the partition is
    # read only once.
    partition = data.partitions[i][['Zone Air CO2 Concentration', 'BinaryOccupancy']].compute()
    x_part, y_part = prepare_data(partition['Zone Air CO2 Concentration'].values,
                                  partition['BinaryOccupancy'].values,
                                  window_size=30,
                                  max_batch_size=128,
                                  normalize='CO2')

    # prepare metadata
    df_meta = metadata[metadata['simID'] == i]
    meta_features = scale_meta_features(df_meta[['_volume', 'infiltration', 'maxOccupants']].values[0])
    # Repeat the per-simulation feature vector once per time-series sample.
    x_meta_part = np.tile(meta_features, (len(x_part), 1))

    print("saving to file...")
    # save to file
    split = 'train' if i < n_train_simulations else 'val'
    append_to_h5(output_file_path, 'x_{}_timeseries'.format(split), x_part.astype('float16'))
    append_to_h5(output_file_path, 'x_{}_metadata'.format(split), x_meta_part.astype('float16'))
    append_to_h5(output_file_path, 'y_{}'.format(split), y_part.astype('float16'))

    print("prepared data from {} simulations".format(i+1))
    

In [None]:
# Print each prepared dataset object (training sets first, then validation sets).
with h5py.File(output_file_path, 'r') as h5_file:
    for dataset_name in ('x_train_timeseries', 'x_train_metadata', 'y_train',
                         'x_val_timeseries', 'x_val_metadata', 'y_val'):
        print(h5_file[dataset_name])

## Clip Outliers

In [None]:
# clip values to range [0, 1] -> do not allow CO2 values above 5000 ppm

def clip(a, min_value, max_value):
    """Limit the values of ``a`` to the interval [min_value, max_value].

    :param a: array of values to limit.
    :param min_value: lower bound; smaller entries are replaced by it.
    :param max_value: upper bound; larger entries are replaced by it.
    :return: new array with every entry inside the interval. NaN entries stay NaN.

    The bounds are taken from the arguments rather than being fixed to 0 and 1,
    so the function is also correct for intervals other than [0, 1].
    """
    return np.clip(a, min_value, max_value)

def removeExtremeValues(filepath, datasetname):
    """Clip a stored dataset to the range [0, 1], rewriting it in place.

    :param filepath: path of the HDF5 file, opened in 'r+' mode.
    :param datasetname: name of the dataset inside the file to clip.

    The dataset is wrapped in a dask array (chunk size 1000000) and processed
    one axis-0 chunk at a time: each row range is printed, loaded, clipped and
    written back to the same position in the file.
    """
    with h5py.File(filepath, 'r+') as h5_file, ProgressBar():
        lazy_view = da.from_array(h5_file[datasetname], chunks=1000000)
        start = 0
        for n_rows in lazy_view.chunks[0]:
            stop = start + n_rows
            print(start, stop)
            clipped = clip(lazy_view[start:stop].compute(), 0.0, 1.0)
            h5_file[datasetname][start:stop] = clipped
            start = stop
                
# Clip the stored time-series inputs in place: validation set first, then training set.
for timeseries_name in ('x_val_timeseries', 'x_train_timeseries'):
    removeExtremeValues(output_file_path, timeseries_name)