In [1]:
import numpy as np
import pandas as pd

from data_formatters.electricity import ElectricityFormatter
from util.data import extract_cols_from_data_type, batch_sampled_data, batch_data

import data_formatters.base
# Short alias for the InputTypes enumeration; its members show up in the column definition printed below.
# NOTE(review): the alias is not referenced by any cell visible in this notebook.
InputTypes = data_formatters.base.InputTypes

In [2]:
# Seed NumPy's global RNG before anything else runs
# (presumably consumed by the sampled batching further down; confirm in util.data).
np.random.seed(42)
# Project formatter that drives splitting and supplies the experiment parameters.
data_formatter = ElectricityFormatter()
# Load the hourly electricity table; the first CSV column becomes the DataFrame index.
raw_data = pd.read_csv(
    './dataset/data/electricity/hourly_electricity.csv',
    index_col=0,
)

In [3]:
# Split the raw frame into train / validation / test sets through the formatter.
# Per the log printed below, the scalers are set from the training data during this call.
train, valid, test = data_formatter.split_data(raw_data)

Formatting train-valid-test splits.
Setting scalers with training data...


In [4]:
# Sample counts for the train and validation sets; passed as `max_samples` to batch_sampled_data below.
train_samples, valid_samples = data_formatter.get_num_samples_for_calibration()

In [5]:
# Experiment-level settings (time-step counts, column definition, input/output sizes, ...); shown in full below.
fixed_params = data_formatter.get_experiment_params()
# Default model parameters as reported by the formatter.
# NOTE(review): `params` is not used by any cell visible in this notebook.
params = data_formatter.get_default_model_params()

In [6]:
# Display the (column name, data type, input type) triple for every column.
fixed_params['column_definition']

[('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>),
 ('power_usage', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>),
 ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('day_of_week', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('categorical_id', <DataTypes.CATEGORICAL: 1>, <InputTypes.STATIC_INPUT: 3>)]

In [7]:
# Display every fixed experiment parameter (this repeats the column definition shown above).
fixed_params

{'total_time_steps': 192,
 'num_encoder_steps': 168,
 'num_epochs': 100,
 'early_stopping_patience': 5,
 'multiprocessing_workers': 5,
 'column_definition': [('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>),
  ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>),
  ('power_usage', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>),
  ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
  ('day_of_week', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
  ('hours_from_start',
   <DataTypes.REAL_VALUED: 0>,
   <InputTypes.KNOWN_INPUT: 2>),
  ('categorical_id',
   <DataTypes.CATEGORICAL: 1>,
   <InputTypes.STATIC_INPUT: 3>)],
 'input_size': 5,
 'output_size': 1,
 'category_counts': [369],
 'input_obs_loc': [0],
 'static_input_loc': [4],
 'known_regular_inputs': [1, 2, 3],
 'known_categorical_inputs': [0]}

In [16]:
# Build the training arrays from the train split.
# At most `train_samples` samples are requested; window sizes and the column layout come from fixed_params.
train_data = batch_sampled_data(
    train,
    max_samples=train_samples,
    time_steps=fixed_params['total_time_steps'],
    num_encoder_steps=fixed_params['num_encoder_steps'],
    input_size=fixed_params['input_size'],
    output_size=fixed_params['output_size'],
    column_definition=fixed_params['column_definition'],
)

Getting valid sampling locations.
Getting locations for MT_001
Getting locations for MT_002
Getting locations for MT_003
Getting locations for MT_004
Getting locations for MT_005
Getting locations for MT_006
Getting locations for MT_007
Getting locations for MT_008
Getting locations for MT_009
Getting locations for MT_010
Getting locations for MT_011
Getting locations for MT_012
Getting locations for MT_013
Getting locations for MT_014
Getting locations for MT_015
Getting locations for MT_016
Getting locations for MT_017
Getting locations for MT_018
Getting locations for MT_019
Getting locations for MT_020
Getting locations for MT_021
Getting locations for MT_022
Getting locations for MT_023
Getting locations for MT_024
Getting locations for MT_025
Getting locations for MT_026
Getting locations for MT_027
Getting locations for MT_028
Getting locations for MT_029
Getting locations for MT_030
Getting locations for MT_031
Getting locations for MT_032
Getting locations for MT_033
Getting l

In [17]:
# List the named arrays returned by the batching step.
train_data.keys()

dict_keys(['inputs', 'outputs', 'active_entries', 'time', 'identifier'])

In [44]:
# Persist each batched training array to its own .npy file.
# One loop replaces five copy-pasted np.save calls; the resulting file names are
# unchanged (process_train_<key>.npy), and the keys are listed explicitly so
# exactly the same five files are written as before.
for array_name in ('inputs', 'outputs', 'active_entries', 'time', 'identifier'):
    np.save(
        './dataset/data/electricity/process_train_{}.npy'.format(array_name),
        train_data[array_name],
    )

In [45]:
# Build the validation arrays from the validation split.
# At most `valid_samples` samples are requested; window sizes and the column layout come from fixed_params.
valid_data = batch_sampled_data(
    valid,
    max_samples=valid_samples,
    time_steps=fixed_params['total_time_steps'],
    num_encoder_steps=fixed_params['num_encoder_steps'],
    input_size=fixed_params['input_size'],
    output_size=fixed_params['output_size'],
    column_definition=fixed_params['column_definition'],
)

Getting valid sampling locations.
Getting locations for MT_001
Getting locations for MT_002
Getting locations for MT_003
Getting locations for MT_004
Getting locations for MT_005
Getting locations for MT_006
Getting locations for MT_007
Getting locations for MT_008
Getting locations for MT_009
Getting locations for MT_010
Getting locations for MT_011
Getting locations for MT_012
Getting locations for MT_013
Getting locations for MT_014
Getting locations for MT_015
Getting locations for MT_016
Getting locations for MT_017
Getting locations for MT_018
Getting locations for MT_019
Getting locations for MT_020
Getting locations for MT_021
Getting locations for MT_022
Getting locations for MT_023
Getting locations for MT_024
Getting locations for MT_025
Getting locations for MT_026
Getting locations for MT_027
Getting locations for MT_028
Getting locations for MT_029
Getting locations for MT_030
Getting locations for MT_031
Getting locations for MT_032
Getting locations for MT_033
Getting l

In [46]:
# Persist each batched validation array to its own .npy file.
# One loop replaces five copy-pasted np.save calls; the resulting file names are
# unchanged (process_valid_<key>.npy), and the keys are listed explicitly so
# exactly the same five files are written as before.
for array_name in ('inputs', 'outputs', 'active_entries', 'time', 'identifier'):
    np.save(
        './dataset/data/electricity/process_valid_{}.npy'.format(array_name),
        valid_data[array_name],
    )

In [8]:
# Build the test arrays from the test split with batch_data
# (no max_samples / input_size / output_size arguments are passed here, unlike the sampled train/valid calls).
test_data = batch_data(
    test,
    time_steps=fixed_params['total_time_steps'],
    num_encoder_steps=fixed_params['num_encoder_steps'],
    column_definition=fixed_params['column_definition'],
)

In [9]:
# List the named arrays returned by batch_data.
test_data.keys()

dict_keys(['identifier', 'time', 'outputs', 'inputs', 'active_entries'])

In [13]:
# Inspect the identifier array for the last entry along the first axis
# (the visible output shows the same id, 'MT_370', repeated on every row).
# NOTE(review): this prints a long array; consider slicing (e.g. `[-1, :5]`) to keep the output short.
test_data['identifier'][-1,:]

array([['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_370'],
       ['MT_

In [14]:
# Persist each batched test array to its own .npy file.
# One loop replaces five copy-pasted np.save calls; the resulting file names are
# unchanged (process_test_<key>.npy), and the keys are listed explicitly so
# exactly the same five files are written as before.
for array_name in ('inputs', 'outputs', 'active_entries', 'time', 'identifier'):
    np.save(
        './dataset/data/electricity/process_test_{}.npy'.format(array_name),
        test_data[array_name],
    )