### Loading libraries

In [1]:
import sys
import os
sys.path.insert(1, '..')
os.chdir('..')

from data_formatters.dubosson2018 import *
from dataset import TSDataset
from conf import Conf

### Code walk-through

The major parts of the code that need to be defined for each data set are:
1. config file in `.yaml` format,
2. data formatter script.

For now, you can study the `electricity.yaml` example for a look of what a config file should feel like. You can skip the hyperparam defintions and the model parameters. The main focus would be on defining the dataset parameters. 

We do not intereact with `.yaml` in a direct way but instead though `Conf` class, which handles the following:
1. defines some defaults if not specified in `.yaml`,
2. sets save paths,
3. allows for nice colored printing.

Technically, we could doo all of this in the `.yaml` file directly. However, then every time we re-run the experiment, we would have to manually modify the `.yaml` file to reset save paths and redefine some variables, which would be inconvenient.  


In [2]:
# loading the config file, setting the experiment name, and the seed for random pre-processing parts (like splitting)
cnf = Conf(conf_file_path='./conf/dubosson.yaml', seed=15, exp_name="Dubosson", log=False)

In [3]:
# lets print out the config file
print(f'\nDefault configuration parameters: \n{cnf}')


Default configuration parameters: 
[34mLR[0m[31m: [0m[35m0.001[0m
[34mEPOCHS[0m[31m: [0m[35m20[0m
[34mN_WORKERS[0m[31m: [0m[35m0[0m
[34mBATCH_SIZE[0m[31m: [0m[35m64[0m
[34mQUANTILES[0m[31m: [0m[35m[0.1, 0.5, 0.9][0m
[34mDS_NAME[0m[31m: [0m[33mdubosson2018[0m
[34mALL_PARAMS[0m[31m: [0m[35m{'ds_name': 'dubosson2018', 'data_csv_path': './raw_data/Dubosson2018_processed.csv', 'index_col': -1, 'total_time_steps': 192, 'num_encoder_steps': 168, 'max_samples': 5000, 'batch_size': 64, 'device': 'cuda', 'lr': 0.001, 'num_epochs': 20, 'n_workers': 0, 'model': 'transformer', 'loader': 'base', 'quantiles': [0.1, 0.5, 0.9], 'batch_first': True, 'early_stopping_patience': 5, 'hidden_layer_size': 160, 'stack_size': 1, 'dropout_rate': 0.1, 'max_gradient_norm': 0.01, 'num_heads': 4, 'd_model': 64, 'q': 16, 'v': 16, 'h': 4, 'N': 2, 'attention_size': 0, 'dropout': 0.1, 'pe': 'original', 'chunk_mode': 'None', 'd_input': 5, 'd_output': 3}[0m
[34mEXP_LOG_PATH[0m

Now let's move on to the data formatter. This is the part that should handle:
1. loading the data and setting types,
2. splitting the data into train / val / test sets,
3. setting scalers and encoders for numerical / categorical variables resp.

We are going to leave parts 2-3 for the future exploration. Now, let's focus on loading and settting the types for the data. 

In [4]:
# call the data fromatter directly
data_formatter = DubossonFormatter()

Finally, let's work with the `TSDataset` class. This is the main part of the code as it aligns all of our previous steps. In the end, it is the `TSDataset` that is going to call the splitters, scalers, and encoders. **Importatnly** the model is only going to interact with the data through this class. 

In [5]:
# we are going to pass our data formatter and the config file to the TSDataset class
dataset = TSDataset(cnf, data_formatter)

Getting valid sampling locations.
# available segments=6228
Extracting 5000 samples out of 6228
1000 of 5000 samples done...
2000 of 5000 samples done...
3000 of 5000 samples done...
4000 of 5000 samples done...
5000 of 5000 samples done...


In [6]:
# now let's see how we can sample minibatches from our dataset that we can then pass to the model to train on
for i in range(10):
    # 192 x ['power_usage', 'hour', 'day_of_week', 'hours_from_start', 'categorical_id']
    x = dataset[i]['inputs']
    # 24 x ['power_usage']
    y = dataset[i]['outputs']
    print(f'Example #{i}: x.shape={x.shape}, y.shape={y.shape}')

Example #0: x.shape=(192, 2), y.shape=(192, 1)
Example #1: x.shape=(192, 2), y.shape=(192, 1)
Example #2: x.shape=(192, 2), y.shape=(192, 1)
Example #3: x.shape=(192, 2), y.shape=(192, 1)
Example #4: x.shape=(192, 2), y.shape=(192, 1)
Example #5: x.shape=(192, 2), y.shape=(192, 1)
Example #6: x.shape=(192, 2), y.shape=(192, 1)
Example #7: x.shape=(192, 2), y.shape=(192, 1)
Example #8: x.shape=(192, 2), y.shape=(192, 1)
Example #9: x.shape=(192, 2), y.shape=(192, 1)


In [7]:
df = dataset.data
df[df.id == 7]

Unnamed: 0,id,segment,time,gl,segment_id
5810,7,1,2014-10-01 11:45:00,,7_1
5811,7,1,2014-10-01 11:50:00,69.12,7_1
5812,7,1,2014-10-01 11:55:00,72.72,7_1
5813,7,1,2014-10-01 12:00:00,76.68,7_1
5814,7,1,2014-10-01 12:05:00,82.44,7_1
...,...,...,...,...,...
6794,7,2,2014-10-05 03:20:00,199.80,7_2
6795,7,2,2014-10-05 03:25:00,198.00,7_2
6796,7,2,2014-10-05 03:30:00,196.20,7_2
6797,7,2,2014-10-05 03:35:00,194.40,7_2


In [8]:
### Construct list of segment indices by subject ###

L = dict()
segment = []
for i in range(len(df)):
    # get row information
    row = df.iloc[i]
    prev_row = df.iloc[0] if i == 0 else df.iloc[i-1]
    
    # check for change in subject or segment
    if row.id != prev_row.id or row.segment != prev_row.segment:
        if prev_row.id not in L:
            L[prev_row.id] = [segment]
        else:
            L[prev_row.id].append(segment)
        segment = []
    
    segment.append(i)
    
    # edge case: once at end of data, need to append final segment
    if i == len(df)-1:
        if prev_row.id not in L:
            L[prev_row.id] = [segment]
        else:
            L[prev_row.id].append(segment)

# Access subject segment indices using L[1][0] for subject 1 segment 1..., L[7][1] for subject 7 segment 2...

In [9]:
### Count number of segments ###
print("Segments per Subject")
for subject in L:
    num_segments = len(L[subject])
    print(f"Subject {subject}: {num_segments}")
    
print()
    
print("Points per Segment")
for subject in L:
    for i in range(len(L[subject])):
        total_pts = len(L[subject][i])
        print(f"Subject {subject} (Segment {i+1}): {total_pts}")

Segments per Subject
Subject 1: 1
Subject 2: 1
Subject 3: 1
Subject 4: 1
Subject 5: 1
Subject 6: 1
Subject 7: 2
Subject 8: 1

Points per Segment
Subject 1 (Segment 1): 1413
Subject 2 (Segment 1): 1056
Subject 3 (Segment 1): 183
Subject 4 (Segment 1): 969
Subject 5 (Segment 1): 909
Subject 6 (Segment 1): 1280
Subject 7 (Segment 1): 612
Subject 7 (Segment 2): 377
Subject 8 (Segment 1): 1140


In [10]:
import math

### Split Function ###
# length segments: 12 hours of observation at 5 minutes frequency
# 12 * 60 = 720 minutes
# 720 / 5 = 144 5 minute intervals = 144 points
_split_params = {
  'test_percent_subjects': 0.1,
  'test_length_segment': 144,
  'val_length_segment': 144,
  'min_drop_length': 144
}

# Get size of test set
test_set = []
number_of_subject = len(L)
test_count = math.ceil(number_of_subject * _split_params['test_percent_subjects'])

# add test set data
for i in range(test_count):
    subjects = list(L.keys())
    curr_subject = L[subjects[i]]
    for segment in curr_subject:
        for idx in segment:
            test_set.append(idx)

# remove test set from L
subjects = list(L.keys())
for i in range(test_count):
    del L[subjects[i]]

In [11]:
train_set = []
validation = []

# iterate through L
# check that segment has length >= min_drop_length + val_length_segment + test_length_segment
for subject in L:
    for segment in L[subject]:
        if len(segment) >= _split_params['min_drop_length'] + _split_params['val_length_segment'] + _split_params['test_length_segment']:
            train_set.append(segment[:(-(_split_params['val_length_segment'] + _split_params['test_length_segment']))])
            validation.append(segment[-(_split_params['val_length_segment'] + _split_params['test_length_segment']):-_split_params['test_length_segment']])
            test_set.append(segment[-_split_params['test_length_segment']:])
        else:
            train_set.append(segment)

In [12]:
# Flatten sets
def flatten(l):
    return [item for sublist in l for item in sublist]

train_set = flatten(train_set)
validation = flatten(validation)

# Add to dictionary for access
datasets = {'test': test_set, 'train': train_set, 'validation': validation}

In [13]:
print(len(datasets['test']))
print(len(datasets['train']))
print(len(datasets['validation']))

1419
4798
864
