In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
import torch
import os
from utils import make_sequences

In [2]:
# load processed data
def _read_scaled_naive_csv(csv_path):
    """Read one naive-model CSV and rescale it with the loaded ``scaler``.

    The first CSV column becomes the index (parsed as dates where possible).
    Every remaining column is transformed as ``(x - mean_) / scale_`` using the
    scaler statistics with their last entry left out.
    NOTE(review): this assumes the CSV columns are ordered like the scaler's
    features minus the last one -- confirm against the preprocessing step.
    """
    frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    return (frame - scaler.mean_[:-1]) / scaler.scale_[:-1]


# NOTE(review): these .pt files hold pickled Python objects (a scaler and
# dataframes), not plain tensors, so torch.load unpickles them. Only load
# files you produced yourself, and check how the installed PyTorch version
# treats the `weights_only` default.
scaler = torch.load('../data/processed/scaler_all.pt')
train_dfs = torch.load('../data/processed/train_dfs.pt')
val_dfs = torch.load('../data/processed/val_dfs.pt')
test_df = torch.load('../data/processed/test_df.pt')

# point forecast of the naive model
df_naive = _read_scaled_naive_csv('../data/processed/naive_model_final.csv')

# probabilistic bounds of the naive model on the test period
df_naive_test_probabilistic_lower_bound = _read_scaled_naive_csv(
    '../data/processed/naive_test_probabilistic_lower_bound_final.csv'
)
df_naive_test_probabilistic_upper_bound = _read_scaled_naive_csv(
    '../data/processed/naive_test_probabilistic_upper_bound_final.csv'
)

# probabilistic bounds of the naive model on the train/validation period
df_naive_train_val_probabilistic_lower_bound = _read_scaled_naive_csv(
    '../data/processed/naive_train_val_probabilistic_lower_bound_final.csv'
)
df_naive_train_val_probabilistic_upper_bound = _read_scaled_naive_csv(
    '../data/processed/naive_train_val_probabilistic_upper_bound_final.csv'
)

# the first ten columns of the test frame are used as the DMA target columns
DMAs = test_df.columns.tolist()[:10]


In [3]:
def naive_val_indexer(indexlist, df, dma):
    """Return the naive-model values covering one sequence as a 1-D tensor.

    Args:
        indexlist: Ordered index labels of a single sequence. Only the first
            and the last label are used; they bound a label slice of ``df``.
        df: DataFrame with the naive predictions, indexed by the same kind of
            labels as ``indexlist``.
        dma: Name of the column to read.

    Returns:
        A 1-D tensor in torch's default float dtype holding ``df[dma]`` from
        the first to the last label (both inclusive). Callers stack these
        per-sequence vectors into an ``[n, window]`` tensor.
    """
    first_label, last_label = indexlist[0], indexlist[-1]
    window = df.loc[first_label:last_label, dma]
    # copy=True keeps the tensor independent of the DataFrame's memory
    values = window.to_numpy(copy=True)
    return torch.tensor(values, dtype=torch.get_default_dtype())

In [4]:
# make the one-hot encoding for the DMAs
# Each DMA column is mapped to a vector with a single 1 at the position of its
# class letter; df_one_hot has one column per DMA and one row per class.
classes = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
dict_one_hot = {}
for col in DMAs:
    # the class letter is the fifth character of the column name
    col_name = col[4:5]
    # Exact match only: a substring test would let an empty letter (column name
    # shorter than five characters) match every class and silently end up as
    # the last one, and an unknown letter would be dropped without notice.
    if col_name not in classes:
        raise ValueError(
            f'Cannot derive a DMA class from column {col!r}: '
            f'character 5 is {col_name!r}, expected one of {classes}'
        )
    one_hot = np.zeros(len(classes))
    one_hot[classes.index(col_name)] = 1
    dict_one_hot[col] = one_hot
df_one_hot = pd.DataFrame(dict_one_hot, index=classes)

In [None]:
# make sequences for each dma, dict version
name = '24h_out_all_no_weather'
os.makedirs(f'../data/sequences/{name}', exist_ok=True)

# placeholder values
num_val_seqs = []
num_train_seqs = []
num_test_seqs = []
train_xs, train_ys, train_indices = [], torch.tensor([]), []
val_xs, val_ys, val_indices =  [], torch.tensor([]), []
test_xs, test_ys, test_indices =  [], torch.tensor([]), []

# iterate over the DMAs and turn them into sequences
for i, dma in enumerate(DMAs):
    dma_name = dma[4:5]
    print(f'Processing {dma_name} ({i+1}/{len(DMAs)})')
    
    static_feature = df_one_hot[dma].to_numpy()

    # config of the sequences
    dict_config_sequences = {
        'historic_sequence_length':     168,
        'prediction_sequence_length':   24,
        'historic_features':            None,
        'future_features':              ['Holidays'],
        'future_one_hots':              ['Weekday', 'Hour'],
        'historic_one_hots':            ['Weekday', 'Hour'],
        'target_feature':               dma,
        'static_features':              static_feature,
        'include_historic_target':      True,
        'return_indices':               True,
    }

    # make placeholders
    train_dma_x, train_dma_y, train_dma_ind =  [],  torch.tensor([]), []
    num_train_s_in_loop = []

    # make sequences for each of the splits that contain the training data
    for train_df in train_dfs:
        train_x, train_y, train_ind = make_sequences(
            train_df,
            **dict_config_sequences
            )
               
        train_dma_x.extend(train_x)
        train_dma_y = torch.cat((train_dma_y, train_y))
        train_dma_ind.extend(train_ind)
        num_train_s_in_loop.append(len(train_x))
    num_train_seqs.append(num_train_s_in_loop)
    
    # save train sequences
    torch.save(train_dma_x, f'../data/sequences/{name}/train_x_{name}_{dma_name}.pt')
    torch.save(train_dma_y, f'../data/sequences/{name}/train_y_{name}_{dma_name}.pt')
    torch.save(train_dma_ind, f'../data/sequences/{name}/train_ind_{name}_{dma_name}.pt')

    train_xs.extend(train_dma_x)
    train_ys = torch.cat((train_ys, train_dma_y))
    train_indices.extend(train_dma_ind)

    # make sequences for each of the splits that contain the validation data
    val_dma_x, val_dma_y, val_dma_ind = [],  torch.tensor([]), []   
    num_val_s_in_loop = []
    for val_df in val_dfs:
        val_x, val_y, val_ind = make_sequences(
            val_df,
            **dict_config_sequences
            )

        val_dma_x.extend(val_x)
        val_dma_y = torch.cat((val_dma_y, val_y))
        val_dma_ind.extend(val_ind)
        num_val_s_in_loop.append(len(val_x))

    num_val_seqs.append(num_val_s_in_loop)
    val_dma_naive2 = torch.stack([naive_val_indexer(test_index, df_naive, dma) for test_index in val_dma_ind])
    val_dma_naive_up = torch.stack([naive_val_indexer(test_index, df_naive_train_val_probabilistic_upper_bound, dma) for test_index in val_dma_ind])
    val_dma_naive_low = torch.stack([naive_val_indexer(test_index, df_naive_train_val_probabilistic_lower_bound, dma) for test_index in val_dma_ind])

    # save val sequences
    torch.save(val_dma_x, f'../data/sequences/{name}/val_x_{name}_{dma_name}.pt')
    torch.save(val_dma_y, f'../data/sequences/{name}/val_y_{name}_{dma_name}.pt')
    torch.save(val_dma_ind, f'../data/sequences/{name}/val_ind_{name}_{dma_name}.pt')
    torch.save(val_dma_naive2, f'../data/sequences/{name}/val_naive_{name}_{dma_name}.pt')
    torch.save(val_dma_naive_up, f'../data/sequences/{name}/val_naive_up_{name}_{dma_name}.pt')
    torch.save(val_dma_naive_low, f'../data/sequences/{name}/val_naive_low_{name}_{dma_name}.pt')

    val_xs.extend(val_dma_x)
    val_ys = torch.cat((val_ys, val_dma_y))
    val_indices.extend(val_dma_ind)

    # make sequences for the test data
    test_x, test_y, test_ind = make_sequences(
        test_df,
        **dict_config_sequences
        )
    num_test_seqs.append(len(test_x))
    
    test_naive_2 = torch.stack([naive_val_indexer(test_index, df_naive, dma) for test_index in test_ind])
    test_naive_up = torch.stack([naive_val_indexer(test_index, df_naive_test_probabilistic_upper_bound, dma) for test_index in test_ind])
    test_naive_low = torch.stack([naive_val_indexer(test_index, df_naive_test_probabilistic_lower_bound, dma) for test_index in test_ind])

    # save test sequences
    torch.save(test_x, f'../data/sequences/{name}/test_x_{name}_{dma_name}.pt')
    torch.save(test_y, f'../data/sequences/{name}/test_y_{name}_{dma_name}.pt')
    torch.save(test_ind, f'../data/sequences/{name}/test_ind_{name}_{dma_name}.pt')
    torch.save(test_naive_2, f'../data/sequences/{name}/test_naive_final_{name}_{dma_name}.pt')
    torch.save(test_naive_up, f'../data/sequences/{name}/test_naive_up_final_{name}_{dma_name}.pt')
    torch.save(test_naive_low, f'../data/sequences/{name}/test_naive_low_final_{name}_{dma_name}.pt')

    test_xs.extend(test_x)
    test_ys = torch.cat((test_ys, test_y))
    test_indices.extend(test_ind)

torch.save(train_xs, f'../data/sequences/{name}/train_x_{name}_full_sequence.pt')
torch.save(train_ys, f'../data/sequences/{name}/train_y_{name}_full_sequence.pt')
torch.save(train_indices, f'../data/sequences/{name}/train_ind_{name}_full_sequence.pt')

torch.save(val_xs, f'../data/sequences/{name}/val_x_{name}_full_sequence.pt')
torch.save(val_ys, f'../data/sequences/{name}/val_y_{name}_full_sequence.pt')
torch.save(val_indices, f'../data/sequences/{name}/val_ind_{name}_full_sequence.pt')

torch.save(test_xs, f'../data/sequences/{name}/test_x_{name}_full_sequence.pt')
torch.save(test_ys, f'../data/sequences/{name}/test_y_{name}_full_sequence.pt')
torch.save(test_indices, f'../data/sequences/{name}/test_ind_{name}_full_sequence.pt')


Processing A (1/10)
Processing B (2/10)
