In [2]:
import pandas as pd
import numpy as np
from utils import make_sequences
from sklearn.preprocessing import StandardScaler
import torch
import matplotlib.pyplot as plt


In [3]:
# Load Data
# Inflow readings: the first column becomes the datetime index, parsed with
# the day-first format given below.
df_inflow = pd.read_excel(
    r'../data/raw/WDSA/Inflow_Data_4.xlsx',
    index_col=0,
    parse_dates=True,
    date_format='%d/%m/%Y %H:%M',
)

# Capture the inflow column names now, before the holiday columns are merged in.
DMAs = df_inflow.columns.tolist()

# Holiday calendar, upsampled to hourly rows by forward-filling each value.
df_holidays = (
    pd.read_csv(r'../data/raw/WDSA/holidays.csv', index_col=0, parse_dates=True)
    .resample('h')
    .ffill()
)

# Left join keeps every inflow timestamp; timestamps without a holiday row get NaN.
df_inflow = df_inflow.merge(df_holidays, how='left', left_index=True, right_index=True)

# The index is naive at this point: tag it as CET (letting pandas infer the
# ambiguous DST hour from the ordering) and then express it in UTC.
cet_index = df_inflow.index.tz_localize('CET', ambiguous="infer")
df_inflow.index = cet_index.tz_convert('UTC')

# Date-string slicing: the train/val part runs through the whole of 2022-03-05,
# the test part starts at 2022-03-06.
df_train_val = df_inflow[:'2022-03-05']
df_test = df_inflow['2022-03-6':]

In [4]:
# make splits
# Four validation windows of `split_len_days` days are cut out of the
# train/val period; the stretches before, between and after them are training data.
split_len_days = 21

val_split_1_start = pd.Timestamp('2021-05-01', tz='UTC')
val_split_1_end = val_split_1_start + pd.Timedelta(days=split_len_days)
val_split_2_start = pd.Timestamp('2021-08-01', tz='UTC')
val_split_2_end = val_split_2_start + pd.Timedelta(days=split_len_days)
val_split_3_start = pd.Timestamp('2021-11-01', tz='UTC')
val_split_3_end = val_split_3_start + pd.Timedelta(days=split_len_days)
val_split_4_start = pd.Timestamp('2022-02-01', tz='UTC')
val_split_4_end = val_split_4_start + pd.Timedelta(days=split_len_days)


val_splits = [
    (val_split_1_start, val_split_1_end),
    (val_split_2_start, val_split_2_end),
    (val_split_3_start, val_split_3_end),
    (val_split_4_start, val_split_4_end)
]


train_splits = [
    (df_train_val.index[0], val_split_1_start),
    (val_split_1_end, val_split_2_start),
    (val_split_2_end, val_split_3_start),
    (val_split_3_end, val_split_4_start),
    (val_split_4_end, df_train_val.index[-1])
]


def _rows_between(df, start, end, include_end=False):
    """Return the rows of ``df`` whose index lies in ``[start, end)``.

    Label slicing (``df[start:end]``) includes BOTH endpoints, which would put
    every split boundary row into a training frame and a validation frame at
    the same time. A boolean mask gives a half-open interval instead.

    Parameters
    ----------
    df : pandas.DataFrame with a datetime index comparable to ``start``/``end``.
    start, end : interval bounds.
    include_end : if True the interval is closed, ``[start, end]``; used for
        the last training segment so the final row of the data is kept.
    """
    below_end = df.index <= end if include_end else df.index < end
    return df[(df.index >= start) & below_end]


# Training segments are half-open, except the last one, which runs up to and
# including the final row of df_train_val.
last_train_pos = len(train_splits) - 1
train_dfs = [
    _rows_between(df_train_val, seg_start, seg_end, include_end=(pos == last_train_pos))
    for pos, (seg_start, seg_end) in enumerate(train_splits)
]
val_dfs = [_rows_between(df_train_val, seg_start, seg_end) for seg_start, seg_end in val_splits]

train_dfs_merged = pd.concat(train_dfs)
val_dfs_merged = pd.concat(val_dfs)

# Guard against leakage: no timestamp may be in both the training and validation data.
assert not train_dfs_merged.index.isin(val_dfs_merged.index).any(), \
    'train and validation splits share timestamps'

# get train and val data scaled
# The scaler is fitted on training rows only and then applied to every dataset.
scaler = StandardScaler()
scaler.fit(train_dfs_merged)

for i, train_df in enumerate(train_dfs):
    train_df = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns, index=train_df.index)
    # Fill at most 3 consecutive missing rows per gap; longer gaps keep NaNs.
    train_dfs[i] = train_df.interpolate(limit=3)

for i, val_df in enumerate(val_dfs):
    val_df = pd.DataFrame(scaler.transform(val_df), columns=val_df.columns, index=val_df.index)
    val_dfs[i] = val_df.interpolate(limit=3)

# The test frame is scaled but, unlike train/val, not interpolated.
df_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns, index=df_test.index)

# # save datasets to disk
# torch.save pickles these objects (lists of DataFrames / a DataFrame).
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# so reading these files back likely needs weights_only=False — confirm against
# the installed version.
torch.save(train_dfs, '../data/processed/train_dfs.pt')
torch.save(val_dfs, '../data/processed/val_dfs.pt')
torch.save(df_test_scaled, '../data/processed/test_df.pt')

# save scalers to disk
torch.save(scaler, '../data/processed/scaler_all.pt')

In [None]:

# see how many sequences we can make with the current splits

# Window settings shared by every make_sequences call in this cell.
sequence_kwargs = dict(
    historic_sequence_length=168,
    prediction_sequence_length=24,
    historic_features=None,
    future_features=None,
    future_one_hots=None,
    historic_one_hots=None,
    static_features=None,
    include_historic_target=True,
    return_indices=True,
)


def _sequences_for(df, dma):
    """Call make_sequences on ``df`` with column ``dma`` as the target.

    Returns the three values make_sequences yields when ``return_indices=True``
    (unpacked below as inputs, targets and indices).
    """
    return make_sequences(df, target_feature=dma, **sequence_kwargs)


def _report(banner, split_label, dma, n_sequences, df, footer):
    """Print the sequence count and the covered time range for one dataset."""
    print(banner)
    print('split: ', split_label, ' ', dma, ' len', end='')
    print(n_sequences)
    print('split start: ', df.index[0], 'split end: ', df.index[-1])
    print(footer)


val_xs = []
train_xs = []
# NOTE(review): this takes every column of the scaled frames, which includes
# the merged holiday columns as well as the inflow columns — confirm that
# counting sequences for the holiday columns is intended.
DMAs = val_dfs[0].columns.to_list()
test_xs = []
for dma in DMAs:
    train_dma_x, train_dma_y, train_dma_ind = [], torch.tensor([]), []
    # enumerate() binds `i` for this loop; previously the printed split number
    # came from a stale `i` left over from an earlier loop.
    for i, train_df in enumerate(train_dfs):
        train_x, train_y, train_ind = _sequences_for(train_df, dma)

        _report(
            '---------------train--------------------',
            i + 1, dma, len(train_x), train_df,
            '-----------------------------------',
        )

        train_dma_x.extend(train_x)
        train_dma_y = torch.cat((train_dma_y, train_y))
        train_dma_ind.extend(train_ind)

    train_xs.extend(train_dma_x)
    val_dma_x, val_dma_y, val_dma_ind = [], torch.tensor([]), []

    for i, val_df in enumerate(val_dfs):
        val_x, val_y, val_ind = _sequences_for(val_df, dma)

        _report(
            '----------------val-------------------',
            i + 1, dma, len(val_x), val_df,
            '-----------------------------------',
        )

        val_dma_x.extend(val_x)
        val_dma_y = torch.cat((val_dma_y, val_y))
        val_dma_ind.extend(val_ind)
    val_xs.extend(val_dma_x)

    # The test data is a single frame, so it is labelled 'test' rather than
    # with a split number.
    test_x, test_y, test_ind = _sequences_for(df_test_scaled, dma)
    test_xs.extend(test_x)
    _report(
        '----------------test-------------------',
        'test', dma, len(test_x), df_test_scaled,
        '----------------------------------------',
    )


len(train_xs), len(val_xs), len(test_xs)

In [None]:
# Share of all generated sequences that falls into each dataset, in percent.
n_train, n_val, n_test = len(train_xs), len(val_xs), len(test_xs)
total_length = n_train + n_val + n_test
print(
    r'percentage train: {}, percentage val: {}, percentage test: {}'.format(
        n_train / total_length * 100,
        n_val / total_length * 100,
        n_test / total_length * 100,
    )
)