In [1]:
import numpy as np

import sys
import time
sys.path.append("./Models")
import os
os.system('')

import subprocess
import torch
from torch.utils.data import Dataset

import pickle
import pgzip
import copy

import datetime

import math
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt

In [2]:
#Default settings for experiment
arg_model = "tsrnn" #Options: 'trfbb', 'tsrnn', 'trfbf'
arg_dset = "lsm" #Datasets -- Spain: 'ree', AEP, DAYTON: 'dyt' London: 'lsm'

attr_dset_smpl_rt = 24 if arg_dset == "AEP" else (48 if arg_dset == "lsm" else 24) #Samples per day. Spain, AEP: 24, London: 48
param_dset_lookback_weeks = 5
param_dset_forecast = 48 
# param_dset_lookback_weeks = 9
# param_dset_forecast = 168 # 3.5days = 168
param_dset_train_stride = 48 #Choose a coprime value to the forecast so all reading frames are eventually considered
param_dset_test_stride = 'same' #tsrnn paper uses 1 week
param_dset_lookback = param_dset_lookback_weeks*7*attr_dset_smpl_rt - param_dset_forecast

param_trf_weather = False


In [3]:
# path='Datasets/LondonSmartMeter' # if under experiments folder
path = '.'
seq_len = param_dset_lookback
pred_horz = param_dset_forecast
weather = param_trf_weather
timestamps = False
weather = False


if 'lsm_dict.pkl.pgz' not in os.listdir(path):
    subprocess.check_call('python ./LondonSmartMeter_hhour.py ./LondonSmartMeter lsm_dict.pkl')
if 'londonWeather.pkl.pgz' not in os.listdir(path):   
    raise ValueError

with pgzip.open(os.path.join(path,'lsm_dict.pkl.pgz'),'rb') as f:
    s_dict = pickle.load(f)
    


In [4]:
# print(s_dict)
print(len(s_dict[2]))
print(len(s_dict))

print(s_dict[2][1].shape)

2
5561
torch.Size([24192])


In [6]:
import pywt
from tqdm.auto import tqdm

# Get the maximum decomposition level
def print_maximal_decom_level(data):
    max_level = pywt.swt_max_level(data)
    print("Maximum decomposition level:", max_level)
    
    return max_level

# SWT functions
def data_preparation(dataset, window, lev):
    da = []
    max_level = print_maximal_decom_level(window)
    for i in tqdm(range(len(dataset)), total=len(dataset), desc="swt"):
        coeffs = pywt.swt(dataset[i], wavelet='db2', level=lev)
        da.append(coeffs);
    return da

def data_reconstruction(dataset):
    da = []
    for i in tqdm(range(len(dataset)), total= len(dataset), desc="iswt"):
#         recon = pywt.iswt(dataset[i,:,:,:].tolist(), 'db2')
        recon = pywt.iswt(dataset[i], 'db2')
#         print(np.array(recon).shape)
        da.append(recon)
#         da.append(recon[0][window-1])
    return da


# Called because iswt cannot accept tolist() dataset
def data_organization(coeffs):
    '''
    Reshape data back to (n,3,2,window_length), where there are 3 tuples of 2 values consisting of 
    coeffs array_like Coefficients list of tuples:
    [(cAn, cDn), ..., (cA2, cD2), (cA1, cD1)]
    '''
    reshape_list = []
    for i in range(len(coeffs)):
        reshape_list.append([])
        for j in range(len(coeffs[0])):
            reshape_list[i].append(tuple(coeffs[i][j]))
            
    return reshape_list

In [7]:
self_has_weather = weather
self_return_timestamps = timestamps

self_weather_dict = None
if self_has_weather:
    with pgzip.open(os.path.join(path,'londonWeather.pkl.pgz'),'rb') as f2:
        weather_dict = pickle.load(f2)

    self_weather_dict = weather_dict

#s_dict is dictionary as follows: { lclid: (start_timestamp,Tensor), ...}

#List to store the dataset indices corresponding to each household
self_household_idxs = [None]*len(s_dict)
#List to hold the split series
self_series = [None]*len(s_dict)
#These two lists will be converted to torch tensors
self_start_times = []
self_pred_starttimes = []
#These two lists stores the datetime format
self_start_times__ = []
#self.pred_starttimes__ = []

index_count = 0

# Enumerate all keys in dictionary -> lclid
for index, lclid in enumerate(s_dict.keys()):    
    start_time, s_tensor = s_dict[lclid]
    #start_time[0] = start_time[0]%400
    
    #pad_amt = (seq_len+pred_horz) - (len(s_tensor)%(seq_len+pred_horz))
    #s_tensor = torch.nn.functional.pad(s_tensor,pad = (pad_amt,0), value = torch.nan)
    
    #Split into chunks of seq_len+pred_horz length each
    s_tensors = s_tensor.split(seq_len+pred_horz)

    #Compute start timestamps for splits
    start_times = [None]*len(s_tensors)
    pred_starttimes = [None]*len(s_tensors)

    for i in range(len(start_times)):
        minutes_delta = 30*i*(seq_len + pred_horz)
        time_delta = datetime.timedelta(minutes=minutes_delta)
        new_start_time = start_time + time_delta
        start_times[i] = new_start_time

    # Reinitialised for computation of prediction start time
    minutes_delta = 30*seq_len
    time_delta = datetime.timedelta(minutes=minutes_delta)

    #Compute prediction start timestamps
    for i in range(len(pred_starttimes)):
        new_predtime = start_times[i] + time_delta
        pred_starttimes[i] = [new_predtime.year,
                              new_predtime.month,
                              new_predtime.day,
                              new_predtime.hour,
                              new_predtime.minute,
                              new_predtime.second]

    ''' 
    A shallow copy, li2, is created using copy.copy(), 
    preserving the top-level structure but sharing references to the inner lists. 
    A deep copy, li3, is created using copy.deepcopy(), resulting in a completely 
    independent copy of li1, including all nested elements
    '''
    
    start_times__ = copy.deepcopy(start_times)
    for i in range(len(start_times)):
        new_start_time = start_times[i]
        start_times[i] = [new_start_time.year,
                          new_start_time.month,
                          new_start_time.day,
                          new_start_time.hour,
                          new_start_time.minute,
                          new_start_time.second]

    #Remove last if length less than the others
    if s_tensors[-1].shape[0] < seq_len+pred_horz:
        s_tensors = s_tensors[:-1]
        start_times = start_times[:-1]
        pred_starttimes = pred_starttimes[:-1]
        start_times__ = start_times__[:-1]

    if len(s_tensors) == 0:
        self_series[index] = torch.empty(0)                    
    else:
        '''
        torch.stack()
        Concatenates a sequence of tensors along a new dimension.
        All tensors need to be of the same size.
        '''
        s_tensors = torch.stack(s_tensors)
        # Remove invalid (more than 4/5 (80%) of series is 0 or nan)
        s_tensors[s_tensors==0] = torch.nan # 0s are invalid too, replace with nan
        # Count those tensor index with less than 80% invalid
        sel = (( (s_tensors==0) | s_tensors.isnan()).sum(dim=-1) < (4*(seq_len+pred_horz)//5))
        s_tensors = s_tensors[sel]
        
        # Get start, prediction times for those with less than 80% nan
        start_times_ = [start_times[i] for i in range(len(start_times)) if sel[i]]
        pred_starttimes_ = [pred_starttimes[i] for i in range(len(pred_starttimes)) if sel[i]]
        _start_times__ = [ start_times__[i] for i in range(len(start_times__)) if sel[i]]
        
        self_series[index] = s_tensors
        if index==0:
            print("before series:", self_series[0].shape)
        self_start_times = self_start_times + start_times_
        if index==0:
            print("start_times_ after: ", start_times_)
            print("start_times after: ", start_times)
        self_pred_starttimes = self_pred_starttimes + pred_starttimes_
        self_start_times__ = self_start_times__ + _start_times__

    if len(s_tensors) == 0:
        self_household_idxs[index] = []
    else:
        self_household_idxs[index] = list(range(index_count,index_count+len(s_tensors)))
        index_count = index_count + len(s_tensors)
        
print("before concat series:", len(self_series))
print("before concat series:", self_series[0].shape)
# Concat all households series but retain 1680 seq_len predictions
self_series = torch.cat(self_series,dim=0).unsqueeze(-1)
print("before normalization series:", self_series.shape)

self_start_times = torch.tensor(self_start_times,dtype = torch.long)
self_pred_starttimes = torch.tensor(self_pred_starttimes,dtype = torch.long)

#self.series[:,:seq_len] = self.series[:,:seq_len].nan_to_num(nan=0.,posinf=0.,neginf=0.)

#Series normalization
smin = self_series.nan_to_num(nan=torch.finfo(self_series.dtype).max).amin(dim=-2,keepdim=True)
smax = self_series.nan_to_num(nan=torch.finfo(self_series.dtype).min).amax(dim=-2,keepdim=True)
# Normalize over dimension -2, which is seq_len + prediction len
self_series = (self_series - smin.broadcast_to(self_series.shape))/(smax-smin+1e-10).broadcast_to(self_series.shape)

print("total series shape: ", self_series.shape)

#SWT Transformation
## INPUT HERE

if self_has_weather:
    #Weather series normalization
    #Only dimensions 0, 3, 4, 5, 6, 7 needs normalization
    wdmin = self_weather_dict['tensor'].nan_to_num(nan=torch.finfo(self_weather_dict['tensor'].dtype).max).amin(dim=-2,keepdim=True)
    wdmax = self_weather_dict['tensor'].nan_to_num(nan=torch.finfo(self_weather_dict['tensor'].dtype).min).amax(dim=-2,keepdim=True)
    self_weather_dict['tensor'] = (self_weather_dict['tensor'] - wdmin.broadcast_to(self_weather_dict['tensor'].shape))\
        /(wdmax-wdmin + 1e-10).broadcast_to(self_weather_dict['tensor'].shape)

    self_weather_dict['tensor'] = self_weather_dict['tensor'].type(torch.float32)

self_length = len(self_series)
self_seq_len = seq_len
self_pred_horz = pred_horz

before series: torch.Size([14, 1680])
start_times_ after:  [[2012, 10, 12, 0, 30, 0], [2012, 11, 16, 0, 30, 0], [2012, 12, 21, 0, 30, 0], [2013, 1, 25, 0, 30, 0], [2013, 3, 1, 0, 30, 0], [2013, 4, 5, 0, 30, 0], [2013, 5, 10, 0, 30, 0], [2013, 6, 14, 0, 30, 0], [2013, 7, 19, 0, 30, 0], [2013, 8, 23, 0, 30, 0], [2013, 9, 27, 0, 30, 0], [2013, 11, 1, 0, 30, 0], [2013, 12, 6, 0, 30, 0], [2014, 1, 10, 0, 30, 0]]
start_times after:  [[2012, 10, 12, 0, 30, 0], [2012, 11, 16, 0, 30, 0], [2012, 12, 21, 0, 30, 0], [2013, 1, 25, 0, 30, 0], [2013, 3, 1, 0, 30, 0], [2013, 4, 5, 0, 30, 0], [2013, 5, 10, 0, 30, 0], [2013, 6, 14, 0, 30, 0], [2013, 7, 19, 0, 30, 0], [2013, 8, 23, 0, 30, 0], [2013, 9, 27, 0, 30, 0], [2013, 11, 1, 0, 30, 0], [2013, 12, 6, 0, 30, 0], [2014, 1, 10, 0, 30, 0]]
before concat series: 5561
before concat series: torch.Size([14, 1680])
before normalization series: torch.Size([97143, 1680, 1])
total series shape:  torch.Size([97143, 1680, 1])


In [8]:
#SWT Transformation
self_series = self_series.squeeze(-1)
print("total series shape after squeeze: ", self_series.shape)
self_series_numpy = self_series.numpy()
print("torch shape: ", self_series.shape)
print("numpy shape: ", self_series_numpy.shape)
print(self_series[0])
print(self_series_numpy[0])
lev = 3
da = data_preparation(self_series_numpy, self_series_numpy.shape[1], lev)
print(da[0][0])

Vv = np.array(da)
print(Vv.shape)
print(Vv[0][0])

vv = Vv.reshape(Vv.shape[0],2*lev*Vv.shape[3])
print(vv.shape)


# dataset = scaler.fit_transform(vv)

dat = vv.reshape(Vv.shape[0],2*lev,Vv.shape[3])
print(dat.shape)

total series shape after squeeze:  torch.Size([97143, 1680])
torch shape:  torch.Size([97143, 1680])
numpy shape:  (97143, 1680)
tensor([   nan,    nan,    nan,  ..., 0.0807, 0.1022, 0.0879])
[       nan        nan        nan ... 0.08074533 0.10224557 0.08791208]
Maximum decomposition level: 4


swt:   0%|          | 0/97143 [00:00<?, ?it/s]

(array([nan, nan, nan, ..., nan, nan, nan], dtype=float32), array([nan, nan, nan, ..., nan, nan, nan], dtype=float32))
(97143, 3, 2, 1680)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(97143, 10080)
(97143, 6, 1680)


In [9]:
dat = dat[1000:1200]

print(dat.shape)
# alpha=0.6667

alpha = 0.6
beta = 0.8


trainX,trainY=dat[:int(dat.shape[0]*alpha),:,:param_dset_lookback],dat[:int(dat.shape[0]*alpha),:,param_dset_lookback:]
valX,valY=dat[int(dat.shape[0]*alpha):int(dat.shape[0]*beta),:,:param_dset_lookback],dat[int(dat.shape[0]*alpha):int(dat.shape[0]*beta),:,param_dset_lookback:]
testX,testY=dat[int(dat.shape[0]*beta):,:,:param_dset_lookback],dat[int(dat.shape[0]*beta):,:,param_dset_lookback:]

# trainY = trainY.reshape([trainY.shape[0], trainY.shape[1]*trainY.shape[2]])
# valY = valY.reshape([valY.shape[0], valY.shape[1]*valY.shape[2]])
# testY = testY.reshape([testY.shape[0], testY.shape[1]*testY.shape[2]])

print(trainX.shape)
print(valX.shape)
print(testX.shape)
print(trainY.shape)

trainX=np.transpose(trainX, (0, 2, 1))
valX =np.transpose(valX, (0, 2, 1))
testX=np.transpose(testX, (0, 2, 1))

print(trainX.shape)
print(valX.shape)
print(testX.shape)


(200, 6, 1680)
(120, 6, 1632)
(40, 6, 1632)
(40, 6, 1632)
(120, 6, 48)
(120, 1632, 6)
(40, 1632, 6)
(40, 1632, 6)


## Model Initialization

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [11]:
class Time2Vector(nn.Module):
    def __init__(self, seq_len):
        super(Time2Vector, self).__init__()
        self.seq_len = seq_len

        self.weights_linear = nn.Parameter(torch.rand(seq_len, requires_grad=True))
        self.bias_linear = nn.Parameter(torch.rand(seq_len), requires_grad=True)
        self.weights_periodic = nn.Parameter(torch.rand(seq_len), requires_grad=True)
        self.bias_periodic = nn.Parameter(torch.rand(seq_len), requires_grad=True)
        
        # Initialize parameters with uniform distribution
        nn.init.uniform_(self.weights_linear, a=0.0, b=1.0)
        nn.init.uniform_(self.bias_linear, a=0.0, b=1.0)
        nn.init.uniform_(self.weights_periodic, a=0.0, b=1.0)
        nn.init.uniform_(self.bias_periodic, a=0.0, b=1.0)

    def forward(self, x):
        x = torch.mean(x[:, :, :], dim=-1)
        time_linear = self.weights_linear * x + self.bias_linear
        time_linear = time_linear.unsqueeze(-1)

        time_periodic = torch.sin(x * self.weights_periodic + self.bias_periodic)
        time_periodic = time_periodic.unsqueeze(-1)

        return torch.cat([time_linear, time_periodic], dim=-1)

    def extra_repr(self):
        return f'seq_len={self.seq_len}'

In [12]:
class SingleAttention(nn.Module):
    def __init__(self, d_k, d_v):
        super(SingleAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v

        self.query = nn.Linear(in_features=8, out_features=d_k)
        nn.init.xavier_uniform_(self.query.weight)
        nn.init.zeros_(self.query.bias)

        self.key = nn.Linear(in_features=8, out_features=d_k)
        nn.init.xavier_uniform_(self.key.weight)
        nn.init.zeros_(self.key.bias)

        self.value = nn.Linear(in_features=8, out_features=d_v)
        nn.init.xavier_uniform_(self.value.weight)
        nn.init.zeros_(self.value.bias)

    def forward(self, inputs):

        q = self.query(inputs[0])
        k = self.key(inputs[1])

        attn_weights = torch.matmul(q, k.transpose(-2, -1))
        attn_weights = attn_weights / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        attn_weights = F.softmax(attn_weights, dim=-1)

        v = self.value(inputs[2])
        attn_out = torch.matmul(attn_weights, v)
        return attn_out
    
class MultiAttention(nn.Module):
    def __init__(self, d_k, d_v, n_heads):
        super(MultiAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        
        self.attn_heads = nn.ModuleList([SingleAttention(d_k, d_v) for _ in range(n_heads)])
        
        self.linear = nn.Linear(d_k * n_heads, 8)
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, inputs):
        attn = [self.attn_heads[i](inputs) for i in range(self.n_heads)]
        concat_attn = torch.cat(attn, dim=-1)
        multi_linear = self.linear(concat_attn)
        return multi_linear

In [13]:
class TransformerEncoder(nn.Module):
    def __init__(self, d_k, d_v, n_heads, ff_dim, seq_len, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout

        self.attn_multi = MultiAttention(d_k, d_v, n_heads)
        self.attn_dropout = nn.Dropout(dropout)
        self.attn_normalize = nn.LayerNorm(normalized_shape=8, eps=1e-6)

#         self.ff_conv1D_1 = nn.Conv1d(in_channels=8, out_channels=self.ff_dim, kernel_size=1)
        self.ff_conv1D_1 = nn.Conv1d(in_channels=seq_len, out_channels=self.ff_dim, kernel_size=1)
#         self.ff_conv1D_2 = nn.Conv1d(in_channels=self.ff_dim, out_channels=8, kernel_size=1)
        self.ff_conv1D_2 = nn.Conv1d(in_channels=self.ff_dim, out_channels=seq_len, kernel_size=1)
        self.ff_dropout = nn.Dropout(dropout)
        self.ff_normalize = nn.LayerNorm(normalized_shape=8, eps=1e-6)

    def forward(self, inputs):
#         print(len(inputs))
#         print(inputs[0].shape)
        attn_layer = self.attn_multi(inputs)
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(inputs[0] + attn_layer)

        # Correction for transpose
#         ff_layer = self.ff_conv1D_1(attn_layer.transpose(1, 2))
        ff_layer = self.ff_conv1D_1(attn_layer)
        ff_layer = F.relu(ff_layer)
#         ff_layer = self.ff_conv1D_2(ff_layer).transpose(1, 2)
        ff_layer = self.ff_conv1D_2(ff_layer)
        ff_layer = self.ff_dropout(ff_layer)
        ff_layer = self.ff_normalize(inputs[0] + ff_layer)
        return ff_layer
    
class TransformerDecoder(nn.Module):
    def __init__(self, d_k, d_v, n_heads, ff_dim, seq_len, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout

        self.attn_multi = MultiAttention(d_k, d_v, n_heads)
        self.attn_dropout = nn.Dropout(dropout)
        self.attn_normalize = nn.LayerNorm(normalized_shape=8, eps=1e-6)

#         self.ff_conv1D_1 = nn.Conv1d(in_channels=8, out_channels=8, kernel_size=1)
        self.ff_conv1D_1 = nn.Conv1d(in_channels=seq_len, out_channels=seq_len, kernel_size=1)
        self.ff_dropout = nn.Dropout(dropout)
        self.ff_normalize = nn.LayerNorm(normalized_shape=8, eps=1e-6)

    def forward(self, inputs):
        attn_layer = self.attn_multi(inputs)
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(inputs[0] + attn_layer)
        
        # Transpose for pytorch implementation
#         ff_layer = self.ff_conv1D_1(attn_layer.transpose(1, 2)).transpose(1, 2)
        ff_layer = self.ff_conv1D_1(attn_layer)
        ff_layer = F.relu(ff_layer)
        ff_layer = self.ff_dropout(ff_layer)
        ff_layer = self.ff_normalize(inputs[0] + ff_layer)
        return ff_layer

In [14]:
class SWT_Transformer(nn.Module):
    def __init__(self, seq_len, inp_len, out_len, d_k, d_v, n_heads, ff_dim, param_dset_forecast):
        super(SWT_Transformer, self).__init__()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.time_embedding = Time2Vector(seq_len)
        
        self.layer1 = TransformerEncoder(d_k, d_v, n_heads, ff_dim, seq_len)
        self.layer2 = TransformerEncoder(d_k, d_v, n_heads, ff_dim, seq_len)
        self.layer3 = TransformerEncoder(d_k, d_v, n_heads, ff_dim, seq_len)
        self.layer4 = TransformerDecoder(d_k, d_v, n_heads, ff_dim, seq_len)
        self.layer5 = TransformerDecoder(d_k, d_v, n_heads, ff_dim, seq_len)

        self.pooling = nn.AdaptiveAvgPool1d(1)
#         self.fc1 = nn.Linear(seq_len, 128)
#         self.fc2 = nn.Linear(128, 48)
        self.fc1 = nn.Linear(seq_len, 512)
        self.fc2 = nn.Linear(512, 6*param_dset_forecast)

    def forward(self, x):
        in_seq = x
        
        time_embedding = self.time_embedding(in_seq)
        x = torch.cat([in_seq, time_embedding], dim=-1)
        
        x = self.layer1((x, x, x))
        x = self.layer2((x, x, x))
        x = self.layer3((x, x, x))
        x = self.layer4((x, x, x))
        x = self.layer5((x, x, x))
#         print("1", x.shape)

        x = self.pooling(x).squeeze(2)
#         print("2", x.shape)
        x = F.dropout(x, p=0.1)
        x = F.relu(self.fc1(x))
#         print("3", x.shape)
        x = F.dropout(x, p=0.1)
        out = self.fc2(x)
#         print("4", out.shape)
        out = out.reshape((out.shape[0],6,out.shape[1]//6))
#         print("5", out.shape)

        return out

## Create model and data tensor

In [15]:
batch_size = 16
d_k = 256
d_v = 256
n_heads = 12
ff_dim = 256
lev=3
inp_len=2*lev
out_len=2*lev
seq_len = param_dset_lookback

# seq_len = 1
# window = 200
# look_back = 12

In [16]:
# model = create_model()
# model.summary()

# # Training data
# X_train, y_train = trainX,trainY
# ###############################################################################
# # Validation data
# X_val, y_val = testX,testY
# ###############################################################################
# # Test data
# X_test, y_test = testX_a,testY_a
# callback = tf.keras.callbacks.ModelCheckpoint('Transformer_5min.hdf5',
#                                                       monitor='val_loss',
#                                                       save_best_only=True,
#                                                       verbose=1)
# with tf.device("/gpu:0"):
#     history = model.fit(X_train, y_train,
#                             batch_size=batch_size,
# #                             epochs=50,
#                             epochs=1,
#                             validation_data=(X_val, y_val),
#                             callbacks=[callback])

# model = tf.keras.models.load_model('Transformer_5min.hdf5',
#                                            custom_objects={'Time2Vector': Time2Vector,
#                                                            'SingleAttention': SingleAttention,
#                                                            'MultiAttention': MultiAttention,
#                                                            'TransformerEncoder': TransformerEncoder,
#                                                            'TransformerDecoder': TransformerDecoder}
#                                            )

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

torch.manual_seed(42)

# Create PyTorch model
model = SWT_Transformer(seq_len, inp_len, out_len, d_k, d_v, n_heads, ff_dim, param_dset_forecast)

# Print model summary
# print(model)
def num_parameters(m):
    return sum([p.numel() for p in m.parameters()])

parameters = num_parameters(model)

# print(f"Expected number of parameters: {m * dk * dk + m * 1 * 1 * n}")
print(f"Actual number of model parameters: {parameters}")

trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad
)
print(f"Trainable model parameters: {trainable_params}" )

# total_params = 0
# for name, parameter in model.named_parameters():
#     if not parameter.requires_grad:
#         continue
#     params = parameter.numel()
#     print(f"{name}, {params}")
#     total_params+=params
# print(f"Total Trainable Params: {total_params}")
    

# Convert data to PyTorch tensors
X_train, y_train = torch.tensor(trainX), torch.tensor(trainY)
X_val, y_val = torch.tensor(valX), torch.tensor(valY)
X_test, y_test = torch.tensor(testX), torch.tensor(testY)

# Create DataLoader for training and validation data
train_dataset = TensorDataset(X_train, y_train)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer and loss function
optimizer = optim.RMSprop(model.parameters(), lr=0.001, eps=1e-07)
# optimizer = optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = nn.MSELoss()

# Training loop
num_epochs = 4  # Replace with your desired number of epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in tqdm(range(num_epochs), total= num_epochs, desc="epochs", position=0, leave=True):
    model.train()
    train_loss = 0.0
    for inputs, targets in tqdm(train_loader, total=len(train_loader), desc="train batches", position=1, leave=True):
        inputs, targets = inputs.nan_to_num().to(device), targets.nan_to_num().to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
#         print("targets: ", targets.shape)
#         print("outputs: ", outputs.shape)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
#         for p in model.parameters():
#             print(p.grad.norm())
        
        train_loss += loss.item()
        
    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in tqdm(val_loader, total=len(val_loader), desc="val batches", position=2, leave=True):
            inputs, targets = inputs.nan_to_num().to(device), targets.nan_to_num().to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()

    val_loss /= len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')

# Save the PyTorch model
torch.save(model.state_dict(), 'transformer_5min.pth')


Actual number of model parameters: 9370696
Trainable model parameters: 9370696


epochs:   0%|          | 0/4 [00:00<?, ?it/s]

train batches:   0%|          | 0/8 [00:00<?, ?it/s]

val batches:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/4, Training Loss: 0.146346, Validation Loss: 0.049197


train batches:   0%|          | 0/8 [00:00<?, ?it/s]

val batches:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 2/4, Training Loss: 0.054351, Validation Loss: 0.040148


train batches:   0%|          | 0/8 [00:00<?, ?it/s]

val batches:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 3/4, Training Loss: 0.050088, Validation Loss: 0.038004


train batches:   0%|          | 0/8 [00:00<?, ?it/s]

val batches:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 4/4, Training Loss: 0.048867, Validation Loss: 0.036981


In [17]:
# Load the PyTorch model
loaded_model = SWT_Transformer(seq_len, inp_len, out_len, d_k, d_v, n_heads, ff_dim, param_dset_forecast)
loaded_model.load_state_dict(torch.load('transformer_5min.pth'))
loaded_model.to(device)
loaded_model.eval()

# Use the whole signal (both train and validation data)
# The metrics are computed only using the validation part.
# This is needed for the signal processing
print(testX.shape)

# Testing the model on the test dataset
test_outputs = []
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in tqdm(test_loader, total=len(test_loader), desc="test batch"):
        inputs, targets = inputs.nan_to_num().to(device), targets.nan_to_num().to(device)
        outputs = loaded_model(inputs)
        test_outputs.append(outputs.cpu().numpy())  # Collect the outputs
        test_loss += criterion(outputs, targets).item()

testPredict_a = np.concatenate(test_outputs, axis=0)  # Concatenate outputs into a single numpy array
test_loss /= len(test_loader)
print(f'Test Loss: {test_loss:.4f}')

# Result from test
print(testPredict_a.shape)

testX_original = np.transpose(testX, (0, 2, 1))
print(testX_original.shape)

test_cat = np.concatenate((testX_original, testPredict_a), axis=2) 
print(test_cat.shape)

(40, 1632, 6)


test batch:   0%|          | 0/3 [00:00<?, ?it/s]

Test Loss: 0.0769
(40, 6, 48)
(40, 6, 1632)
(40, 6, 1680)


In [22]:

D = test_cat.reshape(test_cat.shape[0],test_cat.shape[1]*test_cat.shape[2])
print(D.shape)

R = D.reshape(test_cat.shape[0],lev,2,test_cat.shape[2])
print(R.shape)

R = data_organization(R)

# print(R)

re=data_reconstruction(R)
Re = np.array(re)
print(Re.shape)


Re = np.nan_to_num(Re)

Re = torch.from_numpy(Re)

(40, 10080)
(40, 3, 2, 1680)


iswt:   0%|          | 0/40 [00:00<?, ?it/s]

(40, 1680)


In [23]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt

print("numpy shape: ", self_series_numpy.shape)
# test_compare = self_series_numpy[160:200, :]
test_compare = self_series[1160:1200, :].nan_to_num()
print("Real test shape: ", test_compare.shape)

test_rmse = math.sqrt( mean_squared_error(test_compare[:,param_dset_lookback:], Re[:,param_dset_lookback:]))
# test_rmse = math.sqrt( mean_squared_error(test_compare, Re))

test_mae=mean_absolute_error(test_compare[:,param_dset_lookback:], Re[:,param_dset_lookback:])
# test_mae=mean_absolute_error(test_compare, Re)

test_smape = (2*(test_compare[:,param_dset_lookback:]-Re[:,param_dset_lookback:]).abs_() / (test_compare[:,param_dset_lookback:].abs() + Re[:,param_dset_lookback:].abs())).nanmean()

# mape=100*np.mean(np.divide(abs(test_compare[:,1632:]- Re[:,1632:]),test_compare[:,1632:]))

numpy shape:  (97143, 1680)
Real test shape:  torch.Size([40, 1680])


In [25]:
print('RMSE:  %.6f' % test_rmse)
print('MAE:  %.6f' % test_mae)
print('sMAPE:  %.6f' % test_smape)

RMSE:  0.171455
MAE:  0.106719
MAPE:  0.765961
