# Exploring the Informer model
- Its components and how they fit together

In [100]:
import torch
import pandas as pd
from pandas.tseries.frequencies import to_offset

from datetime import datetime
from data.data_loader import Dataset_Custom

## Encoding of the raw timeseries
- Raw series --> a representation that has the temporal encoding embedded in it

In [83]:
# Dataset configuration. The values below are handed to Dataset_Custom
# positionally, in exactly this order.
root_path = '.'
flag = 'train'
size = [72, 48, 24] # seq_len, lbl_len, pred_len
features = 'MS' # Multivariate features IN, Single OUT
data_path = 'dbank_h.csv'
target = 'close'
scale = True
inverse = False
# 0 --> x_mark/y_mark hold raw [month, day, weekday, hour];
# 1 --> 'timeF' encoding, scaled to between -0.5 and 0.5
timenc = 0
freq = 'h'
train_data = Dataset_Custom(root_path, flag, size, features, data_path,
                                          target, scale, inverse, timenc, freq)

In [84]:
# The data is consumed by sliding a window across it; train_data[0] is one such window.
# x: a seq_len-long sequence for each input column (here OHLC + volume).
# y: lbl_len + pred_len rows -- presumably the last lbl_len rows of the x window
#    followed by the pred_len rows after it; verify against Dataset_Custom.
# x_mark / y_mark: the timestamp info that goes with x / y.
x, y, x_mark, y_mark = train_data[0]
# Display all four shapes as the cell output.
x.shape, y.shape, x_mark.shape, y_mark.shape

((72, 5), (72, 5), (72, 4), (72, 4))

In [85]:
# x_mark and y_mark are the timestamp info
# for timenc = 0, no encoding is done, so each row is: [month, day, weekday, hour]
x_mark[:3]

array([[ 3,  9,  0, 15],
       [ 3, 10,  1,  8],
       [ 3, 10,  1,  9]])

In [86]:
# setting timenc = 1 --> 'timeF' will encode time between -0.5 to 0.5
# Same arguments as train_data; only the time-encoding argument changes (timenc --> literal 1).
train_data_enc = Dataset_Custom(root_path, flag, size, features, data_path,
                                          target, scale, inverse, 1, freq)
# NOTE: x and y are rebound here; only the *_e names hold the encoded timestamps.
x, y, x_mark_e, y_mark_e = train_data_enc[0]
x_mark_e[:3]

array([[ 0.15217391, -0.5       , -0.23333333, -0.31369863],
       [-0.15217391, -0.33333333, -0.2       , -0.3109589 ],
       [-0.10869565, -0.33333333, -0.2       , -0.3109589 ]])

In [113]:
# With freq = 'h', utils.features.timefeatures produces the columns
# [hour of day, day of week, day of month, day of year], each squeezed into
# the range -0.5 .. 0.5. The helpers below redo that scaling by hand on a raw
# x_mark row, whose layout is [month, day, weekday, hour].
def hourOfDay(x):
    """Scale the hour field (x[3]): divide by 23, then shift down by 0.5."""
    return x[3] / 23.0 - 0.5


def dayOfWeek(x):
    """Scale the weekday field (x[2]): divide by 6, then shift down by 0.5."""
    return x[2] / 6 - 0.5


def dayOfMonth(x):
    """Scale the day field (x[1]): make it zero-based, divide by 30, shift down by 0.5."""
    return (x[1] - 1) / 30.0 - 0.5


def DayOfYear(x):
    """Not correct: plugs the month field (x[0]) into the day-of-year formula."""
    return (x[0] - 1) / 365 - 0.5


def encode_time(x):
    """Pack the four hand-made features of one x_mark row into a tensor."""
    return torch.tensor([hourOfDay(x), dayOfWeek(x), dayOfMonth(x), DayOfYear(x)])


# Encode the first three rows and stack them for display.
torch.vstack([encode_time(x_mark[row]) for row in range(3)])

tensor([[ 0.1522, -0.5000, -0.2333, -0.4945],
        [-0.1522, -0.3333, -0.2000, -0.4945],
        [-0.1087, -0.3333, -0.2000, -0.4945]], dtype=torch.float64)

In [114]:
# Correct way
# Day of year cannot be read from a single x_mark column, so every row is
# first turned back into a real timestamp and pandas supplies the calendar fields.
def convert_to_datetime(x):
    """Rebuild a datetime from an x_mark row laid out as [month, day, weekday, hour].

    The weekday field (x[2]) is ignored. x_mark carries no year, so 2020 is
    hard-coded.
    NOTE(review): dayofyear shifts by one after February in leap years --
    confirm 2020 matches the years present in the CSV.
    """
    return datetime(year=2020, month=x[0], day=x[1], hour=x[3])


dates = pd.to_datetime([convert_to_datetime(row) for row in x_mark])
offset = to_offset('h')  # not read anywhere else in this cell


def hourOfDay_(x):
    """Hour of the timestamp, divided by 23 and shifted down by 0.5."""
    return x.hour / 23.0 - 0.5


def dayOfWeek_(x):
    """Day of week of the timestamp, divided by 6 and shifted down by 0.5."""
    return x.dayofweek / 6.0 - 0.5


def dayOfMonth_(x):
    """Zero-based day of month, divided by 30 and shifted down by 0.5."""
    return (x.day - 1) / 30.0 - 0.5


def DayOfYearCorrect(x):
    """Zero-based day of year (taken from pandas), divided by 365 and shifted down by 0.5."""
    return (x.dayofyear - 1) / 365 - 0.5


# trying again now
def encode_time_correct(x):
    """Pack the four timestamp-derived features into a tensor."""
    return torch.tensor([hourOfDay_(x), dayOfWeek_(x), dayOfMonth_(x), DayOfYearCorrect(x)])


torch.vstack([encode_time_correct(dates[row]) for row in range(3)])

tensor([[ 0.1522, -0.5000, -0.2333, -0.3137],
        [-0.1522, -0.3333, -0.2000, -0.3110],
        [-0.1087, -0.3333, -0.2000, -0.3110]])

### DataEmbedding from models.embed
Forward (x, x_mark)

    - Return: dropout(value_embedding(x) + position_embedding(x) + temporal_embedding(x_mark))


- we explore each of the embeddings

In [117]:
# x is whatever the most recent cell bound it to; its second axis is the feature axis.
c_in = x.shape[1] # number of features in input
# dimension of the model; divisible by n_heads = 8 (512 / 8 = 64) --
# presumably the sense in which it "matches" n_heads; confirm against the attention layer
d_model = 512

#### value_embedding = models.embed.TokenEmbedding

64.0