In [22]:
import pytorch_forecasting
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

import pandas as pd
import numpy as np

import torch

from sklearn.preprocessing import StandardScaler

from utilities import *

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from tqdm import tqdm, notebook

% matplotlib inline
# Use seaborn's white-grid theme for the figures drawn in this notebook.
sns.set_style("whitegrid")
# Register tqdm's pandas integration (`progress_apply` & co.). `pandas` is a
# classmethod, so it is called on the class itself: instantiating
# `notebook.tqdm()` first created an empty progress bar that was rendered as a
# stray "0it [00:00, ?it/s]" line in the cell output.
notebook.tqdm.pandas()

# Fix the global random seed through Lightning so runs are repeatable.
pl.seed_everything(42)

DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


0it [00:00, ?it/s]

Global seed set to 42


42

In [23]:
import pickle

# Load the cached (training, df) pair. The context manager closes the file
# handle, which the bare `open(...)` call left open.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load "test.p" if it was produced by a trusted source.
with open("test.p", "rb") as pickled_file:
    (training, df) = pickle.load(pickled_file)


In [25]:
df

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Timestamp
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400.0,1.0,0.0,False,0.000730,17170
1,20170105_1301,2017-01-05,1301,2743.0,2747.0,2735.0,2738.0,17900.0,1.0,0.0,False,0.002920,17171
2,20170106_1301,2017-01-06,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,False,-0.001092,17172
3,20170110_1301,2017-01-10,1301,2745.0,2754.0,2735.0,2748.0,24200.0,1.0,0.0,False,-0.005100,17176
4,20170111_1301,2017-01-11,1301,2748.0,2752.0,2737.0,2745.0,9300.0,1.0,0.0,False,-0.003295,17177
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,20220224_1301,2022-02-24,1301,3195.0,3205.0,3175.0,3195.0,15200.0,1.0,0.0,False,0.036508,19047
1256,20220225_1301,2022-02-25,1301,3160.0,3185.0,3135.0,3150.0,22200.0,1.0,0.0,False,-0.004594,19048
1257,20220228_1301,2022-02-28,1301,3180.0,3265.0,3170.0,3265.0,28700.0,1.0,0.0,False,-0.007692,19051
1258,0,2022-03-01,1301,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,0.000000,19052


In [207]:
# Encoder / prediction window settings used with the real day numbers in df_.
max_encoder_length = 30
max_prediction_length = 10
min_prediction_length = 2  # For testing
min_prediction_idx = 18967

# Single-group frame in which the target equals the time index, so every value
# the dataset returns can be traced back to the timestamp it came from.
r = pd.DataFrame(
    {
        'grp': np.ones(len(df_['Timestamp'])),
        'timestamp': df_['Timestamp'],
        'values': df_['Timestamp'],
    }
)

In [257]:
# Small window settings for the hand-written toy series below.
max_encoder_length = min_encoder_length = 5
max_prediction_length, min_prediction_length = 3, 2
min_prediction_idx = 8

# Toy single-group series with gaps in the time index (3, 6, 7, 9 and 11 are
# absent); the target mirrors the timestamp so samples are easy to read.
r = pd.DataFrame(
    {
        'grp': np.ones(8),
        'timestamp': [1, 2, 4, 5, 8, 10, 12, 13],
        'values': [1, 2, 4, 5, 8, 10, 12, 13],
    }
)

In [67]:
# Gap-free toy series: ten consecutive time steps, target equal to the timestamp.
r = pd.DataFrame(
    {
        'grp': np.ones(10),
        'timestamp': list(range(1, 11)),
        'values': list(range(1, 11)),
    }
)

In [68]:
from logging import INFO
pytorch_forecasting.data.timeseries.logger.setLevel(INFO)

In [None]:
"""
Problems
    The prediction starts wherever it wants (min_prediction_idx is not correctly respected)
    Slow
    New things to compute have to be added at the end of the iteration
    Missing timestamps have to be filled in order to predict
"""

In [206]:
# Converts nanoseconds since the Unix epoch into (fractional) days since the epoch.
# Kept because other cells of the notebook call it directly.
to_day_number = lambda x: x / 10 ** 9 / (24 * 60 * 60)


def fill_missing_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Insert a row for every missing calendar step of every security.

    Each 'SecuritiesCode' group is re-indexed onto a regular date range that
    spans its first to its last 'Date'. Newly created rows receive their own
    'Timestamp' (whole days since 1970-01-01); all other columns are
    forward-filled from the last real observation.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the columns 'SecuritiesCode' and 'Date'.
        The frame itself is not modified.

    Returns
    -------
    pd.DataFrame
        The gap-free groups stacked in group-key order with a fresh
        0..n-1 index and the date column first.
    """

    def fill_na(data: pd.DataFrame, date_col='Date', ts_col='Timestamp', freq='1d'):
        """Return one security's rows re-indexed onto a regular `freq` grid."""
        # `freq` is honoured here; it used to be ignored in favour of a literal '1d'.
        full_index = pd.date_range(start=data[date_col].min(), end=data[date_col].max(), freq=freq)

        # Work on a new frame instead of calling set_index(..., inplace=True) on
        # the group object handed over by groupby.
        filled = data.set_index(date_col).reindex(full_index)

        # Whole days since the epoch, computed with timedelta arithmetic so the
        # result does not depend on the index being stored in nanoseconds.
        epoch = pd.Timestamp(0, tz=full_index.tz)
        filled[ts_col] = ((full_index - epoch) // pd.Timedelta(days=1)).astype('int64')

        # Assign the group's code as a scalar so the column keeps its original
        # dtype rather than the float dtype the NaN-padded reindex produced.
        filled['SecuritiesCode'] = data['SecuritiesCode'].iloc[0]

        # ffill: propagate the last valid observation forward into the new rows.
        filled = filled.ffill()
        return filled.rename_axis(date_col).reset_index()

    # Iterate over the groups explicitly: every group then still carries its
    # 'SecuritiesCode' column, which groupby.apply only passes to the function
    # in some pandas versions.
    frames = [fill_na(group) for _, group in df.groupby('SecuritiesCode')]
    if not frames:
        # Nothing to fill for an empty input; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(frames, ignore_index=True)


# Calendar-complete copy of the price data.
df_ = fill_missing_dates(df)
# From min_prediction_idx onwards keep only the timestamps that really exist in
# df: forward-filled rows in the prediction range are removed again.
df_ = df_.drop(
    index=df_.index[
        (df_['Timestamp'].ge(min_prediction_idx) & ~df_['Timestamp'].isin(df['Timestamp'])).to_numpy()
    ]
)
df_

Unnamed: 0,Date,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Timestamp
0,2017-01-04,20170104_1301,1301,2734.0,2755.0,2730.0,2742.0,31400.0,1.0,0.0,False,0.000730,17170
1,2017-01-05,20170105_1301,1301,2743.0,2747.0,2735.0,2738.0,17900.0,1.0,0.0,False,0.002920,17171
2,2017-01-06,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,False,-0.001092,17172
3,2017-01-07,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,False,-0.001092,17173
4,2017-01-08,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,False,-0.001092,17174
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,2022-02-24,20220224_1301,1301,3195.0,3205.0,3175.0,3195.0,15200.0,1.0,0.0,False,0.036508,19047
1878,2022-02-25,20220225_1301,1301,3160.0,3185.0,3135.0,3150.0,22200.0,1.0,0.0,False,-0.004594,19048
1881,2022-02-28,20220228_1301,1301,3180.0,3265.0,3170.0,3265.0,28700.0,1.0,0.0,False,-0.007692,19051
1882,2022-03-01,0,1301,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,0.000000,19052


In [258]:

# Dataset over the frame `r`: gaps in the time index are allowed and the target
# is left unnormalised so returned values can be compared with the raw input.
testing = TimeSeriesDataSet(
    r,
    time_idx='timestamp',
    group_ids=['grp'],
    target='values',
    target_normalizer=None,
    time_varying_known_reals=['timestamp'],
    time_varying_unknown_reals=['values'],
    min_encoder_length=min_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=min_prediction_length,
    max_prediction_length=max_prediction_length,
    min_prediction_idx=min_prediction_idx,
    allow_missing_timesteps=True,
)


In [218]:
# Daily calendar covering the whole span of df_, from its first to its last date.
start, end = df_['Date'].min(), df_['Date'].max()
idx = pd.date_range(start, end, freq='1d')


In [256]:
df_.groupby('SecuritiesCode').Timestamp.apply(min)

SecuritiesCode
1301    17170
Name: Timestamp, dtype: int64

In [266]:
# Work on a copy so df_ itself does not grow lag columns.
t = df_.copy()

# Build the per-security grouping of Close once; it is the same for every lag,
# so there is no reason to regroup the frame on each of the 30 iterations.
close_by_code = t.groupby('SecuritiesCode')['Close']

# Close_i holds the Close value i rows earlier within the same security.
# GroupBy.shift already returns an index-aligned Series, so the former
# transform('shift', i).to_frame('oui') detour is not needed.
for i in tqdm(range(1, 31)):
    t[f'Close_{i}'] = close_by_code.shift(i)

t

100%|██████████| 30/30 [00:00<00:00, 590.81it/s]


Unnamed: 0,Date,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,...,Close_21,Close_22,Close_23,Close_24,Close_25,Close_26,Close_27,Close_28,Close_29,Close_30
0,2017-01-04,20170104_1301,1301,2734.0,2755.0,2730.0,2742.0,31400.0,1.0,0.0,...,,,,,,,,,,
1,2017-01-05,20170105_1301,1301,2743.0,2747.0,2735.0,2738.0,17900.0,1.0,0.0,...,,,,,,,,,,
2,2017-01-06,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,...,,,,,,,,,,
3,2017-01-07,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,...,,,,,,,,,,
4,2017-01-08,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,2022-02-24,20220224_1301,1301,3195.0,3205.0,3175.0,3195.0,15200.0,1.0,0.0,...,3145.0,3095.0,3060.0,3055.0,3115.0,3125.0,3105.0,3100.0,3110.0,3085.0
1878,2022-02-25,20220225_1301,1301,3160.0,3185.0,3135.0,3150.0,22200.0,1.0,0.0,...,3130.0,3145.0,3095.0,3060.0,3055.0,3115.0,3125.0,3105.0,3100.0,3110.0
1881,2022-02-28,20220228_1301,1301,3180.0,3265.0,3170.0,3265.0,28700.0,1.0,0.0,...,3130.0,3130.0,3145.0,3095.0,3060.0,3055.0,3115.0,3125.0,3105.0,3100.0
1882,2022-03-01,0,1301,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3110.0,3130.0,3130.0,3145.0,3095.0,3060.0,3055.0,3115.0,3125.0,3105.0


In [None]:
_group_ids = 'SecuritiesCode'
data = df_

In [None]:
# Builds a per-row index frame describing each row's position inside its group.
# NOTE(review): this cell reads a "__time_idx__" column from `data`, but the df_
# previews in this notebook do not show such a column and the cell has no
# execution count — confirm where "__time_idx__" is supposed to come from.
g = data.groupby(_group_ids, observed=True)

# First and last time index of each group; transform(...) is used so that the
# per-group value lines up with every row of the group.
df_index_first = g["__time_idx__"].transform("nth", 0).to_frame("time_first")
df_index_last = g["__time_idx__"].transform("nth", -1).to_frame("time_last")
# Distance to the following row of the same group (next - current): diff(-1)
# yields current - next and the leading minus flips the sign. The last row of a
# group has no successor (NaN); fillna(-1) followed by the negation makes it 1.
df_index_diff_to_next = -g["__time_idx__"].diff(-1).fillna(-1).astype(int).to_frame("time_diff_to_next")
df_index = pd.concat([df_index_first, df_index_last, df_index_diff_to_next], axis=1)
# Running row number 0..len-1 of each row within df_index.
df_index["index_start"] = np.arange(len(df_index))
df_index["time"] = data["__time_idx__"]
# Number of time steps between the group's first and last time index, inclusive.
df_index["count"] = (df_index["time_last"] - df_index["time_first"]).astype(int) + 1
# Integer id of the group each row belongs to.
sequence_ids = g.ngroup()
df_index["sequence_id"] = sequence_ids

In [230]:
to_day_number(pd.Series(idx)[~pd.Series(idx).isin((df_['Date']))].min().value)

18972.0

In [210]:
r

Unnamed: 0,grp,timestamp,values
0,1.0,17170,17170
1,1.0,17171,17171
2,1.0,17172,17172
3,1.0,17173,17173
4,1.0,17174,17174
...,...,...,...
1877,1.0,19047,19047
1878,1.0,19048,19048
1881,1.0,19051,19051
1882,1.0,19052,19052


In [211]:
min_prediction_idx, max_prediction_length, min_prediction_length

(18967, 10, 2)

In [246]:
r.values[-1]

array([1.0000e+00, 1.9053e+04, 1.9053e+04])

In [244]:

for X, (y, _) in testing:
    print(y.numpy(), X['x_cont'].numpy()[:, 2])



[18967. 18968. 18969. 18970. 18971. 18971. 18971. 18974. 18975. 18976.] [18937. 18938. 18939. 18940. 18941. 18942. 18943. 18944. 18945. 18946.
 18947. 18948. 18949. 18950. 18951. 18952. 18953. 18954. 18955. 18956.
 18957. 18958. 18959. 18960. 18961. 18962. 18963. 18964. 18965. 18966.
 18967. 18968. 18969. 18970. 18971. 18971. 18971. 18974. 18975. 18976.]
[18968. 18969. 18970. 18971. 18971. 18971. 18974. 18975. 18976. 18977.] [18938. 18939. 18940. 18941. 18942. 18943. 18944. 18945. 18946. 18947.
 18948. 18949. 18950. 18951. 18952. 18953. 18954. 18955. 18956. 18957.
 18958. 18959. 18960. 18961. 18962. 18963. 18964. 18965. 18966. 18967.
 18968. 18969. 18970. 18971. 18971. 18971. 18974. 18975. 18976. 18977.]
[18969. 18970. 18971. 18971. 18971. 18974. 18975. 18976. 18977. 18978.] [18939. 18940. 18941. 18942. 18943. 18944. 18945. 18946. 18947. 18948.
 18949. 18950. 18951. 18952. 18953. 18954. 18955. 18956. 18957. 18958.
 18959. 18960. 18961. 18962. 18963. 18964. 18965. 18966. 18967. 18968.
 

In [238]:
pd.Series([y[0].item() for X, (y, _) in testing]).diff()

0      NaN
1      1.0
2      1.0
3      0.0
4      0.0
      ... 
98     1.0
99     6.0
100    1.0
101    1.0
102    6.0
Length: 103, dtype: float64

In [240]:
df.Timestamp.max() - min_prediction_idx

86

In [213]:
for X, (y, _) in testing:
    print(X['encoder_target'])

tensor([18937., 18938., 18939., 18940., 18941., 18942., 18943., 18944., 18945.,
        18946., 18947., 18948., 18949., 18950., 18951., 18952., 18953., 18954.,
        18955., 18956., 18957., 18958., 18959., 18960., 18961., 18962., 18963.,
        18964., 18965., 18966.], dtype=torch.float64)
tensor([18938., 18939., 18940., 18941., 18942., 18943., 18944., 18945., 18946.,
        18947., 18948., 18949., 18950., 18951., 18952., 18953., 18954., 18955.,
        18956., 18957., 18958., 18959., 18960., 18961., 18962., 18963., 18964.,
        18965., 18966., 18967.], dtype=torch.float64)
tensor([18939., 18940., 18941., 18942., 18943., 18944., 18945., 18946., 18947.,
        18948., 18949., 18950., 18951., 18952., 18953., 18954., 18955., 18956.,
        18957., 18958., 18959., 18960., 18961., 18962., 18963., 18964., 18965.,
        18966., 18967., 18968.], dtype=torch.float64)
tensor([18940., 18941., 18942., 18943., 18944., 18945., 18946., 18947., 18948.,
        18949., 18950., 18951., 18952.

In [214]:
r

Unnamed: 0,grp,timestamp,values
0,1.0,17170,17170
1,1.0,17171,17171
2,1.0,17172,17172
3,1.0,17173,17173
4,1.0,17174,17174
...,...,...,...
1877,1.0,19047,19047
1878,1.0,19048,19048
1881,1.0,19051,19051
1882,1.0,19052,19052


In [215]:
r.timestamp.max()-2 - min_prediction_idx

84

In [216]:
len(testing)

103

In [2]:
from importlib import reload

pytorch_forecasting = reload(pytorch_forecasting)

In [19]:
from logging import INFO, DEBUG

pytorch_forecasting.data.timeseries.logger.setLevel(INFO)

In [14]:
# Number of time steps in the synthetic single-group series.
size = 100000

# One long series with a uniformly random target.
r = pd.DataFrame(
    {
        'timestamp': np.arange(size),
        'target': np.random.rand(size),
        'grp': np.ones(size),
    }
)
rts = pytorch_forecasting.data.TimeSeriesDataSet(
    r,
    time_idx='timestamp',
    group_ids=['grp'],
    target='target',
    time_varying_known_reals=['timestamp'],
    time_varying_unknown_reals=['target'],
    max_encoder_length=2,
    max_prediction_length=2,
)

# Take the first sample and compare its first encoder target value with the
# first raw target value of the frame.
X, (y, _) = next(iter(rts))

a = X['encoder_target'][0].item()
print(a == r['target'][0])

True


In [15]:
dl = rts.to_dataloader(batch_size=10000)



In [16]:
import logging
from logging import INFO

logging.basicConfig(level=INFO)

In [20]:
pytorch_forecasting.data.timeseries.logger.setLevel(DEBUG)

In [21]:
for i, x in tqdm(dl):
    a = i

  0%|          | 0/9 [00:00<?, ?it/s]DEBUG:pytorch_forecasting.data.timeseries:__getitem__ idx 10920
DEBUG:pytorch_forecasting.data.timeseries:__getitem__ index time_first                0
time_last             99999
time_diff_to_next         1
index_start           10920
time                  10920
count                100000
sequence_id               0
index_end             10923
sequence_length           4
Name: 10920, dtype: int64
DEBUG:pytorch_forecasting.data.timeseries:Get item return
DEBUG:pytorch_forecasting.data.timeseries:__getitem__ idx 13988
DEBUG:pytorch_forecasting.data.timeseries:__getitem__ index time_first                0
time_last             99999
time_diff_to_next         1
index_start           13988
time                  13988
count                100000
sequence_id               0
index_end             13991
sequence_length           4
Name: 13988, dtype: int64
DEBUG:pytorch_forecasting.data.timeseries:Get item return
DEBUG:pytorch_forecasting.data.timeseries:_

KeyboardInterrupt: 

In [72]:
for i in rts:
    a = i

In [21]:
np.array([1, 1]).astype('float64').dtype.kind

'f'

In [22]:

# Ten-step single-group series with two random targets.
r = pd.DataFrame(
    {
        'timestamp': np.arange(10),
        'target_1': np.random.rand(10),
        'target_2': np.random.rand(10),
        'grp': np.ones(10),
    }
)
rts = TimeSeriesDataSet(
    r,
    time_idx='timestamp',
    group_ids=['grp'],
    target=['target_1', 'target_2'],
    time_varying_known_reals=['timestamp'],
    time_varying_unknown_reals=['target_1', 'target_2'],
    max_encoder_length=2,
    max_prediction_length=2,
)

# Print the raw first value of target_1, then the value the dataset returns for
# it in its first sample, and finally whether the two are exactly equal.
print(r['target_1'][0])
X, (y, _) = next(iter(rts))
a = X['encoder_target'][0][0].item()
print(a)
print(f'problem is fixed ? {a == r.target_1[0]}')

0.020584494295802447
0.020584494295802447
problem is fixed ? True


In [23]:
a = np.array([True, False])
a.dtype.kind

'b'

In [24]:
b = np.array([1, 2])
b.dtype.kind

'i'

In [25]:
torch.from_numpy(a).dtype

torch.bool

In [26]:
torch.from_numpy(b).dtype

torch.int32

In [27]:
torch.tensor(b, dtype=torch.long).dtype

torch.int64

In [28]:
torch.tensor(pd.Series(np.random.randint(0, 10, (10000))).to_numpy(dtype=np.int64), dtype=torch.long).dtype

torch.int64

In [29]:
torch.from_numpy(pd.Series(np.random.randint(0, 10, (10000))).to_numpy(dtype=np.int64)).dtype

torch.int64

In [30]:
% timeit torch.tensor(pd.Series(np.random.randint(0, 10, (10000))).to_numpy(dtype=np.int64), dtype=torch.long).dtype
% timeit torch.from_numpy(pd.Series(np.random.randint(0, 10, (10000))).to_numpy(dtype=np.int64).copy()).dtype


372 µs ± 57.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
300 µs ± 5.67 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
torch.int64 == torch.long

True

In [47]:
np.__version__

'1.22.3'

In [65]:
a = np.random.randint(1, 10, 10)

print(torch.from_numpy(a.astype(np.float64)).dtype)
print(torch.from_numpy(a.astype(np.int64)).dtype)


torch.float64
torch.int64


In [49]:
pl.__version__

'1.6.3'

In [60]:
# Shared random integer sample (10000 values in [0, 10)) for the conversion
# variants below. Note that this rebinds `r`, which earlier cells used for a DataFrame.
r = np.random.randint(0, 10, (10000))

# Four ways of turning a pandas Series into an int64 torch tensor; the expressions
# are kept exactly as written because the %timeit lines below measure these forms.
a = torch.tensor(pd.Series(r).to_numpy(dtype=np.int64), dtype=torch.int64)  # torch.tensor on an int64 array
b = torch.from_numpy(pd.Series(r).to_numpy(dtype=np.int64).copy())  # from_numpy on an explicit NumPy copy
c = torch.from_numpy(pd.Series(r).to_numpy(dtype=np.int64)).clone()  # from_numpy, then clone the tensor
d = torch.tensor(pd.Series(r).to_numpy(), dtype=torch.int64)  # to_numpy without dtype; dtype given to torch.tensor only

% timeit torch.tensor(pd.Series(r).to_numpy(dtype=np.int64), dtype=torch.long)
% timeit torch.from_numpy(pd.Series(r).to_numpy(dtype=np.int64).copy())
% timeit torch.from_numpy(pd.Series(r).to_numpy(dtype=np.int64)).clone()
% timeit torch.tensor(pd.Series(r).to_numpy(), dtype=torch.long)


106 µs ± 3.91 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
83.8 µs ± 9.3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
101 µs ± 6.35 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
96.2 µs ± 1.63 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [46]:
r = np.random.rand(10000)

% timeit a = torch.tensor(pd.Series(r).to_numpy(dtype=np.float64), dtype=torch.float64)
% timeit b = torch.from_numpy(pd.Series(r).to_numpy(dtype=np.float64).copy())
% timeit c = torch.from_numpy(pd.Series(r).to_numpy(dtype=np.float64)).clone()
% timeit d = torch.tensor(pd.Series(r).to_numpy(), dtype=torch.float64)



91.5 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
71.7 µs ± 1.13 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
86.1 µs ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
92.3 µs ± 1.52 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
% timeit[_ for _ in a]
% timeit b[12]
% timeit c[12]
% timeit d[12]

2.14 µs ± 22.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.13 µs ± 98.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.1 µs ± 74.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.19 µs ± 71.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [61]:
% timeit[_ for _ in a]
% timeit[_ for _ in b]
% timeit[_ for _ in c]
% timeit[_ for _ in d]

14.8 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
14.8 ms ± 803 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14.1 ms ± 729 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14.1 ms ± 708 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
d

NameError: name 'd' is not defined

In [33]:
a = np.random.rand(10)
a.dtype

dtype('float64')

In [34]:
b = a.astype(np.float32)

In [35]:
c = b.astype(np.float64).astype(np.float32)

In [36]:
a[0], b[0], c[0]

(0.17052793513817988, 0.17052794, 0.17052794)