In [12]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

import torch

from sklearn.preprocessing import StandardScaler

from utilities import *
from data_factory.preprocessing import *

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from tqdm import tqdm, notebook
from IPython.display import display

%matplotlib inline
sns.set_style("whitegrid")
notebook.tqdm().pandas()

pl.seed_everything(42)

DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


0it [00:00, ?it/s]

Global seed set to 42


42

In [3]:
config = load_config("../config/config.yml")
config

{'device': 'cpu',
 'seed': False,
 'data': {'save': '../data/save/',
  'train_path': '../data/jpx-tokyo-stock-exchange-prediction/train_files/',
  'test_path': '../data/jpx-tokyo-stock-exchange-prediction/supplemental_files/',
  'financials': 'financials.csv',
  'stock_prices': 'stock_prices.csv',
  'options': 'options.csv',
  'secondary_stock_price': 'secondary_stock_price.csv',
  'trades': 'trades.csv'},
 'sliding_window': {'max_prediction_length': 10,
  'min_prediction_length': 5,
  'max_encoder_length': 50,
  'min_encoder_length': 50,
  'batch_size': 64},
 'model': {'name': 'gmm', 'path': './cache/', 'n_clusters': 4},
 'optimizer': {'name': 'adam',
  'epochs': 10,
  'params': {'lr': 0.001, 'regularization': 0.0001}}}

In [8]:
train_path = config['data']['train_path']
test_path = config['data']['test_path']
stock_prices = config['data']['stock_prices']

## Stock prices

In [9]:
df = pd.read_csv(f'{train_path}/{stock_prices}', parse_dates=["Date"])
df.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324


In [10]:
df_test = pd.read_csv(f'{test_path}/{stock_prices}', parse_dates=["Date"])
df_test.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False,-0.003263
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False,-0.008993


In [13]:
# Apply the same preparation to the train and test frames:
#   * `Timestamp`: integer time index produced by the '1d' converter of `date_to_timestamp`
#     from the int64 view of `Date`;
#   * `SupervisionFlag` becomes a categorical, `SecuritiesCode` a string.
to_daily_index = date_to_timestamp['1d']

for frame in (df, df_test):
    date_as_int64 = frame['Date'].values.astype(np.int64)
    frame['Timestamp'] = to_daily_index(date_as_int64).astype(int)
    frame['SupervisionFlag'] = frame['SupervisionFlag'].astype('category')
    frame['SecuritiesCode'] = frame['SecuritiesCode'].astype(str)

## Fill na

In [7]:
df.isna().sum(axis=0)

RowId                     0
Date                      0
SecuritiesCode            0
Open                   7608
High                   7608
Low                    7608
Close                  7608
Volume                    0
AdjustmentFactor          0
ExpectedDividend    2313666
SupervisionFlag           0
Target                  238
Timestamp                 0
dtype: int64

In [8]:
df_test.isna().sum(axis=0)

RowId                    0
Date                     0
SecuritiesCode           0
Open                   284
High                   284
Low                    284
Close                  284
Volume                   0
AdjustmentFactor         0
ExpectedDividend    111497
SupervisionFlag          0
Target                   0
Timestamp                0
dtype: int64

In [9]:
print(f'Missing targets train {df.Target.isna().sum()}')
df.dropna(subset=['Target'], inplace=True)
print(f'Missing targets test {df_test.Target.isna().sum()}')
df_test.dropna(subset=['Target'], inplace=True)

Missing targets train 238
Missing targets test 0


In [10]:
# Fill the remaining gaps in both the train and the test frame.
#
# * ExpectedDividend: missing values are replaced by 0.
# * OHLC prices: the last known price is carried forward *within each security*.
#   At this point the frames are still in file order (same date, successive security
#   codes - see the RowId sequence in the head() outputs above), so an ungrouped ffill
#   would copy the previous row, i.e. the prices of a different security.
#   Assumes rows are in chronological order within each security - TODO confirm.
PRICE_COLUMNS = ['Open', 'High', 'Low', 'Close']

for frame in (df, df_test):
    frame['ExpectedDividend'] = frame['ExpectedDividend'].fillna(0)

    filled_prices = frame.groupby('SecuritiesCode')[PRICE_COLUMNS].ffill()
    # A security whose very first rows are missing has nothing to carry forward;
    # back-fill those leading gaps (still per security) so that no NaN is left
    # for the sanity checks in the next cells.
    filled_prices = filled_prices.groupby(frame['SecuritiesCode']).bfill()
    frame[PRICE_COLUMNS] = filled_prices

In [11]:
df.isna().sum(axis=0).any()

False

In [12]:
df_test.isna().sum(axis=0).any()

False

### Create dataset

In [13]:
df_train = df.copy()
df_train.sort_values(by=['SecuritiesCode', 'Timestamp'], inplace=True)
df_train.reset_index(drop=True, inplace=True)

df_test.sort_values(by=['SecuritiesCode', 'Timestamp'], inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [14]:
# Window sizes, expressed in `Timestamp` steps.
# NOTE(review): config['sliding_window'] (displayed above) holds different values
# (prediction 10/5, encoder 50); the hard-coded ones below are what is actually used - confirm.
max_prediction_length = 10
min_prediction_length = 2  # For testing
max_encoder_length = 30

# Keep for training only the rows that lie more than `max_prediction_length` steps before
# the last timestamp of their own security. `transform('max')` returns a Series aligned on
# df_train's index, so the mask does not depend on the row order of df_train and never
# collapses into a wide DataFrame (which groupby.apply does when all groups have equal length).
last_timestamp = df_train.groupby('SecuritiesCode')['Timestamp'].transform('max')
is_training = df_train['Timestamp'] < (last_timestamp - max_prediction_length)

training = TimeSeriesDataSet(df_train[is_training], time_idx='Timestamp', target='Close', group_ids=['SecuritiesCode'],
                             allow_missing_timesteps=True,
                             static_categoricals=['SecuritiesCode'],
                             time_varying_unknown_reals=['Open', 'High', 'Low', 'Close', 'Volume'],
                             time_varying_unknown_categoricals=['SupervisionFlag'],
                             time_varying_known_reals=['Timestamp'],
                             # min_encoder_length=345,
                             max_encoder_length=max_encoder_length,
                             max_prediction_length=max_prediction_length,
                             # scalers={col: DummyScaler() for col in  ['Open', 'High', 'Low', 'Close', 'Volume', 'AdjustmentFactor', 'ExpectedDividend']}
                             # target_normalizer=GroupNormalizer(
                             #     groups=['SecuritiesCode'], transformation="softplus"
                             # ),  # use softplus and normalize by group
                             target_normalizer=None
                             # add_relative_time_idx=True,
                             # add_target_scales=True,
                             # add_encoder_length=True,
)


In [15]:
# Append `min_prediction_length` (currently 2) extra days to every security so that the last real day of the test set still gets a prediction.

def add_2_days(x: pd.DataFrame, n_days: int = 2):
    """Append ``n_days`` placeholder rows after the last timestamp of one security.

    Meant to be applied per security (e.g. through ``groupby('SecuritiesCode').apply``).
    The input frame is not modified.

    Parameters
    ----------
    x : pd.DataFrame
        Rows of a single security. Must contain the ``Timestamp`` and
        ``SecuritiesCode`` columns; ``Timestamp`` values must be unique.
    n_days : int, default 2
        Number of consecutive timestamps (``max + 1`` ... ``max + n_days``) to append.
        The default keeps the historical two-day behaviour.

    Returns
    -------
    pd.DataFrame
        The original rows followed by the new ones, with ``Timestamp`` back as a
        regular column. On every row ``SecuritiesCode`` is set to the code of the
        first input row, ``AdjustmentFactor`` to 1.0 and ``Date`` is recomputed from
        ``Timestamp`` interpreted as a number of days since the Unix epoch. Every
        other missing value (i.e. all remaining fields of the appended rows) is 0.
    """
    code = x['SecuritiesCode'].iloc[0]

    # set_index already returns a new frame, so the caller's data is left untouched.
    by_time = x.set_index('Timestamp', drop=True)
    last_step = by_time.index.max()
    extra_steps = [last_step + offset for offset in range(1, n_days + 1)]
    full_index = pd.Index(by_time.index.to_list() + extra_steps, name='Timestamp')

    # Re-indexing creates the extra rows, filled with NaN for now.
    extended = by_time.reindex(full_index).reset_index(drop=False)

    extended['SecuritiesCode'] = code
    extended['AdjustmentFactor'] = 1.
    extended['Date'] = pd.to_datetime(extended['Timestamp'], unit='D')
    # Whatever is still missing belongs to the appended rows: use 0 as a placeholder.
    extended.fillna(0, inplace=True)
    return extended

df_test_ext = df_test.groupby('SecuritiesCode').apply(add_2_days).reset_index(drop=True)
df_test_ext.head(1)

Unnamed: 0,Timestamp,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,18967,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900.0,1.0,0.0,False,-0.003263
1,18968,20211207_1301,2021-12-07,1301,2998.0,3065.0,2990.0,3065.0,19100.0,1.0,0.0,False,0.009820
2,18969,20211208_1301,2021-12-08,1301,3080.0,3080.0,3035.0,3055.0,11600.0,1.0,0.0,False,0.006483
3,18970,20211209_1301,2021-12-09,1301,3050.0,3085.0,3025.0,3085.0,11700.0,1.0,0.0,False,-0.006441
4,18971,20211210_1301,2021-12-10,1301,3100.0,3105.0,3050.0,3105.0,14700.0,1.0,0.0,False,-0.008104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115995,19047,20220224_9997,2022-02-24,9997,709.0,725.0,708.0,719.0,195600.0,1.0,0.0,False,0.001364
115996,19048,20220225_9997,2022-02-25,9997,725.0,738.0,724.0,733.0,170500.0,1.0,0.0,False,-0.001362
115997,19051,20220228_9997,2022-02-28,9997,731.0,737.0,726.0,734.0,288100.0,1.0,0.0,False,-0.030014
115998,19052,0,2022-03-01,9997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,0.000000


In [33]:
class TestDataLoader:
    """Iterate over a test dataset, stopping as soon as the group order restarts.

    Author's stated purpose: a trick to avoid predicting the same label twice, which
    happens because of how ``TimeSeriesDataSet`` builds its samples.

    ``batch_size`` MUST be ``df_test.Timestamp.unique().size``, i.e. the number of
    actual labels to predict for each group, so that every batch starts with a new group.

    Parameters
    ----------
    dataset : TimeSeriesDataSet
        Dataset to read from; only its ``to_dataloader`` method is used.
    batch_size : int
        Batch size forwarded to ``dataset.to_dataloader``.
    num_workers : int, default 12
        Number of workers forwarded to ``dataset.to_dataloader``.
    """

    # The annotation is a string so the class does not need TimeSeriesDataSet at definition time.
    def __init__(self, dataset: "TimeSeriesDataSet", batch_size, num_workers=12):
        # Build the loader from the dataset that was passed in, not from a notebook global.
        self.data_loader = dataset.to_dataloader(
            train=False, batch_size=batch_size, num_workers=num_workers, shuffle=False
        )

    def __iter__(self):  # Trick not to load some values twice
        """Yield ``(X, (y, w))`` batches while the first group id of each batch keeps increasing."""
        last_group = -1

        for X, (y, w) in self.data_loader:
            first_group = X['groups'][0]
            # A first group id that does not increase means the loader started over.
            if first_group <= last_group:
                break
            last_group = first_group
            yield X, (y, w)


# Stack the training history and the extended test frame into one frame, ordered by
# security and time, from which the test samples are built.
df_train_test = (
    pd.concat([df_train, df_test_ext])
    .sort_values(by=['SecuritiesCode', 'Timestamp'])
    .reset_index(drop=True)
)

# Test set: same configuration as `training`, with predictions starting no earlier
# than the first timestamp of the test frame.
first_test_step = df_test.Timestamp.min()
testing = TimeSeriesDataSet.from_dataset(
    training,
    df_train_test,
    predict=False,
    stop_randomization=True,
    min_prediction_idx=first_test_step,
    min_prediction_length=min_prediction_length,
)

# Validation set (predict=True), which means to predict the last
# max_prediction_length points in time for each series.
validation = TimeSeriesDataSet.from_dataset(
    training,
    df_train,
    predict=True,
    stop_randomization=True,
)

# Dataloaders for the model.
batch_size = 128  # set this between 32 to 128
n_workers = 12
n_test_steps = df_test.Timestamp.unique().size

train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=n_workers)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=n_workers)
test_dataloader = TestDataLoader(testing, batch_size=n_test_steps)

TypeError: object.__new__() takes exactly one argument (the type to instantiate)

In [17]:
%%time

for data in test_dataloader:
    pass


CPU times: user 9.35 s, sys: 2.75 s, total: 12.1 s
Wall time: 23.5 s


In [21]:
from torch.utils.data import DataLoader

In [26]:
dl = DataLoader(test_dataloader)

AttributeError: 'DataLoader' object has no attribute 'copy'

In [24]:
%%time

for data in dl:
    pass


TypeError: 'DataLoader' object is not subscriptable

In [None]:
all_timestamps = pd.Series(np.arange(df_test.Timestamp.min(), df_test.Timestamp.max() - 2))
valid = all_timestamps.isin(df_test.Timestamp.unique())
last_group = -1

for i, (X, (y, _)) in enumerate(test_dataloader):
    print(X['groups'][0])
    if X['groups'][0] <= last_group:
        break
    last_group = X['groups'][0]
    print(X['groups'][0])
    print(y.shape)


In [None]:
(df_test.Timestamp.max() -2 - df_test.Timestamp.min()) * df_train_test.SecuritiesCode.unique().size

In [None]:
(df_test.Timestamp.max() -2 - df_test.Timestamp.min())

In [None]:
df_test.Timestamp.unique().size


In [None]:
for y in a:
    print(y[:, :2])

## Baseline Model

#### Test

In [None]:
df_train_test[df_train_test.SecuritiesCode.isin(df_train_test.SecuritiesCode.unique()[:2])].SecuritiesCode.value_counts()

In [None]:
df_test_ext.Timestamp.unique().size - 2

In [None]:
e = df_train_test[df_train_test.Timestamp >= df_test.Timestamp.min()]
(e.Date.unique().size - min_prediction_length) * e.SecuritiesCode.unique().size

In [None]:
df_test.Timestamp.unique().size

In [None]:
test_dataloader = testing.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)


In [None]:
a = [y for x, (y, weight) in tqdm(iter(test_dataloader))]

In [None]:
for i in a:
    print(i.shape)

In [None]:
a_ = [y for x, (y, weight) in tqdm(iter(testing))]

In [None]:
for i in a_:
    print(i.shape)

In [None]:
for i in a_:
    print(i.numpy())

In [None]:
f = df_train_test.SecuritiesCode.isin(df_train_test.SecuritiesCode.unique()[:1])
f &= (df_train_test.Timestamp >= df_test.Timestamp.min() - max_encoder_length)

In [None]:
df_test.Timestamp.min()

In [None]:
df_train_test.loc[f, ['Close', 'Timestamp']]

In [None]:
df_test_ext.Timestamp.max() - df_test_ext.Timestamp.min()

In [None]:
min_prediction_length

In [None]:
print(len(a_))

In [None]:
sum([i.shape[0] for i in a])

In [None]:
a[0].shape, a[1].shape, a[-1].shape

In [None]:
len(a)*1280

In [None]:
(df_test.Date.unique().size) * df_test.SecuritiesCode.unique().size

In [None]:
# Collect the target tensor of every test batch and score the naive Baseline model on the
# same loader. The loader is passed to tqdm directly (not through iter()) so that the
# progress bar can show a total whenever the loader defines a length.
actuals = torch.cat([y for x, (y, weight) in tqdm(test_dataloader)])
baseline_predictions = Baseline().predict(test_dataloader)

# Mean absolute error of the baseline. It is printed explicitly: a bare expression in the
# middle of a cell is evaluated and then thrown away.
baseline_mae = (actuals - baseline_predictions).abs().mean().item()
print(baseline_mae)

# NumPy copies, used by the result frames built in the following cells.
baseline_predictions_np = baseline_predictions.cpu().detach().numpy()
actuals_np = actuals.cpu().detach().numpy()

In [None]:
baseline_predictions.shape

In [None]:
actuals.shape

In [None]:
df_test.SecuritiesCode.unique().size

In [None]:
max_prediction_length, max_encoder_length

In [None]:
# Build `f`, a boolean mask over the last `max_prediction_length` integer time steps of the
# test period, and prepare the frame that will receive the baseline results.
ts_max = df_test.Timestamp.max()
s = ts_max - max_prediction_length
# Every integer step in (s, ts_max], whether or not a row exists for it.
base = np.arange(s+1, s+max_prediction_length+1)
# Timestamps in that range that really have a row, taking security '1301' as the reference.
# NOTE(review): assumes every security has rows on the same dates as '1301' - confirm.
real = df_test[(df_test.Timestamp > ts_max - max_prediction_length) & (df_test.SecuritiesCode == '1301')]['Timestamp'][-max_prediction_length:]
# True where a step of `base` has a row for the reference security.
f = pd.Series(base).isin(real)

# Work on a copy so that df_test itself stays untouched.
df_test_res_baseline = df_test.copy()

# Placeholder columns; they are filled in by the following cells.
df_test_res_baseline['close_pred'] = np.nan
df_test_res_baseline['close_true'] = np.nan
df_test_res_baseline['target_t0_t-1_pred'] = np.nan
df_test_res_baseline['target_t0_t-1_true'] = np.nan

In [None]:
time_filter = df_test_res_baseline.Timestamp > ts_max - max_prediction_length

df_test_res_baseline.loc[(time_filter), 'close_pred'] = baseline_predictions_np[:, f].astype(np.float64).flatten()
df_test_res_baseline.loc[(time_filter), 'close_true'] = actuals_np[:, f].astype(np.float64).flatten()

In [None]:
(df_test_res_baseline[df_test_res_baseline.close_true.notna()].Close == df_test_res_baseline[df_test_res_baseline.close_true.notna()].close_true).all()

In [None]:
(df_test_res_baseline.groupby('SecuritiesCode').close_true.diff().astype(str) == df_test_res_baseline.groupby('SecuritiesCode').close_true.diff().reset_index(drop=True).astype(str)).all()

In [None]:
df_test_res_baseline.groupby('SecuritiesCode').close_true.diff().reset_index(drop=True)

In [None]:
df_test_res_baseline['target_t0_t-1_pred'] = df_test_res_baseline.groupby('SecuritiesCode').close_pred.diff().reset_index(drop=True)
df_test_res_baseline['target_t0_t-1_true'] = df_test_res_baseline.groupby('SecuritiesCode').close_true.diff().reset_index(drop=True)

In [None]:
# Rebuild the target as (Close[t+2] - Close[t+1]) / Close[t+1] from the one-step differences.
# All shifts are done per security: an ungrouped shift would pull the first rows of the
# next security into the last rows of the current one.
by_security = df_test_res_baseline.groupby('SecuritiesCode')

diff_pred_t2 = by_security['target_t0_t-1_pred'].shift(-2)
diff_true_t2 = by_security['target_t0_t-1_true'].shift(-2)
close_pred_t1 = by_security['close_pred'].shift(-1)
close_true_t1 = by_security['close_true'].shift(-1)

df_test_res_baseline['target_t2_t1_pred'] = diff_pred_t2
df_test_res_baseline['target_t2_t1_true'] = diff_true_t2

# The last rows of each security have no t+1 / t+2 value and therefore stay NaN.
df_test_res_baseline['target_pred'] = diff_pred_t2 / close_pred_t1
df_test_res_baseline['target_true'] = diff_true_t2 / close_true_t1

In [None]:
max_encoder_length

In [None]:
df_test_res_baseline.loc[:, ['target_t0_t-1_pred', 'target_t0_t-1_true', 'target_t2_t1_pred', 'target_t2_t1_true', 'Target', 'target_pred', 'target_true']]

In [None]:
## Keep only result which have been predicted
# `df_test_res` is first created much later (Results section), so on a fresh kernel it does
# not exist here. Derive it from the baseline frame built in the cells above instead.
is_predicted = df_test_res_baseline.Timestamp > ts_max - max_prediction_length
df_test_res = df_test_res_baseline[is_predicted].copy()
df_test_res

In [None]:
# Rank the securities within each date: 0 is the highest value (descending order),
# ties are broken by row order (method="first").
# NOTE(review): the ranking is computed from the ground-truth `Target` column, not from a
# prediction column, so the score computed from it would describe an oracle ranking - confirm intent.
df_test_res['Rank'] = (df_test_res.groupby("Date")["Target"].rank(ascending=False, method="first") - 1).astype(int)
df_test_res

In [None]:
from utilities.evaluation import calc_spread_return_sharpe

calc_spread_return_sharpe(df_test_res)

## Temporal Fusion Transformer

In [None]:
# configure network and trainer (this instance is only used by the learning-rate finder below)
# NOTE(review): the accelerator is hard-coded to 'gpu' while the loaded config shows
# device 'cpu' - confirm which one is intended.
trainer = pl.Trainer(
    accelerator='gpu',
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
)



tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=1,
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
    log_interval=-1
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
%%time

# find optimal learning rate
res = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)

print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()

#### Train the model

In [None]:
# configure network and trainer for the actual training run
# Stop when `val_loss` has not improved by at least 1e-4 for 10 consecutive validation checks.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

# NOTE(review): the accelerator is hard-coded to "gpu" while the loaded config shows
# device 'cpu' - confirm which one is intended.
trainer = pl.Trainer(
    accelerator="gpu", 
    max_epochs=1000,
    weights_summary="top",
    gradient_clip_val=0.1,
    limit_train_batches=30,  # comment in for training, running validation every 30 batches
    # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    # NOTE(review): presumably the learning-rate finder's suggestion divided by 3 - confirm
    # and consider reading it from `res.suggestion()` instead of pasting the number.
    learning_rate=3.235936569296285/3,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# fit network
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

In [None]:
# Notes (kept as comments so that this cell does not raise a SyntaxError when run):
# target
# 1 2 3 -> 4 5 6 -> Ranking
#
# 1. Define the ranking target
# 2. Rank

In [None]:
torch.__version__

### Results

In [None]:
# Gather the target tensor of every test batch into a single tensor.
target_batches = []
for _inputs, (batch_targets, _weights) in iter(test_dataloader):
    target_batches.append(batch_targets)
actuals = torch.cat(target_batches)

# Predict with the trained TFT on the same loader and report the mean absolute error.
predictions = tft.predict(test_dataloader)
absolute_error = (actuals - predictions).abs()
print(absolute_error.mean().item())

# NumPy copies, used by the result frames built in the following cells.
predictions_np = predictions.cpu().detach().numpy()
actuals_np = actuals.cpu().detach().numpy()

In [None]:
(actuals - baseline_predictions).abs().mean().item(), (actuals - predictions).abs().mean().item()

In [None]:
baseline_predictions_np[0][0], baseline_predictions[0][0].item()

In [None]:
baseline_predictions_np.shape, baseline_predictions.shape

In [None]:
predictions_np[0][0], predictions[0][0].item()

In [None]:
# Boolean mask `f` over the last `max_prediction_length` integer time steps of the test
# period: True where security '1301' actually has a row for that step.
ts_max = df_test.Timestamp.max()
s = ts_max - max_prediction_length
base = np.arange(s + 1, s + max_prediction_length + 1)

in_last_window = df_test.Timestamp > ts_max - max_prediction_length
is_reference_security = df_test.SecuritiesCode == '1301'
real = df_test[in_last_window & is_reference_security]['Timestamp'].iloc[-max_prediction_length:]
f = pd.Series(base).isin(real)

# One result frame for the TFT and one for the baseline, both starting as copies of df_test.
df_test_res = df_test.copy()
df_test_res_baseline = df_test.copy()

# Placeholder columns, created in the same order on both frames and filled in later.
placeholder_columns = (
    'target_t0_t-1_pred',
    'target_t0_t-1_true',
    'Target_pred',
    'Target_true',
)
for result_frame in (df_test_res, df_test_res_baseline):
    for column in placeholder_columns:
        result_frame[column] = np.nan



In [None]:
# Rows belonging to the last prediction window. Each frame gets its own mask: the baseline
# frame used to be masked with `df_test_res.Timestamp` *after* `df_test_res` had been
# reassigned by `shift()`, which is only correct if that helper keeps rows and index unchanged.
window_start = ts_max - max_prediction_length
predicted_rows = df_test_res.Timestamp > window_start
predicted_rows_baseline = df_test_res_baseline.Timestamp > window_start

# `f` keeps only the prediction steps that correspond to existing rows.
df_test_res.loc[predicted_rows, 'target_t0_t-1_pred'] = predictions_np[:, f].flatten()
df_test_res.loc[predicted_rows, 'target_t0_t-1_true'] = actuals_np[:, f].flatten()

# df_test_res['Target_true'] = df_test_res['target_t0_t-1_true'].shift(-2)
# NOTE(review): `shift` comes from the project's star imports; its exact effect on the
# frame is not visible in this notebook - confirm.
df_test_res = shift(df_test_res, from_='target_t0_t-1_true', to_='target_t0_t-1', shift=2)


df_test_res_baseline.loc[predicted_rows_baseline, 'target_t0_t-1_pred'] = baseline_predictions_np[:, f].flatten()
df_test_res_baseline.loc[predicted_rows_baseline, 'target_t0_t-1_true'] = actuals_np[:, f].flatten()




In [None]:
(df_test_res.Timestamp > ts_max - max_prediction_length).sum()

In [None]:
## Keep only result which have been predicted
df_test_res = df_test_res[df_test_res.Timestamp > ts_max - max_prediction_length].copy()
df_test_res_baseline = df_test_res_baseline[df_test_res_baseline.Timestamp > ts_max - max_prediction_length].copy()
df_test_res

In [None]:
# Rank the securities within each date for both result frames: 0 is the highest value
# (descending order), ties are broken by row order (method="first").
# NOTE(review): both rankings are computed from the `target_t0_t-1_true` column, i.e. from
# actual values rather than from the `_pred` columns, so both frames get the same ranking
# regardless of the model - confirm intent.
df_test_res['Rank'] = (df_test_res.groupby("Date")["target_t0_t-1_true"].rank(ascending=False, method="first") - 1).astype(int)
df_test_res_baseline['Rank'] = (df_test_res_baseline.groupby("Date")["target_t0_t-1_true"].rank(ascending=False, method="first") - 1).astype(int)

In [None]:
from utilities.evaluation import calc_spread_return_sharpe

calc_spread_return_sharpe(df_test_res), calc_spread_return_sharpe(df_test_res_baseline)