In [13]:
import hashlib
import logging
import sys

# NOTE(review): the first entry is a machine-specific absolute path; it is kept
# so the existing environment keeps working, but it is not portable.
sys.path.append("/home/cyprien/Documents/github/pytorch-forecasting")
sys.path.append('../../')

# The wildcard import stays ahead of the explicit project/third-party imports
# so that the explicit names below always win if the wildcard exports a clash.
from data_factory.preprocessing import *
from utilities.config import load_config

from pytorch_forecasting import TimeSeriesDataSet

# Configure logging through the `logging` module's own DEBUG constant rather
# than a bare `DEBUG` name, which only exists if a wildcard import happens to
# leak it into the notebook namespace.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [14]:
# Notebook-level switches, grouped in one cell so they are easy to find.
use_previous_files = False  # only defined here; no visible cell reads this flag
config_file = '../../config/config.yml'  # path of the YAML configuration file

In [15]:
# Load the configuration and pick the sub-section of the currently selected model.
config = load_config(config_file)

model = config['model']
model_config = config[model]

# Create variables from config
#  data loading
data_config = config['data']
save_folder = data_config['save']
train_file = data_config['train_path'] + data_config['stock_prices']
test_file = data_config['test_path'] + data_config['stock_prices']
#  TimeSeries settings
sliding_window = model_config['sliding_window']
max_prediction_length = sliding_window['max_prediction_length']
min_prediction_length = sliding_window['min_prediction_length']
max_encoder_length = sliding_window['max_encoder_length']
min_encoder_length = sliding_window['min_encoder_length']
batch_size = sliding_window['batch_size']

related_stocks = model_config['related_stock']
train_val_split = model_config['train_val_split']
scale = model_config['manual_scale']

# define file name for saving StockPricesLoader with specific config
# The md5 digest is only a fingerprint of the model settings (not a security
# measure). It is computed from the dict's string form, so it also depends on
# the key order of `model_config`.
hash_ = hashlib.md5(str(model_config).encode('utf-8')).hexdigest()
# Strip a trailing "/" from the folder first: the configured value may already
# end with one, which previously produced paths such as "save//export_<hash>.p".
export_file_name = f"{save_folder.rstrip('/')}/export_{hash_}.p"

logger.debug(f'Export file {export_file_name}')
logger.debug(f'Use config {config}')


DEBUG:__main__:Export file ../data/save//export_33330b8610da212bf1f1161f2e2ce8c7.p
DEBUG:__main__:Use config {'device': 'cpu', 'seed': False, 'model': 'temporal_fusion_transformer', 'data': {'save': '../data/save/', 'suppl': '../data/suppl/', 'train_path': '../data/jpx-tokyo-stock-exchange-prediction/train_files/', 'test_path': '../data/jpx-tokyo-stock-exchange-prediction/supplemental_files/', 'financials': 'financials.csv', 'stock_prices': 'stock_prices.csv', 'options': 'options.csv', 'secondary_stock_price': 'secondary_stock_prices.csv', 'trades': 'trades.csv', 'cosine': 'cosine_df.csv'}, 'rnn': {'sliding_window': {'max_prediction_length': 10, 'min_prediction_length': 10, 'max_encoder_length': 80, 'min_encoder_length': 80, 'batch_size': 64}, 'train_val_split': 1, 'related_stock': 2, 'manual_scale': True, 'hidden_size': 20, 'layers': 3, 'dropout': 0}, 'temporal_fusion_transformer': {'sliding_window': {'max_prediction_length': 5, 'min_prediction_length': 5, 'max_encoder_length': 150, '

### Load

In [16]:
from data_factory.prepared_data import PreparedData

# Load the PreparedData bundle from disk and unpack its three splits.
augmented_data_path = "../../data/save/augmented_data.pkl"
data: PreparedData = PreparedData.from_file(augmented_data_path)
df_train, df_val, df_test = data.train, data.val, data.test

# Create the TimeSeriesDataSet objects (indexed by the `Timestamp_1` column)

#### Train

In [17]:
# Column roles handed to the TimeSeriesDataSet.
target = 'Close_scaled'
time_idx = 'Timestamp_1'

static_categoricals = ['SecuritiesCode', 'sector_group', 'business_group', 'financial_group']

time_varying_known_categoricals = ['dayofweek']
time_varying_unknown_categoricals = ['SupervisionFlag']

# Columns that must not be fed to the model as continuous features.
unused = [
    'Date', 'RowId', 'Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp', 'Timestamp_1',
    'SupervisionFlag', 'Target', 'authentic', 'is_testing', 'is_val'
]

# Every remaining column is treated as a time-varying unknown real.
# The list is built by walking df_train.columns in order instead of converting
# a set to a list: set iteration order for strings can change between Python
# sessions, which would silently reorder the continuous features from one
# kernel run to the next.
non_real_columns = (
    set(unused)
    | set(static_categoricals)
    | set(time_varying_known_categoricals)
    | set(time_varying_unknown_categoricals)
    | {time_idx}
)
time_varying_unknown_reals = [col for col in df_train.columns if col not in non_real_columns]
time_varying_unknown_reals, len(time_varying_unknown_reals)

(['predict_target_topb_1',
  'Close_scaled_tops_2',
  'Close_scaled_topb_2',
  'predict_target_tops_0',
  'predict_target',
  'High_scaled',
  'Close_scaled_topb_1',
  'Close_scaled_topf_0',
  'predict_target_ewm_10',
  'predict_target_ewm_3',
  'predict_target_topf_2',
  'Close_scaled_topb_0',
  'Close_scaled_tops_1',
  'predict_target_tops_1',
  'Close_scaled_topf_1',
  'Close_scaled_ewm_3',
  'Open_scaled',
  'predict_target_topf_1',
  'Low_scaled',
  'predict_target_topf_0',
  'Close_scaled_ewm_10',
  'predict_target_topb_2',
  'Close_scaled_tops_0',
  'predict_target_tops_2',
  'Close_scaled',
  'Volume_scaled',
  'Close_scaled_topf_2',
  'predict_target_topb_0',
  'ExpectedDividend',
  'AdjustmentFactor'],
 30)

In [18]:
# Cast the static categorical columns of every split to `str`.
# The columns are replaced via `frame[cols] = ...` rather than written through
# `frame.loc[:, cols] = ...`: the `.loc` form assigns into the existing columns,
# and whether they actually end up holding strings (or trigger an
# incompatible-dtype warning) depends on the pandas version. Replacing the
# columns always yields the new dtype, and re-running the cell is harmless.
for frame in (df_train, df_val, df_test):
    frame[static_categoricals] = frame[static_categoricals].astype(str)

In [19]:
# Which column plays which role in the dataset.
column_roles = dict(
    time_idx=time_idx,
    target=target,
    group_ids=['SecuritiesCode'],
    static_categoricals=static_categoricals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    time_varying_unknown_categoricals=time_varying_unknown_categoricals,
    time_varying_known_reals=[],
    time_varying_known_categoricals=time_varying_known_categoricals,
)

# Encoder / prediction window bounds taken from the model configuration.
window_lengths = dict(
    min_encoder_length=min_encoder_length,
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    min_prediction_length=min_prediction_length,
)

# Every real feature except the target is mapped to a `None` scaler
# (presumably "leave the column as it is" -- confirm against the library docs).
feature_scalers = dict.fromkeys(set(time_varying_unknown_reals) - {target})

# The relative time index is only requested for the temporal fusion transformer.
wants_relative_time_idx = (model == 'temporal_fusion_transformer')

df_train_timeseries = TimeSeriesDataSet(
    df_train,
    allow_missing_timesteps=False,
    **column_roles,
    **window_lengths,
    scalers=feature_scalers,
    target_normalizer=None,
    add_relative_time_idx=wants_relative_time_idx,
    add_target_scales=False,
    add_encoder_length=False,
)

#### Validation

In [20]:
# Build the validation dataset from the training dataset's definition; both
# prediction-length bounds are pinned to `max_prediction_length`.
validation_overrides = dict(
    predict=False,
    stop_randomization=True,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
)
df_val_timeseries = TimeSeriesDataSet.from_dataset(df_train_timeseries, df_val, **validation_overrides)

#### Test

In [21]:
# Smallest Timestamp_1 among the rows flagged `is_testing`; predictions are
# only allowed to start one step after it.
first_test_timestamp = df_test.loc[df_test.is_testing == True, 'Timestamp_1'].min()

test_overrides = dict(
    allow_missing_timesteps=False,
    predict=False,
    stop_randomization=True,
    min_prediction_idx=first_test_timestamp + 1,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
)
df_test_timeseries = TimeSeriesDataSet.from_dataset(df_train_timeseries, df_test, **test_overrides)

In [22]:
# Batch size for the test set, meant to predict one SecuritiesCode at a time
# (introduced to make debugging easier). It is the inclusive span of
# Timestamp_1 values over the rows flagged `is_testing`.

test_timestamps = df_test.loc[df_test.is_testing == True, 'Timestamp_1']
test_set_size = test_timestamps.max() - test_timestamps.min() + 1

### DataLoaders
*Sanity check only: these DataLoaders are built to confirm that the datasets can be batched; they are not exported.*

In [23]:
# Worker count shared by all three loaders (previously repeated as a literal).
num_workers = 12

train_dl = df_train_timeseries.to_dataloader(train=True, batch_size=batch_size, num_workers=num_workers)

val_dl = df_val_timeseries.to_dataloader(train=False, batch_size=batch_size, num_workers=num_workers, shuffle=False)

# `train=False` is passed explicitly, as for the validation loader. When it is
# omitted, `to_dataloader` falls back to its training-mode defaults, which --
# even with `shuffle=False` -- can drop the last incomplete batch of test
# samples.
test_dl = df_test_timeseries.to_dataloader(
    train=False,
    batch_size=test_set_size,
    num_workers=num_workers,
    shuffle=False
)

### Export

In [24]:
from data_factory.prepared_data import TimeSeriesData

# Bundle the three datasets together with the test batch size so they can be
# saved as a single object.
data_ts = TimeSeriesData(
    df_train_timeseries,
    df_val_timeseries,
    df_test_timeseries,
    test_set_size,
)

In [25]:
# Write the bundled datasets to the shared save folder.
timeseries_export_path = '../../data/save/timeseries_data.pkl'
data_ts.export(timeseries_export_path)

#### Experiments

In [26]:
# Peek at the first training sample only: print the target shape, the keys of
# the feature dict and the shape of the continuous-feature tensor, then stop.
for sample in df_train_timeseries:
    features, (y_sample, _) = sample

    print(y_sample.shape)
    print(features.keys())

    print(features['x_cont'].shape)

    break

torch.Size([5])
dict_keys(['x_cat', 'x_cont', 'encoder_length', 'decoder_length', 'encoder_target', 'encoder_time_idx_start', 'groups', 'target_scale'])
torch.Size([155, 31])
