In [163]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import h5py
import numpy as np
from typing import List, Tuple
from pathlib import Path

from alber.load_data import read_order_book, read_trades, read_target
from alber.feature_generation import (
    book_preprocessor, 
    get_features_zscore, 
    get_features_ma, 
    get_features_stoch,
    retime_trades,
    decrease_mem_consuming
)
from alber.wf_splitting_data import create_oot, walk_forward_splitting

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. Parameters

In [140]:
# --- Storage -----------------------------------------------------------------
# Root directory for the raw inputs and for every artifact this notebook writes.
base = Path('../../Storage') / 'alber'

# --- Out-of-time hold-out ----------------------------------------------------
# Number of rows handed to `create_oot` as the size of the test segment.
count_test = 985_564

# --- Walk-forward cross-validation -------------------------------------------
# Passed positionally to `walk_forward_splitting`; the fold logs show a
# "val-train" subset of roughly this share of each training window.
train_val_ratio = 0.1
# Rows in each fold's validation block.
val_size = 440_000
# Number of walk-forward folds to generate.
num_folds = 10

# --- Output locations (passed to `walk_forward_splitting` together with `base`)
folds_path = Path('ret_1_10_folds_exp')
test_path = Path('ret_1_test_folds_1_exp')

# 1. Load order book and trades data

In [2]:
# The order book and the trades are read from the same HDF5 file.
raw_data_file = base / 'data.h5'
ob = read_order_book(raw_data_file)
trades = read_trades(raw_data_file)

  trades = pd.merge(trades, sum_w, on=['time'])
  trades = pd.merge(trades, sum_w, on=['time'])


In [3]:
ob.shape, trades.shape

((9871642, 9), (12522917, 4))

In [8]:
ob.head()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2
0,0.0,9787.75,9787.05,9792.39,9792.44,0.000264,0.045246,0.057008,0.639431
1,778.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7
2,1287.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7
3,1744.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706
4,2590.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706


In [62]:
# Preview of the trades table.
# NOTE(review): the recorded output below already contains feature columns
# (Ret_z, stoch_*, ...) that are only added in section 2, so this cell was last
# executed after feature generation. On a fresh top-to-bottom run it shows the
# raw trades frame only.
trades.head()

Unnamed: 0,time,price,size,order_count,id,Ret_z,Sprd_z,Sprd_Up_z,Sprd_Down_z,Money,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
0,273.0,9783.854924,0.2251,1,0,,,,,2202.345743,...,,,,,,,,,,
1,966.0,9789.988042,0.014648,1,0,,,,,143.403745,...,,,,,,,,,,
2,997.0,9789.98,0.021673,1,0,,,,,212.178237,...,,,,,,,,,,
3,1287.0,9782.122815,0.357,2,0,,,,,3492.217845,...,,,,,,,,,,
4,1380.0,9789.9,0.001045,1,0,,,,,10.230445,...,,,,,,,,,,


# 2. Feature generation

Book data

In [10]:
ob.shape

(9871642, 9)

In [11]:
# Derive order-book features (wap1, volume imbalances, weighted spreads, ...).
# NOTE(review): `ob_f` is not referenced again in this notebook. The next cells
# inspect `ob` itself, whose column count goes from 9 to 35 after this call, so
# `book_preprocessor` appears to add the columns to `ob` in place — confirm, and
# either drop `ob_f` or use it consistently downstream.
ob_f = book_preprocessor(ob)

In [12]:
ob.shape

(9871642, 35)

In [13]:
ob.head()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2,wap1,...,abs_volume_imbalance,rel_volume_ask_bid,rel_volume_ask_bid1,rel_volume_ask_bid2,rel_volume_ask,rel_volume_bid,bid_ask_w_spread1,bid_ask_w_spread2,bid_w_spread,bid_w_ask
0,0.0,9787.75,9787.05,9792.39,9792.44,0.000264,0.045246,0.057008,0.639431,9787.771388,...,0.650929,14.302988,214.939394,13.132321,-0.910846,-0.994165,215.041763,13.140104,-0.994165,-0.910846
1,778.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7,9786.394545,...,0.451857,0.393555,-0.166667,9.272515,0.285714,14.849023,-0.165993,9.281042,14.84904,0.285688
2,1287.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7,9786.394545,...,0.451857,0.393555,-0.166667,9.272515,0.285714,14.849023,-0.165993,9.281042,14.84904,0.285688
3,1744.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706,9789.293016,...,0.819862,1.111968,-0.915732,15.513253,-0.963757,6.102275,-0.915665,15.526589,6.102282,-0.963757
4,2590.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706,9789.293016,...,0.819862,1.111968,-0.915732,15.513253,-0.963757,6.102275,-0.915665,15.526589,6.102282,-0.963757


Trades data

In [15]:
# Attach a constant `id` column to the trades frame.
# Presumably a grouping key expected by the feature generators below — confirm.
trades = trades.assign(id=0)

In [16]:
%%time
# Z-score features on the trades table. The recorded run logged the series
# Ret, Sprd, Sprd_Up and Sprd_Down and took ~5h 48min of wall time.
# `trades` is rebound to the result, so re-running this cell feeds the already
# augmented frame back in — NOTE(review): confirm that is safe.
trades = get_features_zscore(trades)

Ret
Sprd
Sprd_Up
Sprd_Down
CPU times: user 5h 47min 7s, sys: 2min 39s, total: 5h 49min 47s
Wall time: 5h 47min 38s


In [17]:
%%time
# Moving-average features on the trades table. The recorded run logged the
# columns price, size, order_count and Money and took ~15h 32min of wall time.
# `trades` is rebound to the result (same re-run caveat as the z-score cell).
trades = get_features_ma(trades)

price
size
order_count
Money
CPU times: user 15h 26min 20s, sys: 5min 17s, total: 15h 31min 37s
Wall time: 15h 31min 41s


In [18]:
%%time
# Stochastic-oscillator features on the trades table (the stoch_k_*, stoch_d_*
# and rel_stoch_* columns visible in the previews). The recorded run took
# ~11h 39min of wall time. `trades` is rebound to the result.
trades = get_features_stoch(trades)

CPU times: user 11h 36min 32s, sys: 2min 21s, total: 11h 38min 54s
Wall time: 11h 38min 57s


In [63]:
trades.head()

Unnamed: 0,time,price,size,order_count,id,Ret_z,Sprd_z,Sprd_Up_z,Sprd_Down_z,Money,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
0,273.0,9783.854924,0.2251,1,0,,,,,2202.345743,...,,,,,,,,,,
1,966.0,9789.988042,0.014648,1,0,,,,,143.403745,...,,,,,,,,,,
2,997.0,9789.98,0.021673,1,0,,,,,212.178237,...,,,,,,,,,,
3,1287.0,9782.122815,0.357,2,0,,,,,3492.217845,...,,,,,,,,,,
4,1380.0,9789.9,0.001045,1,0,,,,,10.230445,...,,,,,,,,,,


In [22]:
# Checkpoint the trade features; the three generator cells above took more than
# 30 hours in total in the recorded run.
trades_file = base / 'trades.parquet.gzip'
trades.to_parquet(trades_file, compression="gzip")

Rescale trade features to order-book time

In [24]:
# Re-index the trade features onto the order-book timestamps. The result has
# one row per order-book row (9,871,642 in the recorded run) and carries the
# order book's `time` values.
# NOTE(review): the previews further down show an `id` column equal to -1 in
# both `trades_2` and `ob` after this call, so `retime_trades` appears to add or
# overwrite `id` on `ob` as a side effect — confirm.
trades_2 = retime_trades(trades, ob)

In [25]:
trades_2.shape

(9871642, 88)

In [26]:
trades_2.head()

Unnamed: 0,time,price,size,order_count,id,Ret_z,Sprd_z,Sprd_Up_z,Sprd_Down_z,Money,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
0,0.0,,,,-1,,,,,,...,,,,,,,,,,
1,778.0,9783.854924,0.2251,1.0,-1,,,,,2202.345743,...,,,,,,,,,,
2,1287.0,9789.98,0.021673,1.0,-1,,,,,212.178237,...,,,,,,,,,,
3,1744.0,9789.9,0.001045,1.0,-1,,,,,10.230445,...,,,,,,,,,,
4,2590.0,9789.86,0.01,1.0,-1,,,,,97.8986,...,,,,,,,,,,


In [27]:
ob.head()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2,wap1,...,rel_volume_ask_bid,rel_volume_ask_bid1,rel_volume_ask_bid2,rel_volume_ask,rel_volume_bid,bid_ask_w_spread1,bid_ask_w_spread2,bid_w_spread,bid_w_ask,id
0,0.0,9787.75,9787.05,9792.39,9792.44,0.000264,0.045246,0.057008,0.639431,9787.771388,...,14.302988,214.939394,13.132321,-0.910846,-0.994165,215.041763,13.140104,-0.994165,-0.910846,-1
1,778.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7,9786.394545,...,0.393555,-0.166667,9.272515,0.285714,14.849023,-0.165993,9.281042,14.84904,0.285688,-1
2,1287.0,9782.08,9782.07,9789.99,9790.19,1.08,0.068143,0.9,0.7,9786.394545,...,0.393555,-0.166667,9.272515,0.285714,14.849023,-0.165993,9.281042,14.84904,0.285688,-1
3,1744.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706,9789.293016,...,1.111968,-0.915732,15.513253,-0.963757,6.102275,-0.915665,15.526589,6.102282,-0.963757,-1
4,2590.0,9782.09,9782.08,9789.9,9789.98,0.646307,0.091,0.054463,1.502706,9789.293016,...,1.111968,-0.915732,15.513253,-0.963757,6.102275,-0.915665,15.526589,6.102282,-0.963757,-1


In [28]:
trades.head()

Unnamed: 0,time,price,size,order_count,id,Ret_z,Sprd_z,Sprd_Up_z,Sprd_Down_z,Money,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
0,273.0,9783.854924,0.2251,1,0,,,,,2202.345743,...,,,,,,,,,,
1,966.0,9789.988042,0.014648,1,0,,,,,143.403745,...,,,,,,,,,,
2,997.0,9789.98,0.021673,1,0,,,,,212.178237,...,,,,,,,,,,
3,1287.0,9782.122815,0.357,2,0,,,,,3492.217845,...,,,,,,,,,,
4,1380.0,9789.9,0.001045,1,0,,,,,10.230445,...,,,,,,,,,,


In [75]:
# Join order-book features with the re-timed trade features on `time`.
#
# The helper column `id` is dropped inside the call instead of rebinding `ob`
# and `trades_2`. The previous version overwrote both frames, so running the
# cell a second time raised KeyError because `id` was already gone.
#
# NOTE: `time` is not unique in either input (the recorded run yields 9,903,634
# merged rows for 9,855,646 distinct timestamps), so this inner join multiplies
# rows for repeated timestamps; the duplicates are removed a few cells below.
features = pd.merge(
    ob.drop(columns=['id'], errors='ignore'),
    trades_2.drop(columns=['id'], errors='ignore'),
    on=['time'],
)

In [76]:
# Compare the number of distinct timestamps with the number of merged rows; a
# mismatch means the join on `time` produced duplicate rows.
features.time.unique().shape, features.shape

((9855646,), (9903634, 121))

In [77]:
features.tail()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2,wap1,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
9903629,6479994731,8233.12,8230.8,8233.4,8234.5,0.048584,0.001,0.5,1.230785,8233.144798,...,2.999965,1.721196,36.99647,0.046523,1.757695,16.977043,0.103533,0.899231,16.718701,0.053786
9903630,6479995854,8229.38,8229.26,8233.13,8233.2,0.4488,0.001,0.642737,0.63,8230.921863,...,1.151818,39.93357,47.075554,0.848285,37.449804,27.986023,1.338156,18.000748,21.227038,0.848006
9903631,6479996824,8230.6,8229.7,8233.03,8233.12,0.25,0.375,0.064,1.051843,8232.534713,...,1.151818,39.93357,47.075554,0.848285,37.449804,27.986023,1.338156,18.000748,21.227038,0.848006
9903632,6479997863,8231.8,8231.7,8232.42,8233.02,0.61,0.015,0.014595,0.252026,8232.405512,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739
9903633,6479998786,8231.84,8231.8,8232.41,8232.42,1.245423,0.49,0.243066,0.014595,8232.316921,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739


In [78]:
# Cast `time` to an explicit 64-bit integer and keep the first row for each
# timestamp.
#
# The builtin `int` maps to the platform's default NumPy integer, which is
# 32-bit on Windows with NumPy < 2.0; timestamps here reach ~6.48e9 and would
# not fit. Spelling out 'int64' makes the result identical on every platform.
features = (
    features
    .astype({'time': 'int64'})
    .drop_duplicates(subset=['time'])
    .reset_index(drop=True)
)

In [79]:
features.tail()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2,wap1,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
9855641,6479994731,8233.12,8230.8,8233.4,8234.5,0.048584,0.001,0.5,1.230785,8233.144798,...,2.999965,1.721196,36.99647,0.046523,1.757695,16.977043,0.103533,0.899231,16.718701,0.053786
9855642,6479995854,8229.38,8229.26,8233.13,8233.2,0.4488,0.001,0.642737,0.63,8230.921863,...,1.151818,39.93357,47.075554,0.848285,37.449804,27.986023,1.338156,18.000748,21.227038,0.848006
9855643,6479996824,8230.6,8229.7,8233.03,8233.12,0.25,0.375,0.064,1.051843,8232.534713,...,1.151818,39.93357,47.075554,0.848285,37.449804,27.986023,1.338156,18.000748,21.227038,0.848006
9855644,6479997863,8231.8,8231.7,8232.42,8233.02,0.61,0.015,0.014595,0.252026,8232.405512,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739
9855645,6479998786,8231.84,8231.8,8232.41,8232.42,1.245423,0.49,0.243066,0.014595,8232.316921,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739


In [80]:
features.shape

(9855646, 121)

In [81]:
features.dtypes

time                       int64
bid_price1               float64
bid_price2               float64
ask_price1               float64
ask_price2               float64
                          ...   
stoch_d_size_21_3        float64
rel_stoch_size_21_1_3    float64
stoch_k_size_42_1        float64
stoch_d_size_42_3        float64
rel_stoch_size_42_1_3    float64
Length: 121, dtype: object

In [82]:
# Reduce the memory footprint of the feature matrix. In the recorded run the
# float64 columns come back as float32 while `time` (passed in the second
# argument) stays int64.
# NOTE: float32 keeps about 7 significant digits, so prices are slightly
# perturbed (e.g. 8233.12 is stored as 8233.120117 in the preview below).
features = decrease_mem_consuming(features, ['time'])

In [83]:
features.dtypes

time                       int64
bid_price1               float32
bid_price2               float32
ask_price1               float32
ask_price2               float32
                          ...   
stoch_d_size_21_3        float32
rel_stoch_size_21_1_3    float32
stoch_k_size_42_1        float32
stoch_d_size_42_3        float32
rel_stoch_size_42_1_3    float32
Length: 121, dtype: object

In [84]:
features.tail()

Unnamed: 0,time,bid_price1,bid_price2,ask_price1,ask_price2,bid_size1,bid_size2,ask_size1,ask_size2,wap1,...,rel_stoch_price_42_1_3,stoch_k_size_14_1,stoch_d_size_14_3,rel_stoch_size_14_1_3,stoch_k_size_21_1,stoch_d_size_21_3,rel_stoch_size_21_1_3,stoch_k_size_42_1,stoch_d_size_42_3,rel_stoch_size_42_1_3
9855641,6479994731,8233.120117,8230.799805,8233.400391,8234.5,0.048584,0.001,0.5,1.230785,8233.144531,...,2.999965,1.721196,36.996471,0.046523,1.757695,16.977043,0.103533,0.899231,16.7187,0.053786
9855642,6479995854,8229.379883,8229.259766,8233.129883,8233.200195,0.4488,0.001,0.642737,0.63,8230.921875,...,1.151818,39.933571,47.075554,0.848285,37.449802,27.986023,1.338156,18.000748,21.227037,0.848006
9855643,6479996824,8230.599609,8229.700195,8233.030273,8233.120117,0.25,0.375,0.064,1.051843,8232.535156,...,1.151818,39.933571,47.075554,0.848285,37.449802,27.986023,1.338156,18.000748,21.227037,0.848006
9855644,6479997863,8231.799805,8231.700195,8232.419922,8233.019531,0.61,0.015,0.014595,0.252026,8232.405273,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739
9855645,6479998786,8231.839844,8231.799805,8232.410156,8232.419922,1.245423,0.49,0.243066,0.014595,8232.317383,...,0.182523,12.129506,17.928091,0.676561,11.346537,16.851345,0.673327,5.526459,8.142146,0.678739


In [85]:
# Persist the merged, de-duplicated and downcast feature matrix.
features_file = base / 'features.parquet.gzip'
features.to_parquet(features_file, compression="gzip")

# 3. Create vitrine (split into train/test)

In [113]:
features.shape

(9855646, 121)

In [114]:
# Build the "vitrine": the feature timestamps joined with the target, then
# labelled by `create_oot`, which receives `count_test` as the test size.
target = read_target(base / 'result.h5')

vitrine = create_oot(
    features[['time']].merge(target, on=['time']),
    count_test,
)

Splitting vitrine into train/test:
train_min_time == 0, train_max_time == 5838737647
test_min_time == 5838738540, test_max_time == 6479998786
unique_times_train == 8870082, 
unique_times_test == 985564, 



In [115]:
vitrine.shape

(9855646, 3)

In [116]:
vitrine.query('segment == "train"').shape, vitrine.query('segment == "test"').shape

((8870082, 3), (985564, 3))

In [124]:
# Quick sizing arithmetic on the train segment: half of it, a tenth of that
# half, and three such tenths.
# The row count is taken from `vitrine` instead of being pasted in as a
# literal, so the numbers stay correct if the data or `count_test` changes.
n_train = int((vitrine['segment'] == 'train').sum())
n_train // 2, n_train // 2 // 10, n_train // 2 // 10 * 3

(4435041, 443504, 1330512)

In [117]:
# Persist the vitrine (timestamps, target and train/test segment label).
vitrine_file = base / 'vitrine.parquet.gzip'
vitrine.to_parquet(vitrine_file, compression="gzip")

# 4. Split into folds

In [148]:
# Cross-validation folds are built from the train segment only, so the
# out-of-time test rows never enter a fold.
base_vitrine = vitrine.query('segment == "train"').reset_index(drop=True)

# Arguments are positional: train/val-train ratio, validation block size,
# number of folds, storage root, output sub-directory.
# Per the recorded log, the training window expands from fold to fold (each
# fold's train max equals the previous fold's val max) and every `val` block
# starts right after its training window ends.
# NOTE(review): the recorded output stops in the middle of fold 6 — re-run to
# confirm that all `num_folds` folds are produced.
walk_forward_splitting(
    base_vitrine,
    train_val_ratio,
    val_size,
    num_folds,
    base,
    folds_path,
)

fold №1
train::
 len == 4023074, min == 0, max == 2843965736
val-train::
 len == 447008, min == 2668, max == 2843934808
val::
 len == 440000, min == 2843966657, max == 3141326865



fold №2
train::
 len == 4419074, min == 0, max == 3141326865
val-train::
 len == 491008, min == 13784, max == 3141326232
val::
 len == 440000, min == 3141327748, max == 3427921761



fold №3
train::
 len == 4815074, min == 0, max == 3427921761
val-train::
 len == 535008, min == 17802, max == 3427916644
val::
 len == 440000, min == 3427922491, max == 3724779876



fold №4
train::
 len == 5211074, min == 0, max == 3724779876
val-train::
 len == 579008, min == 14856, max == 3724778889
val::
 len == 440000, min == 3724780909, max == 4045110033



fold №5
train::
 len == 5607074, min == 0, max == 4045110033
val-train::
 len == 623008, min == 3736, max == 4045104673
val::
 len == 440000, min == 4045110540, max == 4327365709



fold №6
train::
 len == 6003074, min == 0, max == 4327365709
val-train::
 len == 667008

In [149]:
# Final train/test split, produced with the same helper as the CV folds:
# the full vitrine (train + test) is used, the validation block size is set to
# `count_test` and a single fold is requested. In the recorded run the
# resulting `val` block (985,564 rows, times 5838738540..6479998786) matches the
# test segment reported when the vitrine was created.
# NOTE: this rebinds `base_vitrine`, which previously held the train-only frame.
base_vitrine = vitrine.reset_index(drop=True)

walk_forward_splitting(
    base_vitrine,
    train_val_ratio,
    count_test,
    1,
    base,
    test_path,
)

fold №1
train::
 len == 7983074, min == 0, max == 5838737647
val-train::
 len == 887008, min == 5780, max == 5838736765
val::
 len == 985564, min == 5838738540, max == 6479998786





In [None]:
# Reference copy of the `create_oot` log from section 3, kept as comments so
# that this cell is valid Python and "Run All" does not stop on a SyntaxError:
#
# Splitting vitrine into train/test:
# train_min_time == 0, train_max_time == 5838737647
# test_min_time == 5838738540, test_max_time == 6479998786
# unique_times_train == 8870082,
# unique_times_test == 985564,