In [1]:
import numpy as np
from tqdm import tqdm
import keras
from keras import Input, Model
from keras.layers import Layer
from keras import backend as K
from keras import layers
from keras.preprocessing import timeseries_dataset_from_array

# from sklearn.metrics import accuracy_score, classification_report
# from keras.callbacks import ModelCheckpoint

from LobTransformer import TransformerBlock

In [2]:
# Load data

# download FI2010 dataset from 
# https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649
FI2010_DIR = r'D:\WORKS\translob\dataset\BenchmarkDatasets'

def gen_data(data, horizon):
    x = data[:40, :].T  # 40 == 10 price + volume asks + 10 price + volume bids
    # FIXME: delete .T
    y = data[-5 + horizon, :].T  # 5
    return x[:-1], (y[1:] - 1).astype(np.int32)  # shift y by 1


def load_dataset(horizon):
    dec_data = np.loadtxt(
        f'{FI2010_DIR}/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Training/Train_Dst_NoAuction_ZScore_CF_7.txt'
    )
    dec_train = dec_data[:, :int(np.floor(dec_data.shape[1] * 0.8))]
    dec_val = dec_data[:, int(np.floor(dec_data.shape[1] * 0.8)):]

    dec_test1 = np.loadtxt(
        f'{FI2010_DIR}/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Testing/Test_Dst_NoAuction_ZScore_CF_7.txt'
    )
    dec_test2 = np.loadtxt(
        f'{FI2010_DIR}/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Testing/Test_Dst_NoAuction_ZScore_CF_8.txt'
    )
    dec_test3 = np.loadtxt(
        f'{FI2010_DIR}/NoAuction/1.NoAuction_Zscore/NoAuction_Zscore_Testing/Test_Dst_NoAuction_ZScore_CF_9.txt'
    )
    dec_test = np.hstack((dec_test1, dec_test2, dec_test3))
    result = (
        gen_data(dec_train,horizon), 
        gen_data(dec_val,horizon), 
        gen_data(dec_test, horizon)) #yapf: disable
    return result

(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_dataset(horizon=4)

In [3]:
# test of fi2020 module from start
# a = np.array([
#     2615, 353, 2618, 200, 2619, 164, 2620, 532, 2621, 151, 2623, 837, 2625,
#     150, 2626, 787, 2629, 146, 2633, 311, 2606, 326, 2604, 682, 2602, 786,
#     2600, 893, 2599, 159, 2595, 100, 2593, 143, 2591, 134, 2588, 123, 2579, 128
# ])
# x_train[0]*a+a

array([3482.5004661 ,  152.23163124, 3474.47744336,   93.861558  ,
       3492.09621333,   43.0617752 , 3475.4918844 ,  304.13057864,
       3493.90835021,   23.33160192, 3477.74077363,  371.2583808 ,
       3498.510855  ,   37.064136  , 3479.84899444,  315.66830497,
       3503.08739616,    8.46771822, 3489.91982907,   44.21806086,
       3473.94724562,  172.09740816, 3444.77629104,  153.94877388,
       3469.64402948,  256.30380036, 3438.396936  ,  252.912781  ,
       3463.57411355,  105.88077597, 3431.21806785,   39.043824  ,
       3458.11757754,   76.03908312, 3423.25964425,   61.09721826,
       3455.97596592,   71.14003521, 3389.51349707,   64.3653376 ])

In [3]:
A=()

In [4]:
class PositionalEncodingLayer(Layer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x, *args, **kwargs):
        steps, d_model = x.get_shape()[-2:]
        global A 
        A= x,steps,d_model
        ps = np.zeros([steps, 1], dtype=K.floatx())
        for step in range(steps):
            ps[step, :] = [(2 / (steps - 1)) * step - 1]

        ps_expand = K.expand_dims(K.constant(ps), axis=0)
        ps_tiled = K.tile(ps_expand, [K.shape(x)[0], 1, 1])

        x = K.concatenate([x, ps_tiled], axis=-1)
        return x


def eval(model, X_test, y_test, **kwargs):
    ts = TimeseriesGenerator(X_test,
                             y_test,
                             kwargs.get('sequence_length', 100),
                             batch_size=32,
                             shuffle=False)
    y_true = np.concatenate([y for x, y in ts])
    y_pred = np.argmax(model.predict(ts), -1)
    print(classification_report(y_true, y_pred))
    return classification_report(y_true, y_pred,
                                 output_dict=True)['weighted avg']['f1-score']


In [5]:
# Parametrs
seq_len = 100
n_filters = 14
dilation_steps = 4 # max dilation = 2**dilation_steps
attention_heads = 1
blocks = 2
share_weights = True
dropout_rate = 0.1
lr = 0.0001
adam_beta1 = 0.9
adam_beta2 = 0.999
batch_size = 512
epochs = 150

In [6]:
# Model
inputs = Input(shape=(seq_len, 40))
x = inputs
for dilation in [2**dilation for dilation in range(dilation_steps + 1)]:
    x = layers.Conv1D(
        n_filters,
        kernel_size=2,
        dilation_rate=dilation,
        activation='relu',
        padding='causal',
    )(x)

norm = layers.LayerNormalization()(x)
x=norm
pos = PositionalEncodingLayer()(x)
x = pos
tb = TransformerBlock(
    'tb1',
    attention_heads,
    True,
)
blocks = blocks
for block in range(blocks):
    if share_weights:
        x = tb(x)
    else:
        x = TransformerBlock(
            f'transformer_block_{block}',
            attention_heads,
            True,
        )(x)

x = layers.Flatten()(x)
x = layers.Dense(64,
                 activation='relu',
                 kernel_regularizer='l2',
                 kernel_initializer='glorot_uniform')(x)
x = layers.Dropout(dropout_rate)(x)
out = layers.Dense(3, activation='softmax')(x)
model = Model(inputs=inputs, outputs=out)
model.summary()

model.compile(
    keras.optimizers.Adam(
        learning_rate=lr,
        beta_1=adam_beta1,
        beta_2=adam_beta2,
        name="Adam",
    ),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['sparse_categorical_accuracy'],
)
print(
    'Train',
    x_train.shape,
    y_train.shape,
    'Val',
    x_val.shape,
    y_val.shape,
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100, 40)]            0         []                            
                                                                                                  
 conv1d (Conv1D)             (None, 100, 14)              1134      ['input_1[0][0]']             
                                                                                                  
 conv1d_1 (Conv1D)           (None, 100, 14)              406       ['conv1d[0][0]']              
                                                                                                  
 conv1d_2 (Conv1D)           (None, 100, 14)              406       ['conv1d_1[0][0]']            
                                                                                              

In [8]:
# train_gen = TimeseriesGenerator(
#     data= x_train,
#     targets=y_train,
#     length=seq_len,
#     shuffle=True,
#     batch_size=batch_size,
# )
# val_gen = TimeseriesGenerator(
#     data= x_val,
#     targets=y_val,
#     length=seq_len,
#     batch_size=batch_size,
# )

In [7]:
import tensorflow as tf

In [104]:
v=10

ds_train = timeseries_dataset_from_array(
    data=x_train,
    targets=y_train,
    batch_size=v,
    sequence_length=seq_len,
    # shuffle=True,
)
# x= timeseries_dataset_from_array(x_train,None,

#     batch_size=2,
#     sequence_length=seq_len,
# )
# y =timeseries_dataset_from_array(y_train,None,
#     batch_size=2,
#     sequence_length=seq_len,
# )
# ds_val = timeseries_dataset_from_array(
#     data=x_val,
#     targets=y_val,
#     batch_size=batch_size,
#     sequence_length=seq_len,
#     shuffle=True,
# )

# Not working:
def foo(x,self):
    return tf.ensure_shape(x, [None,v,40])
ds_train.map(
    foo
    )
count={'step':0,'error':0}
for i in ds_train:
    if i[0].shape!=[v,100,40]:count['error']+=1
    count['step']+=1
    n= i
    # if count>396: break
count

{'step': 20370, 'error': 0}

In [21]:
def foo(x):
    if len(x.shape)==1:
       gen_dim =1
    else:
        gen_dim = x.shape[-1]
    
    line = range(len(x) - 100)
    iterable = (x[i:i + 100] for i in line)
    a = np.fromiter(
        iter=iterable,
        dtype=(np.float64, (100, gen_dim)),
        count=len(line),
    )
    
    line = range(0, len(a) - 512 + 1, 512)
    iterable = (a[i:i + 512] for i in line)
    b = np.fromiter(
        iter=iterable,
        dtype=(np.float64, (512, 100, gen_dim)),
        count=len(line),
    )
    return b

In [22]:
by =foo(y_train)

ValueError: Must specify length when using variable-size data-type.

In [12]:
b= foo(x_train)

In [8]:
train_gen = keras.preprocessing.sequence.TimeseriesGenerator(
    data= x_train,
    targets=y_train,
    length=seq_len,
    shuffle=True,
    batch_size=batch_size,
)
val_gen = keras.preprocessing.sequence.TimeseriesGenerator(
    data= x_val,
    targets=y_val,
    length=seq_len,
    batch_size=batch_size,
)

X_train,Y_train = train_gen[0]
dataset = tqdm(iterable=train_gen)
num = 0
for i in dataset:
    X_train = np.vstack([X_train,i[0]])
    Y_train = np.hstack([Y_train,i[1]])
    num +=1
    if num>10: break

  3%|▎         | 10/398 [00:00<00:25, 15.22it/s]


In [9]:
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    # validation_data=ds_val,
    # callbacks=[
    #     tf.keras.callbacks.TensorBoard(
    #         log_dir=("logs/scalars/" +
    #                  datetime.now().strftime("%Y%m%d-%H%M%S")), ),
    #     tf.keras.callbacks.EarlyStopping(
    #         monitor='val_sparse_categorical_accuracy',
    #         mode='max',
    #         patience=10,
    #         min_delta=0.0002,
    #     ),
    #     # ModelCheckpoint('mdl.hdf5', save_best_only=True, monitor='val_loss', mode='min'),
    # ],
)

Epoch 1/150
Epoch 2/150

KeyboardInterrupt: 

In [None]:
A