In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [105]:
def read_and_clean(path):
    
   
    df = pd.read_csv(path)


    # convert pandas df to datetime column from 09:15:00+02:00
    # df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S%z')
    del df['time']

    # drop columns with only NaN values
    df = df.dropna(axis=1, how='all')
    # drop rows with only NaN values
    df = df.dropna(axis=0, how='all')
    # drop rows containing any nan values
    df = df.dropna(axis=0, how='any')

    df.reset_index(inplace=True, drop=True)

    # convert to datetime from 5:34:11.166472
    df['receivetime'] = pd.to_datetime(df['receivetime'], format='%H:%M:%S.%f')

    
    df['time_diff'] = df['receivetime'].diff().dt.total_seconds()
    
    


    # get rows where the receivetime difference to the last row is larger than 30 seconds
    mask = (df['time_diff'] > 30)

    filler_rows = []
    # loop over rows in the mask with iterrows
    for index, row in df[mask].iterrows():
        # <- last 5 seconds
        # <- last index
        # <- gap 
        # <- index

        # insert average of last 5 seconds in the gap, divide gap into chunks
        # each chunk has a random gaus time difference to last row of last 5 seconds

        last_index = index - 1
        # select the rows in the last n seconds using the index
        seconds_mean = 15
        seconds_timediff = 100
        mask_last_n_seconds_for_weighted_mean = (df['receivetime'] > df['receivetime'][last_index] - pd.Timedelta(seconds=seconds_mean)) & (df['receivetime'] <= df['receivetime'][last_index])
        mask_last_n_seconds_for_gauss_timediff = (df['receivetime'] > df['receivetime'][last_index] - pd.Timedelta(seconds=seconds_timediff)) & (df['receivetime'] <= df['receivetime'][last_index])

        # select columns which dont have the name receivetime, time, or time_diff
        columns = [col for col in df.columns if col not in ['receivetime', 'time_diff']]

        # get exponentially weighted moving average for rows
        last_n_seconds_exp_weighted_mean = df[mask_last_n_seconds_for_weighted_mean][columns].ewm(span=df[mask_last_n_seconds_for_weighted_mean].shape[0]).mean()
        last_n_seconds_exp_weighted_mean = last_n_seconds_exp_weighted_mean.iloc[-1]
        last_n_seconds_timediffs = df[mask_last_n_seconds_for_gauss_timediff]['time_diff'].tolist()
        
        

        
        time = df.iloc[last_index]['receivetime']
        end_time = df.iloc[index]['receivetime']
        while True:
            # fit a gaussian distribution over the time differences of the last n seconds
            
            gaussian = 0
            while gaussian <= 0:
                gaussian = np.random.normal(np.mean(last_n_seconds_timediffs), np.std(last_n_seconds_timediffs), 1)[0]
            
            time = time + pd.Timedelta(seconds=gaussian)
            if time>= end_time:
                break
            else:
                row = last_n_seconds_exp_weighted_mean.copy()
                row['receivetime'] = time
                # add row to temp_df
                filler_rows.append(row)

        
    # merge temp_df with df
    del df['time_diff']
    filler_df = pd.DataFrame(filler_rows).reset_index(drop=True)
    # convert all rows which are int64 in df to int64 in filler_df
    for col in df.columns:
        if df[col].dtype == 'int64':
            filler_df[col] = filler_df[col].astype('int64')

    df = pd.merge(df, filler_df, how='outer')
    # sort df by receivetime
    df = df.sort_values(by=['receivetime'])
    # reset index
    df = df.reset_index(drop=True)


    # normalize all dataframe columns containing floats to range -1 1 in a for loop
    # df_raw = df.copy()
    df['receivetime'] = df['receivetime'].dt.hour / 24 + df['receivetime'].dt.minute / (24 * 60) + df['receivetime'].dt.second / (24 * 60 * 60) + df['receivetime'].dt.microsecond / (24 * 60 * 60 * 1000000)
    df['receivetime'] = (df['receivetime'] * 2) -1
    
    

    for col in df.columns:
        if col == 'receivetime':
            pass
        elif 'Offers' in col:
            pass
        else: # Price and Volume
            df[col] = (df[col] - df[col].mean()) / df[col].std()
            # df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min()) *2 -1
    return df

def pick_random_subsequences(df,n, len_x, len_y, predict_features):
    """
    df: dataframe
    n: number of subsequences to pick
    seq_len: length of each subsequence
    """
    # get random start indices
    start_indices = np.random.randint(0, df.shape[0] - (len_x+len_y), n)
    # get random end indices
    end_indices = start_indices + (len_x+len_y)
    # get the subsequence for each start and end index
    sequences = [df.iloc[start:end].values.tolist() for start, end in zip(start_indices, end_indices)]
    sequences = np.array(sequences)
    feature_indices = [df.columns.get_loc(col) for col in predict_features]

    X = sequences[:, :len_x, :]
    Y = sequences[:, len_x:, feature_indices]
    return X, Y


def make_train_dev_set(train_csvs, dev_csvs, num_subsequences_per_csv, len_x, len_y, predict_features:list[str], batch_size):
    assert num_subsequences_per_csv > batch_size, "num_subsequences_per_csv must be greater than batch_size"

    
    dfs_train = [read_and_clean(csv) for csv in train_csvs ] 
    dfs_dev = [read_and_clean(csv) for csv in dev_csvs ]

    def data_generator(dfs):
        
        for _ in range(num_subsequences_per_csv//batch_size+1):
            X, Y = [], []
            for df in dfs:
                Xdf, Ydf = pick_random_subsequences(df, batch_size, len_x, len_y, predict_features)
                X.append(Xdf)
                Y.append(Ydf)
            random_order = np.random.permutation(len(X))
            X = np.concatenate(X)[random_order]
            Y = np.concatenate(Y)[random_order]
            for x, y in zip(X, Y):
                yield x, y



    train_dataset = tf.data.Dataset.from_generator(lambda: data_generator(dfs_train), output_types=(tf.float32, tf.float32))
    dev_dataset = tf.data.Dataset.from_generator(lambda: data_generator(dfs_dev), output_types=(tf.float32, tf.float32))

    return train_dataset, dev_dataset

csvs = [
  
"xetrworker1/SAPSE_20042023DMY.csv",
"xetrworker1/SAPSE_21042023DMY.csv",
"xetrworker1/SAPSE_24042023DMY.csv",
"xetrworker1/SAPSE_25042023DMY.csv",
"xetrworker1/SAPSE_26042023DMY.csv",
"xetrworker1/SAPSE_27042023DMY.csv",
"xetrworker1/SAPSE_28042023DMY.csv",
]

Ty = 10
Tx = 100
predict_features = ['aPrice1','aUnit1','askOffers1','bidPrice1','bidUnit1','bidOffers1']
num_subsequences_per_csv = 100
# given the last Tx we want to predict the next Ty predict_features

batch_size = 32

train_csvs = csvs[:-1]
dev_csvs = csvs[-1:]
steps_per_epoch = (num_subsequences_per_csv*len(train_csvs))//batch_size
train, dev = make_train_dev_set(train_csvs, dev_csvs, num_subsequences_per_csv, Tx, Ty, predict_features, batch_size)

In [92]:
first_batch = next(train.batch(batch_size).take(1).as_numpy_iterator())
x, y = first_batch

2023-05-02 20:56:44.495555: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


In [93]:
x.shape, y.shape

((2, 100, 61), (2, 10, 6))

In [94]:
x_dim = x.shape[-1]
y_dim = y.shape[-1]

In [95]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [96]:
activation_dim = 32
reshaper_enc = Reshape((1, x_dim))
reshaper_dec = Reshape((1, y_dim))
LSTM_cell_encoder = LSTM(activation_dim, return_state = True)
LSTM_cell_decoder = LSTM(activation_dim, return_state = True)
densor = Dense(y_dim, activation='linear')

In [97]:
def lstm_model(Tx, Ty, LSTM_cell_encoder, LSTM_cell_decoder, densor, reshaper_enc, reshaper_dec, x_dim):
   
    activation_dim = LSTM_cell_encoder.units

    X = Input(shape=(Tx, x_dim))
    a0 = Input(shape=(activation_dim,), name='a')
    c0 = Input(shape=(activation_dim,), name='c')
    a = a0
    c = c0
    
    outputs = []

    # encoder 
    for t in range(Tx):
        x = X[:,t,:]
        x = reshaper_enc(x)
        a, _, c = LSTM_cell_encoder(x, initial_state=[a, c])
      
      
    last_y_out = densor(a)
    outputs.append(last_y_out)
    last_y_out = reshaper_dec(last_y_out)
   
    
    # Y = Input(shape=(Ty, y_dim))
    
    for t in range(Ty-1):
        a, _, c = LSTM_cell_decoder(last_y_out, initial_state=[a, c])
        last_y_out = densor(a)
        
        
        outputs.append(last_y_out)
        last_y_out = reshaper_dec(last_y_out)
    
   
    outputs = tf.transpose(outputs,[1,0,2])

  
        

    model = Model(inputs=[X, a0, c0], outputs=outputs)
    return model

In [98]:
model = lstm_model(Tx, Ty, LSTM_cell_encoder, LSTM_cell_decoder, densor, reshaper_enc, reshaper_dec, x_dim)

2023-05-02 20:56:45.228834: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-02 20:56:45.232849: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-02 20:56:45.235830: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [99]:
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 100, 61)]    0           []                               
                                                                                                  
 tf.__operators__.getitem_319 (  (None, 61)          0           ['input_7[0][0]']                
 SlicingOpLambda)                                                                                 
                                                                                                  
 tf.__operators__.getitem_318 (  (None, 61)          0           ['input_7[0][0]']                
 SlicingOpLambda)                                                                                 
                                                                                            

In [100]:
model.output_shape

(None, 10, 6)

In [101]:
x,y = train.batch(batch_size).take(1).as_numpy_iterator().next()

2023-05-02 20:57:30.308969: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


In [102]:
x.shape, y.shape

((2, 100, 61), (2, 10, 6))

In [103]:
from tensorflow.keras.optimizers.legacy import Adam
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])

  super().__init__(name, **kwargs)


In [106]:


def generator(dataset):
    for x, y in dataset.repeat().batch(batch_size):
        a0 = np.zeros((batch_size, activation_dim))
        c0 = np.zeros((batch_size, activation_dim))
        yield [x, a0, c0], y

model.fit(generator(train), epochs=50, verbose = 1, steps_per_epoch = steps_per_epoch)

2023-05-02 21:03:34.597880: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1/50


2023-05-02 21:03:35.908584: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/50
Epoch 3/50
Epoch 4/50

: 

: 