In [96]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, GRU, Dense, concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [123]:
x_train_path = '../data/X_train_N1.csv'
y_train_path = '../data/y_train_or.csv'

X_train = pd.read_csv(x_train_path)
Y_train = pd.read_csv(y_train_path)

In [124]:
X_train = X_train.sample(n=30000, random_state=1)

In [125]:
X_train.head(3)

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
2257520,22575,5,17,D,B,-0.01,0.0,0.02,400,84,False,-10
14536058,145360,4,44,D,B,0.01,0.01,0.03,2368,400,False,-100
4987443,49874,1,35,A,B,-0.04,-0.01,0.0,955,505,False,100


In [126]:
set(X_train['side'])

{'A', 'B'}

In [127]:
# One-Hot Encode 'action' and 'side' columns 
action_encoded = pd.get_dummies(X_train[['action']], dtype='int')
side_encoded = pd.get_dummies(X_train[['side']], dtype='int')

# Drop the original 'action' and 'side' columns and concatenate the one-hot encoded columns
X_train = pd.concat([X_train, action_encoded], axis=1)
X_train = pd.concat([X_train, side_encoded], axis=1)
X_train['trade'] = X_train['trade'].astype(int)
X_train = X_train.drop(['action'], axis=1)
X_train = X_train.drop(['side'], axis=1)

In [130]:
X_train.head()

Unnamed: 0,obs_id,venue,order_id,price,bid,ask,bid_size,ask_size,trade,flux,action_A,action_D,action_U,side_A,side_B
2257520,22575,5,17,-0.01,0.0,0.02,400,84,0,-10,0,1,0,0,1
14536058,145360,4,44,0.01,0.01,0.03,2368,400,0,-100,0,1,0,0,1
4987443,49874,1,35,-0.04,-0.01,0.0,955,505,0,100,1,0,0,0,1
13955863,139558,2,38,0.12,0.0,0.06,329,412,0,-1,0,1,0,1,0
522717,5227,4,14,-0.05,0.0,0.01,810,542,0,-100,0,1,0,0,1


In [None]:
# Check the distribution of obs_id
obs_id_counts = X_train['obs_id'].value_counts()
print("Distribution of events per obs_id:\n", obs_id_counts.describe())

# Print a sample of the obs_id counts to see the distribution
print("Sample of obs_id counts:\n", obs_id_counts.head(20))


Distribution of events per obs_id:
 count    27421.000000
mean         1.094052
std          0.309728
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: count, dtype: float64
Sample of obs_id counts:
 obs_id
41344     4
86543     4
31545     4
19025     4
150190    4
117513    4
133680    4
154434    4
62246     3
33046     3
17284     3
131128    3
147068    3
116094    3
91031     3
46092     3
112695    3
50955     3
138668    3
85587     3
Name: count, dtype: int64


In [121]:
# Reattach the obs_id column to the DataFrame
obs_id = X_train['obs_id']
X_train['obs_id'] = obs_id.values

In [132]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_features(df, max_sequence_length=4):
    # Group by obs_id and collect sequences
    grouped = df.groupby('obs_id')
    features = []

    for name, group in grouped:
        features.append(group.drop(columns=['obs_id']).values.tolist())
    
    # Pad sequences to the same length
    features_padded = pad_sequences(features, maxlen=max_sequence_length, padding='post', dtype='float32')

    return np.array(features_padded)

# Run the adjusted preprocessing step with padding
X_train_processed = preprocess_features(X_train, max_sequence_length=4)

#  Check the number of sequences
print(f"Number of sequences processed: {len(X_train_processed)}")

Number of sequences processed: 27421


In [133]:
# Preprocess the target variable
def preprocess_target(Y_train, num_sequences):
    return Y_train['eqt_code_cat'].values[:num_sequences]

Y_train_processed = preprocess_target(Y_train, len(X_train_processed))

In [134]:
# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_processed, Y_train_processed, test_size=0.2, random_state=42)


In [139]:
X_train[:5]

array([[[ 0.000e+00,  4.200e+01,  1.200e-01,  0.000e+00,  5.000e-02,
          4.120e+02,  5.270e+02,  0.000e+00,  1.000e+02,  1.000e+00,
          0.000e+00,  0.000e+00,  1.000e+00,  0.000e+00],
        [ 0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00],
        [ 0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00],
        [ 0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
          0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00]],

       [[ 4.000e+00,  4.800e+01, -8.000e-02,  1.000e-02,  5.000e-02,
          1.500e+02,  2.500e+02,  0.000e+00, -1.000e+02,  0.000e+00,
          1.000e+00,  0.000e+00,  0.000e+00,  1.000e+00],
        [ 0.000e+0

In [140]:
# Build the model using Sequential API
input_shape = (4, X_train.shape[2])
num_classes = 24

model = Sequential()

# Adding GRU layers
model.add(Input(shape=input_shape))
model.add(GRU(64, return_sequences=True))
model.add(GRU(64, return_sequences=True, go_backwards=True))

# Flattening the output from the GRU layers
model.add(tf.keras.layers.Flatten())

# Adding Dense layers
model.add(Dense(64, activation='selu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=3e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [143]:
# Train the model
batch_size = 1000
epochs = 18  # Adjust the number of epochs as necessary

model.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size=batch_size, epochs=epochs)

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x2590343c8e0>

In [144]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val, Y_val)
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.0494
