In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, InputLayer
import numpy as np
import matplotlib.pyplot as plt
import joblib
import os
from tensorflow.keras.utils import Sequence

Creating a scaler based on a subset of the data

In [2]:
folder_path = 'C:/Users/camer/Documents/Masters Thesis/Data/Training data/1secbatch_quotelogs'

training_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

scaling_file = training_files[0]
scaling_data = pd.read_csv(scaling_file)
X_scaling = scaling_data.drop(columns=['quote_price', 'trader_type']).values
scaler = StandardScaler()
scaler.fit(X_scaling)

joblib.dump(scaler, 'scaler_quote_1sec_v2.joblib')

['scaler_quote_1sec_v2.joblib']

In [3]:
class DataGenerator(Sequence):
    def __init__(self, training_files, batch_size=1024, scaler=None, shuffle=False):
        self.training_files = training_files
        self.batch_size = batch_size
        self.scaler = scaler
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.training_files))
        self.current_file = None
        self.data_iterator = None
        self.on_epoch_end()

    def __len__(self):
        total_rows = sum([pd.read_csv(file, usecols=[0]).shape[0] for file in self.training_files])
        return int(np.ceil(total_rows / self.batch_size))

    def __getitem__(self, index):
        if self.data_iterator is None or self.current_file is None or not self.has_next_chunk():
            self.current_file = self.training_files.pop(0)
            self.data_iterator = pd.read_csv(self.current_file, chunksize=self.batch_size)

        chunk = next(self.data_iterator)
        X_batch = chunk.drop(columns=['quote_price', 'trader_type']).values
        y_batch = chunk['quote_price'].values

        if self.scaler:
            X_batch = self.scaler.transform(X_batch)

        X_batch = X_batch.reshape((-1, 1, X_batch.shape[1]))

        return X_batch.astype(np.float16), y_batch

    def has_next_chunk(self):
        try:
            _ = next(self.data_iterator)
            self.data_iterator = pd.read_csv(self.current_file, chunksize=self.batch_size)
            return True
        except StopIteration:
            return False

    def on_epoch_end(self):
        self.training_files = self.training_files + [self.current_file] if self.current_file else self.training_files
        if self.shuffle:
            np.random.shuffle(self.training_files)
        self.current_file = None
        self.data_iterator = None

In [4]:
train_files, test_files = train_test_split(training_files, test_size=0.1, random_state=42)

train_generator = DataGenerator(train_files, batch_size=1048576, scaler=scaler, shuffle=False)
test_generator = DataGenerator(test_files, batch_size=1048576, scaler=scaler, shuffle=False)

In [None]:
model = Sequential()
model.add(InputLayer(shape=(1, train_generator[0][0].shape[2]), dtype=tf.float16))
model.add(LSTM(10, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

history = model.fit(train_generator, epochs=20, validation_data=test_generator, verbose=1)

Epoch 1/20


  self._warn_if_super_not_called()


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - loss: 12078.2959

  self._warn_if_super_not_called()


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2428s[0m 6s/step - loss: 12077.8604 - val_loss: 12828.0908
Epoch 2/20


  self.gen.throw(typ, value, traceback)


In [None]:
joblib.dump(history.history, 'training_history_quote_log_v2.pkl')

In [None]:
model.save('Neural_network_models/quote_log_model_1sec_v2.keras')

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()