In [None]:
#IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.preprocessing import StandardScaler

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used by the loading
# cell lives under this mount point, so this must run before reading the CSV.
drive.mount('/content/drive')

In [None]:
#OPENING DATA FROM DRIVE
# Read the raw order-book CSV and give its eight columns readable names.
path = '/content/drive/My Drive/Capstone/data/DATASET.csv'
df = pd.read_csv(path)
df.columns = ['Side', 'Received Time', 'API Time', '1st ID', 'Last ID', 'Price', 'Size', 'Style']

#REMOVING API IDs, then treating every 0 as missing and discarding any row
#that has at least one missing value.
df = (
  df
  .drop(['1st ID', 'Last ID'], axis = 1)
  .replace(0, np.nan)
  .dropna(how = "any", axis=0)
)

##############################################

#CLEANING DATASET
#SEPARATING DELTAS/SNAPSHOTS and BID/ASK SIDES

#DELTAS AND SNAPSHOTS
# Boolean masks replace the former iterrows()+drop() loops: each drop() call
# copied the whole frame, which made the split quadratic in the row count.
# The masks keep exactly the rows the loops kept.

#Deltas: every row whose Style is not "SNAPSHOT"
df_deltas = df[df['Style'] != "SNAPSHOT"]

#Snapshots: every row whose Style is not "DELTA"
df_snapshots = df[df['Style'] != "DELTA"]

###############################################

#DELTAS ASK/BID
# Vectorised filtering instead of dropping rows one at a time inside
# iterrows(), which re-copied the frame for every removed row.

#Ask: every delta row whose Side is not "BID"
df_ask_deltas = df_deltas[df_deltas['Side'] != "BID"]

#Bid: every delta row whose Side is not "ASK"
df_bid_deltas = df_deltas[df_deltas['Side'] != "ASK"]

###############################################

#SNAPSHOTS ASK/BID
# Vectorised filtering instead of dropping rows one at a time inside
# iterrows(), which re-copied the frame for every removed row.

#Ask: every snapshot row whose Side is not "BID"
df_ask_snapshots = df_snapshots[df_snapshots['Side'] != "BID"]

#Bid: every snapshot row whose Side is not "ASK"
df_bid_snapshots = df_snapshots[df_snapshots['Side'] != "ASK"]

###############################################

#Plotting Ask/Bid Deltas Together
# Draw the ask prices first, then overlay the bid prices on the same axes.
price_axes = df_ask_deltas['Price'].plot()
df_bid_deltas['Price'].plot(ax=price_axes)

In [None]:
# Number of trailing observations per side that are held out of training;
# the forecast and validation cells also produce and score this many predictions.
m_future = 900

In [None]:
# ASK SIDE
# Builds the natural log of consecutive price and size quotients for the ask
# deltas. The last m_future price quotients are held out for validation; the
# earlier price/size quotients form the two-column training matrix.

train_dates = df_ask_deltas['Received Time']
cols = list(df_ask_deltas)[3:5] #Columns 3-5 (not including 5) = price, volume
df_for_training_ask = df_ask_deltas[cols].astype(float)

# Quotients are taken between consecutive ROWS (by position). The index labels
# have gaps after the bid/snapshot rows were filtered out, so a label test such
# as `ind > 0` does not identify the first row and previously let the first
# quotient divide by the initial denominator of 0 (yielding inf).
ask_prices = df_for_training_ask['Price'].to_numpy()
ask_sizes = df_for_training_ask['Size'].to_numpy()
ask_ln_price = np.log(ask_prices[1:] / ask_prices[:-1])
ask_ln_size = np.log(ask_sizes[1:] / ask_sizes[:-1])

# Row k (k >= 1) is training data while k < len - m_future; quotient k sits at
# array position k - 1, hence the extra -1. Clamp so short inputs go to test.
ask_split = max(len(df_for_training_ask) - m_future - 1, 0)

ln_price_quotient_train = ask_ln_price[:ask_split].tolist()
ln_size_quotient_train = ask_ln_size[:ask_split].tolist()
ln_price_quotient_test = ask_ln_price[ask_split:].tolist()

df_ask_ln = np.column_stack((ln_price_quotient_train, ln_size_quotient_train))

print(len(df_ask_ln))
print(len(ln_price_quotient_test))

In [None]:
#ASK SIDE
#BUILDING TRAINING SETS
# Each sample is a window of the n_past preceding rows (all feature columns);
# its target is the price column n_future steps after the window start offset.

#FORECASTING 1 INSTANCE @ A TIME
#CONSIDERING 14 PREVIOUS INSTANCES IN MODEL
n_future = 1
n_past = 14

ask_feature_count = df_for_training_ask.shape[1]
ask_window_ends = range(n_past, len(df_ask_ln) - n_future + 1)

trainX = np.array([
  df_ask_ln[end - n_past:end, 0:ask_feature_count]
  for end in ask_window_ends
])
trainY = np.array([
  df_ask_ln[end + n_future - 1:end + n_future, 0]
  for end in ask_window_ends
])

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

In [None]:
#ASK MODEL
# Two stacked LSTM layers (64 then 32 units), dropout, and a dense output
# sized to the target width; trained with Adam on mean squared error.
ask_input_shape = (trainX.shape[1], trainX.shape[2])

model = Sequential([
  LSTM(64, activation='relu', input_shape=ask_input_shape, return_sequences=True),
  LSTM(32, activation='relu', return_sequences=False),
  Dropout(0.2),
  Dense(trainY.shape[1]),
])

model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
#ASK TRAINING
# Fit the ask model; 10% of the samples are set aside by Keras for validation.
ask_fit_options = dict(epochs=10, batch_size=16, validation_split=0.1, verbose=1)
history = model.fit(trainX, trainY, **ask_fit_options)

In [None]:
#ASK LOSS PLOT
# One curve per recorded metric of the ask training run.
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
  plt.plot(history.history[metric_key], label=curve_label)
plt.legend()

In [None]:
# BID SIDE
# Builds the natural log of consecutive price and size quotients for the bid
# deltas. The last m_future price quotients are held out for validation; the
# earlier price/size quotients form the two-column training matrix.

train_dates_b = df_bid_deltas['Received Time']
cols = list(df_bid_deltas)[3:5] #Columns 3-5 (not including 5) = price, volume
df_for_training_bid = df_bid_deltas[cols].astype(float)

# Quotients are taken between consecutive ROWS (by position). The index labels
# have gaps after the ask/snapshot rows were filtered out, so a label test such
# as `ind > 1` does not identify the first row and previously let the first
# quotient divide by the initial denominator of 0 (yielding inf).
bid_prices = df_for_training_bid['Price'].to_numpy()
bid_sizes = df_for_training_bid['Size'].to_numpy()
bid_ln_price = np.log(bid_prices[1:] / bid_prices[:-1])
bid_ln_size = np.log(bid_sizes[1:] / bid_sizes[:-1])

# Row k (k >= 1) is training data while k < len - m_future; quotient k sits at
# array position k - 1, hence the extra -1. Clamp so short inputs go to test.
bid_split = max(len(df_for_training_bid) - m_future - 1, 0)

ln_price_quotient_b_train = bid_ln_price[:bid_split].tolist()
ln_size_quotient_b_train = bid_ln_size[:bid_split].tolist()
ln_price_quotient_b_test = bid_ln_price[bid_split:].tolist()

df_bid_ln = np.column_stack((ln_price_quotient_b_train, ln_size_quotient_b_train))

print(len(df_bid_ln))
# Report the BID hold-out size (the ask list was printed here by mistake).
print(len(ln_price_quotient_b_test))

In [None]:
#BID SIDE
#BUILDING TRAINING SETS
# Each sample is a window of the n_past preceding rows (all feature columns);
# its target is the price column n_future steps after the window start offset.

#FORECASTING 1 INSTANCE @ A TIME
#CONSIDERING 14 PREVIOUS INSTANCES IN MODEL
n_future = 1
n_past = 14

bid_feature_count = df_for_training_bid.shape[1]
bid_window_ends = range(n_past, len(df_bid_ln) - n_future + 1)

trainX_b = np.array([
  df_bid_ln[end - n_past:end, 0:bid_feature_count]
  for end in bid_window_ends
])
trainY_b = np.array([
  df_bid_ln[end + n_future - 1:end + n_future, 0]
  for end in bid_window_ends
])

print('trainX shape == {}.'.format(trainX_b.shape))
print('trainY shape == {}.'.format(trainY_b.shape))

In [None]:
#BID MODEL
# Two stacked LSTM layers (64 then 32 units), dropout, and a dense output
# sized to the target width; trained with Adam on mean squared error.
bid_input_shape = (trainX_b.shape[1], trainX_b.shape[2])

model_b = Sequential([
  LSTM(64, activation='relu', input_shape=bid_input_shape, return_sequences=True),
  LSTM(32, activation='relu', return_sequences=False),
  Dropout(0.2),
  Dense(trainY_b.shape[1]),
])

model_b.compile(optimizer='adam', loss='mse')
model_b.summary()

In [None]:
#BID TRAINING
# Fit the bid model; 10% of the samples are set aside by Keras for validation.
bid_fit_options = dict(epochs=10, batch_size=16, validation_split=0.1, verbose=1)
history_b = model_b.fit(trainX_b, trainY_b, **bid_fit_options)

In [None]:
#BID LOSS PLOT
# One curve per recorded metric of the bid training run.
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
  plt.plot(history_b.history[metric_key], label=curve_label)
plt.legend()

In [None]:
import matplotlib.pyplot as plt

# Overlay the bid and ask loss curves on one figure for side-by-side comparison.
loss_curves = [
  (history_b.history['loss'], 'Bid Training Loss'),
  (history_b.history['val_loss'], 'Bid Validation Loss'),
  (history.history['loss'], 'Ask Training Loss'),
  (history.history['val_loss'], 'Ask Validation Loss'),
]
for curve_values, curve_label in loss_curves:
  plt.plot(curve_values, label = curve_label)
plt.legend()

In [None]:
#ASK FORECAST
# Produces one prediction per held-out observation so that forecast[i] lines up
# with ln_price_quotient_test[i] in the validation cell.

forecast_period_dates = pd.date_range(list(train_dates)[-1], periods=m_future, freq='1d').tolist()

# Previously this predicted on trainX[-m_future:], i.e. on windows whose targets
# are training samples, while the validation cell scores the result against the
# held-out quotients. Rebuild the full log-quotient series (same column order as
# the training matrix: Price, Size) and, for each of the last m_future rows, feed
# the n_past rows that immediately precede it.
ask_values = df_for_training_ask.to_numpy(dtype=float)
ask_ln_full = np.log(ask_values[1:] / ask_values[:-1])

ask_first_test_row = len(ask_ln_full) - m_future
if ask_first_test_row < n_past:
  raise ValueError(
    "Not enough ask observations: need at least n_past rows before the "
    "m_future held-out rows to build the forecast windows."
  )

testX = np.array([
  ask_ln_full[row - n_past:row]
  for row in range(ask_first_test_row, len(ask_ln_full))
])
forecast = model.predict(testX)

# print(forecast)

In [None]:
# ASK TICK OUTPUT
# Label each forecast "D" (down) when it is negative, otherwise "U" (up).

forecasted_tick = ["D" if predicted < 0 else "U" for predicted in forecast]

print(forecasted_tick)

In [None]:
# BID FORECAST
# Produces one prediction per held-out observation so that forecast_b[i] lines
# up with ln_price_quotient_b_test[i] in the validation cell.

forecast_period_dates_b = pd.date_range(list(train_dates_b)[-1], periods=m_future, freq='1d').tolist()

# Previously this predicted on trainX_b[-m_future:], i.e. on windows whose
# targets are training samples, while the validation cell scores the result
# against the held-out quotients. Rebuild the full log-quotient series (same
# column order as the training matrix: Price, Size) and, for each of the last
# m_future rows, feed the n_past rows that immediately precede it.
bid_values = df_for_training_bid.to_numpy(dtype=float)
bid_ln_full = np.log(bid_values[1:] / bid_values[:-1])

bid_first_test_row = len(bid_ln_full) - m_future
if bid_first_test_row < n_past:
  raise ValueError(
    "Not enough bid observations: need at least n_past rows before the "
    "m_future held-out rows to build the forecast windows."
  )

testX_b = np.array([
  bid_ln_full[row - n_past:row]
  for row in range(bid_first_test_row, len(bid_ln_full))
])
forecast_b = model_b.predict(testX_b)

# print(forecast_b)

In [None]:
# BID TICK OUTPUT
# Label each forecast "D" (down) when it is negative, otherwise "U" (up).
forecasted_tick_b = ["D" if predicted < 0 else "U" for predicted in forecast_b]

print(forecasted_tick_b)

In [None]:
# ASK TICK VALIDATION
# Scores direction agreement between the ask forecasts and the held-out
# log price quotients.

# The model emits one value per sample, so flatten to a 1-D vector.
predicted_ask = np.ravel(forecast)
actual_ask = np.asarray(ln_price_quotient_test, dtype=float)

# Score only the pairs that exist instead of indexing past the shorter input.
scored_ask = min(m_future, len(predicted_ask), len(actual_ask))

# Use the same rule as the tick output (negative = down, otherwise up) on both
# sides. The former `>= 0 ... or ... <= 0` test counted every unchanged-price
# tick (quotient exactly 0) as correct regardless of the forecast.
predicted_ask_up = predicted_ask[:scored_ask] >= 0
actual_ask_up = actual_ask[:scored_ask] >= 0

correct = int(np.sum(predicted_ask_up == actual_ask_up))
incorrect = scored_ask - correct

print("Correct: {}".format(correct))
print("Incorrect: {}".format(incorrect))
print("Total: {}".format(scored_ask))

In [None]:
# BID TICK VALIDATION
# Scores direction agreement between the bid forecasts and the held-out
# log price quotients.

# The model emits one value per sample, so flatten to a 1-D vector.
predicted_bid = np.ravel(forecast_b)
actual_bid = np.asarray(ln_price_quotient_b_test, dtype=float)

# Score only the pairs that exist instead of indexing past the shorter input.
scored_bid = min(m_future, len(predicted_bid), len(actual_bid))

# Use the same rule as the tick output (negative = down, otherwise up) on both
# sides. The former `>= 0 ... or ... <= 0` test counted every unchanged-price
# tick (quotient exactly 0) as correct regardless of the forecast.
predicted_bid_up = predicted_bid[:scored_bid] >= 0
actual_bid_up = actual_bid[:scored_bid] >= 0

correct = int(np.sum(predicted_bid_up == actual_bid_up))
incorrect = scored_bid - correct

print("Correct: {}".format(correct))
print("Incorrect: {}".format(incorrect))
print("Total: {}".format(scored_bid))