# ML Models

# 1.- Libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates 
from datetime import datetime, timedelta
import time
import tensorflow as tf
import os
import random
import tensorflow.keras as keras
from tensorflow.keras.callbacks import Callback, TensorBoard, ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Flatten,LeakyReLU, Dense, Dropout, GlobalMaxPooling2D, Activation, Input,LSTM, Reshape, Conv2D, MaxPooling2D,ConvLSTM2D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.compat.v1.keras.backend import set_session
from tensorflow.keras import utils
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from time import time


os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

# 2.- Functions

In [12]:
class My_Custom_Generator(keras.utils.Sequence) :
    """Keras ``Sequence`` that yields class-balanced batches of look-back windows.

    Each batch is sampled at random, with replacement: half of the samples end
    at an index drawn from ``positives`` and half at an index drawn from
    ``negatives``. The batch index handed in by Keras is ignored.

    Parameters
    ----------
    train : array-like
        Time-ordered inputs, sliced as ``train[i-500:i]`` for every sampled
        end index ``i`` (assumed shape ``(time, 42, 7)`` from the call sites
        in this notebook -- TODO confirm).
    labels : numpy.ndarray
        Integer-like class label per time step, indexed with the sampled
        end indices.
    positives, negatives : sequence of int
        Candidate end indices for each class. Callers are expected to pass
        indices >= 500 so that a full window precedes each of them.
    model : str
        ``"LSTM"`` selects the shortened 5-D output; any other value returns
        the full window unchanged.
    batch_size : int
        Nominal batch size; ``int(batch_size / 2)`` samples are drawn per class.
    """

    WINDOW = 500        # number of time steps in every sampled window
    LSTM_WINDOW = 100   # trailing time steps kept for the "LSTM" variant

    def __init__(self, train, labels, positives, negatives, model, batch_size) :
        self.train = train
        self.labels = labels
        self.batch_size = batch_size
        self.positives = positives
        self.negatives = negatives
        self.model = model


    def __len__(self) :
        """Return the number of batches per epoch as a plain Python ``int``."""
        # `np.int` was removed in NumPy 1.24; the builtin int is equivalent here.
        return int(np.ceil(len(self.train) / float(self.batch_size)))


    def __getitem__(self, idx) :
        """Return one random, class-balanced batch as ``(x, y, [None])``.

        ``idx`` is accepted for the ``Sequence`` protocol but not used.
        """
        per_class = int(self.batch_size / 2)
        sequence = random.choices(self.positives, k = per_class) + random.choices(self.negatives, k = per_class)

        batch_x = np.asarray([self.train[i - self.WINDOW:i] for i in sequence])

        # One-hot encode the labels. The width follows the largest label in the
        # batch, which is 1 whenever a positive sample is present.
        temp = self.labels[sequence].astype(int)
        batch_y = np.zeros((temp.size, temp.max()+1))
        batch_y[np.arange(temp.size),temp] = 1

        # NOTE(review): the trailing `[None]` is returned in the third tuple
        # slot exactly as before -- confirm the installed Keras version accepts
        # it as "no sample weights".
        if self.model == "LSTM":
            # Keep the last LSTM_WINDOW steps and append a channel axis. The
            # original reshaped with a *global* `batch_size`, which raised
            # NameError when that global was missing and failed for odd batch
            # sizes; the shape is now derived from the batch itself.
            recent = batch_x[:, self.WINDOW - self.LSTM_WINDOW:self.WINDOW, :, :]
            return np.reshape(recent, recent.shape + (1,)), batch_y,[None]
        else:
            return batch_x, batch_y, [None]

In [13]:
def calculate_file_time(idx):
    """Map a data-file index to the ``(month, day, hour)`` strings used in file names.

    Indices below 2 map to May 31st, hour 23. Every other index maps to June:
    day 1 while ``idx < 26`` and day 2 otherwise, with the hour cycling as
    ``(idx - 2) % 24``.

    NOTE(review): indices of 50 and above still report day 2 -- confirm that
    the data set never extends that far.
    """
    # Guard clause for the two leading files.
    if idx < 2:
        return "5", "31", "23"

    day_of_june = 1 if idx < 26 else 2
    hour_of_day = (idx - 2) % 24
    return "6", str(day_of_june), str(hour_of_day)

In [14]:
def _load_hour_input(idx):
    """Load the input array stored for file index ``idx`` and sanitise it."""
    month, day, hour = calculate_file_time(idx)
    path = "./01_Data/" + str(idx) + " Input hour 2020_"+month+"_"+day+"_"+hour+"_20.npy"
    # Swap the last two axes, then work on absolute values.
    data = np.abs(np.load(path).transpose(0, 2, 1))
    data[data > 1] = 1          # cap magnitudes at 1
    data[np.isnan(data)] = 0    # missing values become 0
    return data


def _load_hour_labels(idx):
    """Load the label array stored for file index ``idx``."""
    return np.load("./01_Data/" + str(idx) + " Labels.npy")


def read_data(idx):
    """Load one file for training and the following file for validation.

    Parameters
    ----------
    idx : int
        Index of the training file; file ``idx + 1`` is used for validation.

    Returns
    -------
    tuple
        ``(train_input, train_labels, val_input, val_labels)``. Inputs are
        transposed with ``(0, 2, 1)``, made absolute, capped at 1 and have NaNs
        replaced by 0. Labels are returned exactly as stored.

    Raises
    ------
    OSError
        Propagated from ``np.load`` when a required file is missing.
    """
    train_input = _load_hour_input(idx)
    train_labels = _load_hour_labels(idx)

    val_input = _load_hour_input(idx + 1)
    val_labels = _load_hour_labels(idx + 1)

    return train_input,train_labels,val_input,val_labels

In [15]:
def calculate_callback(idx, train_input, p, n, val_in, val_lab, positives, negatives, folder):
    """Build the Keras callbacks used for one training run.

    Parameters
    ----------
    idx : int
        Run identifier; used in the checkpoint file name and the TensorBoard
        log directory. Run 10 additionally writes the model graph.
    train_input, p, n, val_in, val_lab, positives, negatives
        Accepted so existing call sites keep working; not used here.
    folder : str
        Sub-folder (below ``00_Models`` and ``02_TB_logs``) for this model family.

    Returns
    -------
    list
        ``[ModelCheckpoint, TensorBoard, EarlyStopping]``.
    """
    run_name = str(idx)

    # os.path.join yields the same backslash paths as before on Windows and
    # valid paths elsewhere (the original hard-coded '\\' separators).
    checkpoint_path = os.path.join('00_Models', folder, run_name + '.h5')
    log_dir = os.path.join('.', '02_TB_logs', folder, run_name + "_" + '{}'.format(time()))

    # Only run 10 writes the graph; every other run also disables profiling.
    if idx == 10:
        tensorboard_options = {"histogram_freq": 0, "write_graph": True}
    else:
        tensorboard_options = {"histogram_freq": 0, "write_graph": False, "profile_batch": 0}

    return [
        # Keep only the weights with the best validation loss.
        ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True),
        TensorBoard(log_dir=log_dir, **tensorboard_options),
        # Stops on the *training* loss after one epoch without improvement.
        EarlyStopping(monitor='loss', min_delta = 0.00002, patience = 1),
    ]

# 2.- DeepLOB
Reference implementation: https://github.com/zcakhaa/DeepLOB-Deep-Convolutional-Neural-Networks-for-Limit-Order-Books/blob/master/jupyter/run_train_represent.ipynb

In [16]:
def create_deeplob(T, NF, CH, number_of_lstm):
    """Build and compile the DeepLOB-style CNN + inception + LSTM classifier.

    Parameters
    ----------
    T : int
        Number of time steps per sample (axis 1 of the input).
    NF : int
        Width of axis 2 of the input. The final reshape drops this axis, so
        the convolution stack must reduce it to 1 (it does for ``NF=42``, the
        value used in this notebook).
    CH : int
        Number of input channels.
    number_of_lstm : int
        Number of units in the LSTM layer.

    Returns
    -------
    tensorflow.keras.Model
        Model with a 2-way softmax output, compiled with Adam and categorical
        cross-entropy, tracking accuracy plus precision/recall of class 1.
    """
    def conv_leaky(tensor, filters, kernel, **conv_options):
        # Conv2D followed by LeakyReLU(0.01): the pairing used throughout.
        tensor = Conv2D(filters, kernel, **conv_options)(tensor)
        return keras.layers.LeakyReLU(alpha=0.01)(tensor)

    input_lmd = Input(shape=(T, NF, CH))

    # build the convolutional block: three stages, each one convolution across
    # axis 2 followed by two (4, 1) convolutions along the time axis
    conv_first1 = input_lmd
    width_stages = (
        ((1, 2), {"strides": (1, 2)}),
        ((1, 2), {"strides": (1, 2)}),
        ((1, 10), {}),
    )
    for width_kernel, width_options in width_stages:
        conv_first1 = conv_leaky(conv_first1, 32, width_kernel, **width_options)
        conv_first1 = conv_leaky(conv_first1, 32, (4, 1), padding='same')
        conv_first1 = conv_leaky(conv_first1, 32, (4, 1), padding='same')

    # build the inception module
    convsecond_1 = conv_leaky(conv_first1, 64, (1, 1), padding='same')
    convsecond_1 = conv_leaky(convsecond_1, 64, (3, 1), padding='same')

    convsecond_2 = conv_leaky(conv_first1, 64, (1, 1), padding='same')
    convsecond_2 = conv_leaky(convsecond_2, 64, (5, 1), padding='same')

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = conv_leaky(convsecond_3, 64, (1, 1), padding='same')

    convsecond_output = keras.layers.concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)

    # Drop axis 2 so the LSTM receives (time, features).
    # NOTE(review): the previous comment here mentioned MC dropout, but no
    # Dropout layer is applied at this point.
    conv_reshape = Reshape((int(convsecond_output.shape[1]), int(convsecond_output.shape[3])))(convsecond_output)

    # build the last LSTM layer
    conv_lstm = CuDNNLSTM(number_of_lstm)(conv_reshape)

    # build the output layer
    out = Dense(2, activation="softmax")(conv_lstm)
    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias.
    opt = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=['accuracy',
                                                                           tf.keras.metrics.Precision(name = "Precision", class_id=1),
                                                                           tf.keras.metrics.Recall(name = "Recall", class_id=1)])

    return model

In [17]:
# Build a DeepLOB model for 500-step windows of 42 x 7 inputs with 64 LSTM
# units, then print its layer table.
model = create_deeplob(T=500, NF=42, CH=7, number_of_lstm=64)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 500, 42, 7)] 0                                            
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 500, 21, 32)  480         input_7[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)       (None, 500, 21, 32)  0           conv2d_6[0][0]                   
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 500, 21, 32)  4128        leaky_re_lu_6[0][0]              
____________________________________________________________________________________________

In [None]:
# Expanding-window training of DeepLOB: step i trains a fresh model on files
# 1..i and validates on file i + 1 (plus later files if needed).

# NOTE(review): `batch_size` is only assigned in the VGG16 training cell further
# down, so on a fresh kernel this cell failed with NameError. 12 mirrors that
# cell -- confirm it is the value intended for DeepLOB.
batch_size = 12

for i in range(1,44):
    train_input_p,train_labels_p, val_input, val_labels = read_data(i)
    model =  create_deeplob(500, 42, 7, 64)
    if i == 1:
        train_input = train_input_p
        train_labels = train_labels_p
    else:
        train_input = np.append(train_input,train_input_p, axis = 0)
        train_labels = np.append(train_labels,train_labels_p)

    # Only indices with a full 500-step history can end a window.
    positives = np.where(train_labels[500:] == 1)[0] + 500
    negatives = np.where(train_labels[500:] == 0)[0] + 500
    positives_val = np.where(val_labels[500:] == 1)[0] + 500
    negatives_val = np.where(val_labels[500:] == 0)[0] + 500

    # Extend the validation set with following files until it holds enough
    # positive samples for half a batch.
    j = 1
    while len(positives_val) < batch_size // 2:
        if i+j>=44:
            print("NOT ENOUGH DATA TO FINISH")
            # Previously execution continued and tried to load files past the
            # last index. Stop extending instead; the generator samples with
            # replacement, so fewer positives still work if at least one exists.
            break
        _next_input, _next_labels, val_input_p, val_labels_p = read_data(i + j)
        val_input = np.append(val_input,val_input_p, axis = 0)
        val_labels = np.append(val_labels,val_labels_p)

        positives_val = np.where(val_labels[500:] == 1)[0] + 500
        negatives_val = np.where(val_labels[500:] == 0)[0] + 500

        j += 1

    callb = calculate_callback(i,train_input, positives, negatives, val_input, val_labels, positives_val, negatives_val, "DLOB2")

    tr_batch_generator  = My_Custom_Generator(train_input, train_labels, positives, negatives, "DLOB2", batch_size)
    val_batch_generator = My_Custom_Generator(val_input, val_labels, positives_val, negatives_val, "DLOB2", batch_size)

    t_steps = min(len(train_input)//batch_size, len(positives)//(batch_size//2),2000)
    # The original min(max(.., 750), max(.., 750), 750) always evaluated to 750.
    # NOTE(review): confirm a fixed 750 validation steps is what was intended.
    t_steps_val = 750

    print("-----------------------")
    print("MODEL " + str(i))
    print("-----------------------")
    model.fit(tr_batch_generator,
                       steps_per_epoch = t_steps,
                       epochs = 200,
                       verbose = 1,
                       validation_data = val_batch_generator,
                       validation_steps = t_steps_val,
                       callbacks = callb)

# 3.- VGG16

In [8]:
def VGG(X,Y,CH):
    """Build and compile a classifier on top of a frozen ImageNet VGG16 base.

    Two small trainable convolutions first map the ``CH``-channel input to the
    3 channels VGG16 expects; the VGG16 features are then global-max-pooled and
    passed through two dense layers with dropout.

    Parameters
    ----------
    X, Y : int
        Size of the two spatial axes of the input.
    CH : int
        Number of input channels.

    Returns
    -------
    tensorflow.keras.Model
        Model with a 2-way softmax output, compiled with Adam and categorical
        cross-entropy, tracking accuracy plus precision/recall of class 1.
    """
    vg = VGG16(include_top=False, weights='imagenet')
    # NOTE(review): in tf.keras 2.x `Model.layers` appears to return a fresh
    # list, so this pop likely leaves the network unchanged -- confirm before
    # relying on it to remove the last layer.
    vg.layers.pop()
    vg.trainable = False  # keep the pretrained weights fixed

    dense_input = tf.keras.layers.Input(shape=(X, Y, CH))

    # Trainable adapter: CH channels -> 5 -> 3 channels.
    dense_filter = tf.keras.layers.Conv2D(5, 3, padding='same')(dense_input)
    dense_filter = keras.layers.LeakyReLU(alpha=0.01)(dense_filter)

    dense_filter = tf.keras.layers.Conv2D(3, 3, padding='same')(dense_filter)
    dense_filter = keras.layers.LeakyReLU(alpha=0.01)(dense_filter)

    baseline = vg(dense_filter)

    # Classification head.
    x = GlobalMaxPooling2D()(baseline)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(2, activation='softmax')(x)

    model = Model(inputs=dense_input, outputs=predictions)

    # `learning_rate` replaces the deprecated `lr`; `epsilon=None`, `decay=0.0`
    # and `amsgrad=False` were dropped because they only restated the defaults.
    opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy',
                                                                           tf.keras.metrics.Precision(name = "Precision", class_id=1),
                                                                           tf.keras.metrics.Recall(name = "Recall", class_id=1)])

    return model

In [9]:
# Build the VGG16-based model for 500 x 42 inputs with 7 channels, then print
# its layer table.
model = VGG(X=500, Y=42, CH=7)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 500, 42, 7)]      0         
_________________________________________________________________
conv2d (Conv2D)              (None, 500, 42, 5)        320       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 500, 42, 5)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 500, 42, 3)        138       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 500, 42, 3)        0         
_________________________________________________________________
vgg16 (Model)                multiple                  14714688  
_________________________________________________________________
global_max_pooling2d (Global (None, 512)               0     

In [None]:
# Train four VGG16-based models (labelled 10, 20, 30, 40). Model 10*k is
# trained on files 2 .. 10*k - 1 and validated on file 10*k (plus later files
# if needed).
batch_size = 12

train_input = None
train_labels = None
next_file = 2  # first file index not yet part of the training arrays

for k in range(1,5):
    # Only load the files added since the previous k. The original reloaded
    # everything from index 2 on every pass, producing the same arrays.
    for i in range(next_file,10*k):
        train_input_p,train_labels_p, val_input, val_labels = read_data(i)
        if train_input is None:
            train_input = train_input_p
            train_labels = train_labels_p
        else:
            train_input = np.append(train_input,train_input_p, axis = 0)
            train_labels = np.append(train_labels,train_labels_p)
    next_file = 10*k

    # Only indices with a full 500-step history can end a window.
    positives = np.where(train_labels[500:] == 1)[0] + 500
    negatives = np.where(train_labels[500:] == 0)[0] + 500
    positives_val = np.where(val_labels[500:] == 1)[0] + 500
    negatives_val = np.where(val_labels[500:] == 0)[0] + 500

    # Extend the validation set with following files until it holds enough
    # positive samples for half a batch. `i` is the last training file index.
    j = 1
    while len(positives_val) < batch_size // 2:
        if i+j>=44:
            print("NOT ENOUGH DATA TO FINISH")
            # Previously execution continued and tried to load files past the
            # last index. Stop extending instead.
            break
        _next_input, _next_labels, val_input_p, val_labels_p = read_data(i + j)
        val_input = np.append(val_input,val_input_p, axis = 0)
        val_labels = np.append(val_labels,val_labels_p)

        positives_val = np.where(val_labels[500:] == 1)[0] + 500
        negatives_val = np.where(val_labels[500:] == 0)[0] + 500

        j += 1

    model = VGG(500, 42, 7)

    callb = calculate_callback(10*k,train_input, positives, negatives, val_input, val_labels, positives_val, negatives_val, "VGG4")

    tr_batch_generator  = My_Custom_Generator(train_input, train_labels, positives, negatives, "VGG4", batch_size)
    val_batch_generator = My_Custom_Generator(val_input, val_labels, positives_val, negatives_val, "VGG4", batch_size)

    t_steps = min(len(train_input)//batch_size, len(positives)//(batch_size//2),1000)
    t_steps_val = min(len(val_input)//batch_size, len(positives_val)//(batch_size//2),500)
    print("----------------------")
    print("MODEL " + str(10*k))
    print("----------------------")
    model.fit(tr_batch_generator,
                       steps_per_epoch = t_steps,
                       epochs = 200,
                       verbose = 1,
                       validation_data = val_batch_generator,
                       validation_steps = t_steps_val,
                       callbacks = callb)