<b>Agenda</b>
1) Using pipelines
<br>
2) Construct plots
<br>
3) Improve performance
<br>
4) Push code into Github
<br>
<br>
<b>Long-term goals</b>
1) Loading data in batches
<br>
2) Parallel processing (perhaps using Apache Spark)

In [1]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np
import math
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
dataset_df = pd.read_csv('train.csv')

# 1) View summary of the data

In [3]:
print(f'Shape of the training data = ', dataset_df.shape)

Shape of the training data =  (5237980, 17)


In [4]:
print('First 5 rows: -')
dataset_df.head()

First 5 rows: -


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [5]:
print('Description of the data: -')
dataset_df.describe()

Description of the data: -


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id
count,5237980.0,5237980.0,5237980.0,5237760.0,5237980.0,5237760.0,5237760.0,2343638.0,2380800.0,5237760.0,5237980.0,5237760.0,5237980.0,5237760.0,5237892.0,5237980.0
mean,99.28856,241.51,270.0,5715293.0,-0.01189619,0.9999955,45100250.0,1.001713,0.9996601,0.9997263,51813.59,1.000264,53575.68,0.999992,-0.04756125,13310.05
std,57.87176,138.5319,158.7451,20515910.0,0.8853374,0.002532497,139841300.0,0.7214705,0.0121692,0.002499345,111421.4,0.002510042,129355.4,0.002497509,9.45286,7619.271
min,0.0,0.0,0.0,0.0,-1.0,0.935285,4316.61,7.7e-05,0.786988,0.934915,0.0,0.939827,0.0,0.938008,-385.2898,0.0
25%,49.0,122.0,130.0,84534.15,-1.0,0.998763,5279575.0,0.996332,0.9971,0.998529,7374.72,0.999029,7823.7,0.998781,-4.559755,6729.0
50%,99.0,242.0,270.0,1113604.0,0.0,0.999967,12882640.0,0.999883,0.999889,0.999728,21969.0,1.000207,23017.92,0.999997,-0.06020069,13345.0
75%,149.0,361.0,410.0,4190951.0,1.0,1.001174,32700130.0,1.003318,1.00259,1.000905,55831.68,1.001414,57878.41,1.001149,4.409552,19907.0
max,199.0,480.0,540.0,2982028000.0,1.0,1.077488,7713682000.0,437.9531,1.309732,1.077488,30287840.0,1.077836,54405000.0,1.077675,446.0704,26454.0


# 2) Handling missing values

## 2.1) Determine missingness in the dataframe

In [6]:
print("List of missing values for various columns of the dataframe: -")
num_missing_vals_series = (dataset_df.isnull().sum(axis = 0)/dataset_df.shape[0]) * 100
num_missing_vals_series

List of missing values for various columns of the dataframe: -


stock_id                    0.000000
date_id                     0.000000
seconds_in_bucket           0.000000
imbalance_size              0.004200
imbalance_buy_sell_flag     0.000000
reference_price             0.004200
matched_size                0.004200
far_price                  55.256836
near_price                 54.547364
bid_price                   0.004200
bid_size                    0.000000
ask_price                   0.004200
ask_size                    0.000000
wap                         0.004200
target                      0.001680
time_id                     0.000000
row_id                      0.000000
dtype: float64

In [7]:
index = 0

for feature in num_missing_vals_series.index:
    
    if (num_missing_vals_series[feature] > 0) and (num_missing_vals_series[feature] < 1):
        print(f'Feature = {feature}')
        print(f'Percentage of missing values = {num_missing_vals_series[feature]}')
        print(f'Dropping missing values for {feature}')
        dataset_df.dropna(subset = feature, inplace = True)
        print('\n')
    
    index += 1

Feature = imbalance_size
Percentage of missing values = 0.004200092402032845
Dropping missing values for imbalance_size


Feature = reference_price
Percentage of missing values = 0.004200092402032845
Dropping missing values for reference_price


Feature = matched_size
Percentage of missing values = 0.004200092402032845
Dropping missing values for matched_size


Feature = bid_price
Percentage of missing values = 0.004200092402032845
Dropping missing values for bid_price


Feature = ask_price
Percentage of missing values = 0.004200092402032845
Dropping missing values for ask_price


Feature = wap
Percentage of missing values = 0.004200092402032845
Dropping missing values for wap


Feature = target
Percentage of missing values = 0.0016800369608131378
Dropping missing values for target




In [8]:
num_missing_vals_series = (dataset_df.isnull().sum(axis = 0)/dataset_df.shape[0]) * 100
num_missing_vals_series

stock_id                    0.000000
date_id                     0.000000
seconds_in_bucket           0.000000
imbalance_size              0.000000
imbalance_buy_sell_flag     0.000000
reference_price             0.000000
matched_size                0.000000
far_price                  55.254956
near_price                 54.545455
bid_price                   0.000000
bid_size                    0.000000
ask_price                   0.000000
ask_size                    0.000000
wap                         0.000000
target                      0.000000
time_id                     0.000000
row_id                      0.000000
dtype: float64

In [9]:
dataset_df.fillna(0, inplace = True)

In [10]:
num_missing_vals_series = (dataset_df.isnull().sum(axis = 0)/dataset_df.shape[0]) * 100
num_missing_vals_series

stock_id                   0.0
date_id                    0.0
seconds_in_bucket          0.0
imbalance_size             0.0
imbalance_buy_sell_flag    0.0
reference_price            0.0
matched_size               0.0
far_price                  0.0
near_price                 0.0
bid_price                  0.0
bid_size                   0.0
ask_price                  0.0
ask_size                   0.0
wap                        0.0
target                     0.0
time_id                    0.0
row_id                     0.0
dtype: float64

# 3) Reshaping and preparing data

In [11]:
dataset_reshaped_df = dataset_df
dataset_reshaped_df.sort_values(by = ['stock_id', 'date_id', 'seconds_in_bucket'], inplace = True)

In [12]:
def compute_features(df):
    
    df['imbalance_size'] = df['imbalance_size'] * df['imbalance_buy_sell_flag']

def compute_rollover_features(df):
    
    df['prev_target'] = df.groupby('stock_id')['target'].shift(1)
    df.fillna(0, inplace = True)
    
    return df

def standardize_features(df, mu_dict=None, sigma_dict=None):
    
    if (mu_dict is None) and (sigma_dict is None):
        
        mu_dict = {}
        sigma_dict = {}
        
        mu_dict['imbalance_size'] = [0]*200
        mu_dict['reference_price'] = [0]*200
        mu_dict['matched_size'] = [0]*200
        mu_dict['far_price'] = [0]*200
        mu_dict['near_price'] = [0]*200
        mu_dict['bid_price'] = [0]*200
        mu_dict['bid_size'] = [0]*200
        mu_dict['ask_price'] = [0]*200
        mu_dict['ask_size'] = [0]*200
        mu_dict['wap'] = [0]*200
        mu_dict['prev_target'] = [0]*200
        
        sigma_dict['imbalance_size'] = [0]*200
        sigma_dict['reference_price'] = [0]*200
        sigma_dict['matched_size'] = [0]*200
        sigma_dict['far_price'] = [0]*200
        sigma_dict['near_price'] = [0]*200
        sigma_dict['bid_price'] = [0]*200
        sigma_dict['bid_size'] = [0]*200
        sigma_dict['ask_price'] = [0]*200
        sigma_dict['ask_size'] = [0]*200
        sigma_dict['wap'] = [0]*200
        sigma_dict['prev_target'] = [0]*200
        
        for stock_id in range(df['stock_id'].max()+1):
            
            mu_dict['imbalance_size'][stock_id] = df[df['stock_id'] == stock_id]['imbalance_size'].mean()
            mu_dict['reference_price'][stock_id] = df[df['stock_id'] == stock_id]['reference_price'].mean()
            mu_dict['matched_size'][stock_id] = df[df['stock_id'] == stock_id]['matched_size'].mean()
            mu_dict['far_price'][stock_id] = df[df['stock_id'] == stock_id]['far_price'].mean()
            mu_dict['near_price'][stock_id] = df[df['stock_id'] == stock_id]['near_price'].mean()
            mu_dict['bid_price'][stock_id] = df[df['stock_id'] == stock_id]['bid_price'].mean()
            mu_dict['bid_size'][stock_id] = df[df['stock_id'] == stock_id]['bid_size'].mean()
            mu_dict['ask_price'][stock_id] = df[df['stock_id'] == stock_id]['ask_price'].mean()
            mu_dict['ask_size'][stock_id] = df[df['stock_id'] == stock_id]['ask_size'].mean()
            mu_dict['wap'][stock_id] = df[df['stock_id'] == stock_id]['wap'].mean()
            mu_dict['prev_target'][stock_id] = df[df['stock_id'] == stock_id]['prev_target'].mean()
            
            sigma_dict['imbalance_size'][stock_id] = df[df['stock_id'] == stock_id]['imbalance_size'].std()
            sigma_dict['reference_price'][stock_id] = df[df['stock_id'] == stock_id]['reference_price'].std()
            sigma_dict['matched_size'][stock_id] = df[df['stock_id'] == stock_id]['matched_size'].std()
            sigma_dict['far_price'][stock_id] = df[df['stock_id'] == stock_id]['far_price'].std()
            sigma_dict['near_price'][stock_id] = df[df['stock_id'] == stock_id]['near_price'].std()
            sigma_dict['bid_price'][stock_id] = df[df['stock_id'] == stock_id]['bid_price'].std()
            sigma_dict['bid_size'][stock_id] = df[df['stock_id'] == stock_id]['bid_size'].std()
            sigma_dict['ask_price'][stock_id] = df[df['stock_id'] == stock_id]['ask_price'].std()
            sigma_dict['ask_size'][stock_id] = df[df['stock_id'] == stock_id]['ask_size'].std()
            sigma_dict['wap'][stock_id] = df[df['stock_id'] == stock_id]['wap'].std()
            sigma_dict['prev_target'][stock_id] = df[df['stock_id'] == stock_id]['prev_target'].std()
    
    df['imbalance_size'] = (df['imbalance_size'] - df['stock_id'].map(lambda x: mu_dict['imbalance_size'][x]))/df['stock_id'].map(lambda x: sigma_dict['imbalance_size'][x])
    df['reference_price'] = (df['reference_price'] - df['stock_id'].map(lambda x: mu_dict['reference_price'][x]))/df['stock_id'].map(lambda x: sigma_dict['reference_price'][x])
    df['matched_size'] = (df['matched_size'] - df['stock_id'].map(lambda x: mu_dict['matched_size'][x]))/df['stock_id'].map(lambda x: sigma_dict['matched_size'][x])
    df['far_price'] = (df['far_price'] - df['stock_id'].map(lambda x: mu_dict['far_price'][x]))/df['stock_id'].map(lambda x: sigma_dict['far_price'][x])
    df['near_price'] = (df['near_price'] - df['stock_id'].map(lambda x: mu_dict['near_price'][x]))/df['stock_id'].map(lambda x: sigma_dict['near_price'][x])
    df['bid_price'] = (df['bid_price'] - df['stock_id'].map(lambda x: mu_dict['bid_price'][x]))/df['stock_id'].map(lambda x: sigma_dict['bid_price'][x])
    df['bid_size'] = (df['bid_size'] - df['stock_id'].map(lambda x: mu_dict['bid_size'][x]))/df['stock_id'].map(lambda x: sigma_dict['bid_size'][x])
    df['ask_price'] = (df['ask_price'] - df['stock_id'].map(lambda x: mu_dict['ask_price'][x]))/df['stock_id'].map(lambda x: sigma_dict['ask_price'][x])
    df['ask_size'] = (df['ask_size'] - df['stock_id'].map(lambda x: mu_dict['ask_size'][x]))/df['stock_id'].map(lambda x: sigma_dict['ask_size'][x])    
    df['wap'] = (df['wap'] - df['stock_id'].map(lambda x: mu_dict['wap'][x]))/df['stock_id'].map(lambda x: sigma_dict['wap'][x])        
        
    df['prev_target'] = (df['prev_target'] - df['stock_id'].map(lambda x: mu_dict['prev_target'][x]))/df['stock_id'].map(lambda x: sigma_dict['prev_target'][x])
    
    return (mu_dict, sigma_dict)
   
def drop_features(df):
    
    #Drop the following features.
    df.drop(['row_id', 'imbalance_buy_sell_flag'], axis = 1, inplace = True)
    
    if 'currently_scored' in df.columns:
        df.drop(['currently_scored'], axis = 1, inplace = True)

In [13]:
#Recompute some features.
compute_features(dataset_reshaped_df)

#Compute roll-over features.
dataset_reshaped_df = compute_rollover_features(dataset_reshaped_df)

#Standardize features.
mu_dict, sigma_dict = standardize_features(dataset_reshaped_df)

#Drop features from the dataset.
drop_features(dataset_reshaped_df)

In [14]:
dataset_reshaped_df[dataset_reshaped_df['stock_id'] == 0].head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,prev_target
0,0,0,0,0.420313,-0.013518,-0.587909,-0.906572,-0.912704,0.045702,0.326687,0.042624,-0.517844,0.09403,-3.029704,0,0.043116
191,0,0,10,0.218021,0.113973,-0.498553,-0.906572,-0.912704,0.045702,-0.29275,0.042624,-0.257284,0.029652,0.389814,1,-0.457311
382,0,0,20,0.218021,0.050228,-0.498553,-0.906572,-0.912704,0.045702,-0.416637,-0.021117,-0.45475,-0.000153,4.220009,2,0.107503
573,0,0,30,0.218021,0.177718,-0.498553,-0.906572,-0.912704,0.173495,0.264903,0.106365,0.13607,0.144697,5.450249,3,0.740151
764,0,0,40,0.209248,0.36955,-0.494678,-0.906572,-0.912704,0.301885,-0.283994,0.298183,-0.203679,0.282991,3.169775,4,0.943354


In [15]:
train_df = dataset_reshaped_df[dataset_reshaped_df['date_id'] < 478]
val_df = dataset_reshaped_df[dataset_reshaped_df['date_id'] >= 478]

In [16]:
def get_training_data(stock_id, test_val_data_dates, threshold = 0.9):
    
    #Get all training data associated with the stock_id.
    stock_train_df = train_df[train_df['stock_id'] == stock_id].copy()
    
    #Drop columns not relevant for the cosine similarity and convert the result to arrays.
    train_curr_date_array = stock_train_df.drop(['stock_id', 'target', 'date_id', 'time_id', 'seconds_in_bucket'], axis = 1).values
    train_target_date_array = (stock_train_df['target'].values).reshape(-1, 1)
    
    indices_of_closest_arrays = None
    
    for test_val_data_date in test_val_data_dates:
        
        #print(train_curr_date_array.shape, test_val_data_date.shape)
    
        #Compute the cosine similarity between the single row of test data and the all the rows of training data.
        cos_sim = cosine_similarity(train_curr_date_array, test_val_data_date.reshape(1, -1))
        
        if indices_of_closest_arrays is None:
            #Determine the indices of training data with higher cosine similarity than the threshold.
            indices_of_closest_arrays = np.where(cos_sim > threshold)[0]
        
        else:
            indices_of_closest_arrays = np.concatenate((indices_of_closest_arrays, np.where(cos_sim > threshold)[0]))
    
    indices_of_closest_arrays = np.unique(indices_of_closest_arrays)
    indices_of_closest_arrays = np.sort(indices_of_closest_arrays)
    
    #Get training rows corressponding to the indices determined earlier.
    ret_train_arr = train_curr_date_array[indices_of_closest_arrays]
    ret_train_arr = ret_train_arr.reshape(1, ret_train_arr.shape[0], ret_train_arr.shape[1])
    
    #Get target values for the training rows determined earlier.
    ret_train_tar_arr = train_target_date_array[indices_of_closest_arrays]
    ret_train_tar_arr = ret_train_tar_arr.reshape(1, ret_train_tar_arr.shape[0], ret_train_tar_arr.shape[1])
    
    return ret_train_arr, ret_train_tar_arr

#& (val_df['seconds_in_bucket'] == 0)
tr_array = get_training_data(2, val_df[(val_df['stock_id'] == 2) & (val_df['date_id'] == 478)].drop(['stock_id', 'target', 'date_id', 'time_id', 'seconds_in_bucket', 'target'], axis = 1).values, threshold = 0.9)
print(tr_array[0].shape, tr_array[1].shape)

(1, 7746, 11) (1, 7746, 1)


In [17]:
def get_valid_test_data(stock_id, seq_len = 100):
    
    test_val_df = val_df[val_df['stock_id'] == stock_id].copy()
    test_val_df.drop(['stock_id', 'date_id', 'time_id', 'seconds_in_bucket'], inplace = True, axis = 1)
    
    val_target_array = None
    
    if 'target' in test_val_df.columns:
        val_target_array = (test_val_df['target'].values).reshape(-1, 1)
        test_val_df.drop(['target'], inplace = True, axis = 1)
    
    val_array = test_val_df.values
    val_target_array = val_target_array
    
    return val_array, val_target_array

tr_array = get_valid_test_data(0, seq_len = 2000)
print(tr_array[0].shape, tr_array[0].shape)

(165, 11) (165, 11)


# 4) Creating models specific to each stock ID

In [18]:
#Function declarations
def step_decay(epoch, learning_rate):
    # initialize the base initial learning rate, drop factor, and epochs to drop every
    init_lr = 1
    factor = 0.9
    drop_every = 6
    # compute learning rate for the current epoch
    learning_rate = init_lr*(factor ** (np.floor(epoch) / drop_every))
    return learning_rate

In [19]:
#Input dimension - 1 has been added because 'PREV_TARGET field is introduced much later'
INPUT_SIZE = 11
print(f'Input size = {INPUT_SIZE}')

#Declaring a dictionary of models - a model for each stock ID.
models_dict = {}

Input size = 11


In [20]:
def train_models(threshold):
    
    early_checkpoint = EarlyStopping(patience=2, monitor='mae', mode='min')

    seed_value = 42
    tf.random.set_seed(seed_value)

    #Construct train/validation sets separately for all stock IDs.
    for curr_stock_id in dataset_reshaped_df['stock_id'].unique():
        
        print(f'Training model for stock ID {curr_stock_id}')
        
        curr_train_mae = 0
        curr_val_mae = 0
        
        #Get validation data (numpy array) for the current stock_id.
        curr_stock_val_data = get_valid_test_data(curr_stock_id, INPUT_SIZE)
        
        #Construct a GRU model for the current stock ID.
        model = tf.keras.models.Sequential([
            tf.keras.layers.GRU(8, return_sequences=True, input_shape=(None, INPUT_SIZE)),
            tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='linear'))])
        
        #Compile the model.
        lr_scheduler = keras.callbacks.LearningRateScheduler(step_decay)
        opt = tf.keras.optimizers.Adam(learning_rate=0.01)
        model.compile(loss="mean_absolute_error", optimizer=opt, metrics=["mae"])
        
        #Get training data (numpy array) corressponding to the current test row.
        curr_test_row = val_df[val_df['stock_id'] == curr_stock_id].drop(['stock_id', 'target', 'date_id', 'time_id', 'seconds_in_bucket', 'target'], axis = 1).values
        curr_stock_train_data = get_training_data(curr_stock_id, curr_test_row, threshold = threshold)    
        
        if curr_stock_train_data[0].shape[1] == 0:
            curr_stock_train_data = get_training_data(curr_stock_id, curr_test_row, threshold = 0.8)
        elif curr_stock_train_data[0].shape[1] == 0:
            curr_stock_train_data = get_training_data(curr_stock_id, curr_test_row, threshold = 0.7)
        else:
            curr_stock_train_data = get_training_data(curr_stock_id, curr_test_row, threshold = 0.6)
        
        #Create training dataset.
        train_dataset = tf.data.Dataset.from_tensor_slices((curr_stock_train_data[0], curr_stock_train_data[1]))
        train_dataset = train_dataset.batch(500)
        
        #Create validation dataset.
        val_dataset = tf.data.Dataset.from_tensor_slices((curr_stock_val_data[0].reshape(1, 165, 11), curr_stock_val_data[1].reshape(1, 165, 1)))
        val_dataset = val_dataset.batch(500)
        
        #Train the model.
        history = model.fit(train_dataset, validation_data = val_dataset, epochs=20, callbacks=[early_checkpoint, lr_scheduler], verbose = 1)
            
        #Determine the cumulative training and validation MAEs across the different rows of the validation
        #data.
        curr_train_mae = history.history['mae'][-1]
        curr_val_mae = history.history['val_mae'][-1]
        
        print(f'Stock ID = {curr_stock_id}, Threshold = {threshold}, Training MAE = {curr_train_mae}, Validation MAE = {curr_val_mae}')
        
        if curr_stock_id not in models_dict.keys():
            models_dict[curr_stock_id] = [model, threshold, curr_train_mae, curr_val_mae]
        
        else:
            if models_dict[curr_stock_id][3] > curr_val_mae:
                models_dict[curr_stock_id][0] = model
                models_dict[curr_stock_id][1] = threshold   
                models_dict[curr_stock_id][2] = curr_train_mae
                models_dict[curr_stock_id][3] = curr_val_mae   

In [None]:
%%time

#threshold_list = [0.95, 0.9, 0.8, 0.7] 
threshold_list = [0.90]

for threshold in threshold_list:

    print('Cosine similarity threshold = ', threshold)
    train_models(threshold)

Cosine similarity threshold =  0.9
Training model for stock ID 0
Epoch 1/20
Epoch 2/20
Epoch 3/20


In [None]:
print('The MAEs of the models are: -')
count1 = 0
count2 = 0
for curr_stock_id in models_dict.keys():
    print(f'Stock ID = {curr_stock_id}, Cosine similarity threshold = {models_dict[curr_stock_id][1]}, Training MAE = {models_dict[curr_stock_id][2]}, Validation MAE = {models_dict[curr_stock_id][3]}')

    #Check the number of stocks predictions for which have an MAE of less than 5.
    if models_dict[curr_stock_id][3] < 4:
        count1 += 1

    if models_dict[curr_stock_id][3] < 5:
        count2 += 1

In [None]:
print('Number of stocks predictions for which have an MAE of less than 4 = ', count1)
print('Number of stocks predictions for which have an MAE of less than 5 = ', count2)