In [1]:
# For data processing
import numpy as np
import pandas as pd

# Load pickle file
import pickle

# LSTM and other layers
import tensorflow as tf

In [2]:
# Load the pre-built `datasets` object from its pickle file.
# Presumably a collection of price arrays shaped (n_stocks, T_study, 2) with
# close prices at [..., 0] and open prices at [..., 1] -- that is how the
# feature functions below index each element; confirm against the producer.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load files that were produced by this project / a trusted source.
with open('../datasets/datasets-list', 'rb') as file:
    datasets = pickle.load(file)

### Feature generation for LSTM

We feed the model sequences of 240 timesteps with 3 features each and train it to predict the direction of the $241^{st}$ intraday return.

More precisely, for each stock $s$ at time $t$, we first consider the following three features $ir^{(s)}_{t, 1}, cr^{(s)}_{t, 1}, or^{(s)}_{t, 1}$ defined above.

Then we apply the Robust Scaler Standardization:

$\tilde f^{(s)}_{t, 1} := \dfrac {f^{(s)}_{t,1} - Q_2(f^{(s)}_{.,1})} {Q_3(f^{(s)}_{.,1}) - Q_1(f^{(s)}_{.,1})}$

where $Q_1(f^{(s)}_{.,1}), Q_2(f^{(s)}_{.,1})$ and $Q_3(f^{(s)}_{.,1})$ are the first, second and third quartiles of $f^{(s)}_{.,1}$, for each feature $f^{(s)}_{.,1} \in \{ ir^{(s)}_{., 1}, cr^{(s)}_{., 1}, or^{(s)}_{., 1} \}$ in the respective training period.

The Robust Scaler Standardization first subtracts (and hence removes) the median and then scales the data using the inter-quartile range, making it robust to outliers.

Next, for each time $t \in \{ 240, 241, ..., T_{study} \}$, we generate overlapping sequences of 240 consecutive, three-dimensional standardized features $\{ \tilde F^{(s)}_{t-239,1}, \tilde F^{(s)}_{t-238,1}, ..., \tilde F^{(s)}_{t,1} \}$, where $\tilde F^{(s)}_{t-i,1} := (\tilde ir^{(s)}_{t-i,1}, \tilde cr^{(s)}_{t-i,1}, \tilde or^{(s)}_{t-i,1}), i \in \{ 239, 238, ..., 0 \}$.

In [3]:
def calc_ir_cr_or(curr_dataset):
    """Compute the three one-period return features for every stock.

    For each time index ``t`` (0-based, along axis 1):

    * ``ir[t] = cp[t-1] / op[t-1] - 1``  (intraday return of the previous day)
    * ``cr[t] = cp[t-1] / cp[t-2] - 1``  (previous close-to-close return)
    * ``or[t] = op[t]   / cp[t-1] - 1``  (overnight return into day ``t``)

    Parameters
    ----------
    curr_dataset : array-like, shape (n_stocks, T_study, >=2)
        ``[..., 0]`` is read as the close price ``cp`` and ``[..., 1]`` as the
        open price ``op``. Integer input is converted to float so that the
        division below cannot fail on an integer output buffer.

    Returns
    -------
    list of np.ndarray
        ``[ir, cr, or]``, each of shape (n_stocks, T_study). Positions with no
        history are NaN: column 0 for ``ir`` and ``or``, columns 0 and 1 for
        ``cr``.

    Notes
    -----
    Where the divisor is 0 the ratio is set to 0, so the resulting return is
    -1. NOTE(review): confirm that -1 (rather than NaN) is the intended value
    for missing/zero prices.
    """
    prices = np.asarray(curr_dataset)
    if not np.issubdtype(prices.dtype, np.floating):
        # np.divide(..., out=<int array>) raises a casting error, so work in float.
        prices = prices.astype(float)

    # Separate close and open prices: both are (n_stocks, T_study).
    cp = prices[:, :, 0]
    op = prices[:, :, 1]

    def _ratio_minus_one(numer, denom):
        # Element-wise numer / denom - 1; the ratio is 0 wherever denom == 0.
        ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)
        return ratio - 1

    # Containers start as all-NaN; only positions with enough history are filled.
    ir_t_1 = np.full(cp.shape, np.nan)
    cr_t_1 = np.full(cp.shape, np.nan)
    or_t_1 = np.full(cp.shape, np.nan)

    # Slices (instead of explicit index arrays) also keep very short series
    # (T_study < 3) from raising IndexError; they simply stay NaN.
    ir_t_1[:, 1:] = _ratio_minus_one(cp[:, :-1], op[:, :-1])    # first value at t=1
    cr_t_1[:, 2:] = _ratio_minus_one(cp[:, 1:-1], cp[:, :-2])   # first value at t=2
    or_t_1[:, 1:] = _ratio_minus_one(op[:, 1:], cp[:, :-1])     # first value at t=1

    return [ir_t_1, cr_t_1, or_t_1]

In [4]:
def robust_scaler(fs):
    """Robust-scale each row of ``fs`` by its median and inter-quartile range.

    Every row (one stock) is transformed as ``(x - Q2) / (Q3 - Q1)``, where the
    quartiles are taken over that row's columns ``1:``.

    Parameters
    ----------
    fs : array-like, shape (n_stocks, T)
        Feature matrix. Column 0 is always excluded (it holds no valid value
        for the return features), and any further NaN entries -- e.g. column 1
        of the ``cr`` feature -- are ignored when computing the quartiles.

    Returns
    -------
    np.ndarray, shape (n_stocks, T)
        Scaled values. Column 0 and every position that was NaN in ``fs`` are
        NaN. Rows whose inter-quartile range is 0 are scaled to 0.

    Notes
    -----
    NOTE(review): the quartiles use every column that is passed in. If the
    scaling must be fitted on the training period only, slice ``fs``
    accordingly before calling this function.
    """
    fs = np.asarray(fs)
    if not np.issubdtype(fs.dtype, np.floating):
        # NaN cannot be stored in integer arrays.
        fs = fs.astype(float)

    scaled_full = np.full(fs.shape, np.nan, dtype=fs.dtype)

    valid = fs[:, 1:]  # drop column 0, which carries no usable value
    if valid.shape[1] == 0:
        return scaled_full

    # nanpercentile (not percentile): a single NaN in a row would otherwise turn
    # all three quartiles, and hence the whole scaled row, into NaN.
    q1, q2, q3 = np.nanpercentile(valid, [25, 50, 75], axis=1, keepdims=True)
    iqr = q3 - q1

    centered = valid - q2
    scaled = np.divide(centered, iqr, out=np.zeros_like(centered), where=iqr != 0)
    # Where iqr == 0 the division is skipped and `out` keeps 0; restore NaN for
    # entries that were missing in the input so they are not mistaken for data.
    scaled[np.isnan(valid)] = np.nan

    scaled_full[:, 1:] = scaled
    return scaled_full

In [5]:
def generate_features_lstm(dataset, n_timesteps=240):
    """Build the overlapping LSTM input windows for one dataset.

    The three return features from ``calc_ir_cr_or`` are each passed through
    ``robust_scaler`` and then cut into sliding windows of ``n_timesteps + 1``
    consecutive time indices: ``n_timesteps`` input steps followed by the step
    that serves as the target.

    Parameters
    ----------
    dataset : array-like, shape (n_stocks, T_study, >=2)
        Price array as expected by ``calc_ir_cr_or``.
    n_timesteps : int, default 240
        Number of input timesteps per window.

    Returns
    -------
    list of np.ndarray
        One array per ``t`` in ``n_timesteps + 1 .. T_study - 1``, each of
        shape (n_stocks, n_timesteps + 1, 3) covering time indices
        ``t - n_timesteps .. t`` with the last axis ordered (ir, cr, or).
        The list is empty when ``T_study <= n_timesteps + 1``.

        The arrays are *views* into one shared (n_stocks, T_study, 3) array, so
        building the list does not copy the data once per window. Copy a window
        before modifying it in place.

    Raises
    ------
    ValueError
        If ``n_timesteps`` is negative.
    """
    if n_timesteps < 0:
        raise ValueError("n_timesteps must be non-negative")

    # ir, cr and or, each of shape (n_stocks, T_study)
    ir_t1, cr_t1, or_t1 = calc_ir_cr_or(dataset)

    # Put the feature axis last with np.stack. Reshaping a
    # (3, n_stocks, window) array to (n_stocks, window, 3) would only
    # reinterpret the flat buffer and mix up stocks, timesteps and features.
    scaled = np.stack(
        [robust_scaler(ir_t1), robust_scaler(cr_t1), robust_scaler(or_t1)],
        axis=-1,
    )  # (n_stocks, T_study, 3)

    T_study = scaled.shape[1]

    # The first window starts at time index 1 because index 0 holds no valid
    # feature value:
    #   t = n_timesteps + 1 -> indices 1 .. n_timesteps + 1
    #   t = n_timesteps + 2 -> indices 2 .. n_timesteps + 2
    #   ...
    # The last index of every window (time t) is the target step.
    return [
        scaled[:, t - n_timesteps : t + 1, :]
        for t in range(n_timesteps + 1, T_study)
    ]

In [None]:
# Build one feature array per dataset.
# MEMORY PROBLEM: np.array(...) materializes every overlapping window, i.e.
# (number of windows, n_stocks, 241, 3) values per dataset -- roughly 241 times
# the size of the scaled feature matrix. Consider feeding windows to the model
# lazily (e.g. a generator / tf.data pipeline) instead of stacking them all.
feature_container = []
for dataset in datasets:
    feature_container.append(np.array(generate_features_lstm(dataset)))