In [1]:
import os, sys, datetime, logging
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.express as px
import datetime
import calendar
from datetime import timedelta
import joblib
import itertools
import warnings

sys.path.append(os.getcwd().split('paper_std')[0])
warnings.filterwarnings('ignore')
if '../' not in sys.path:
    sys.path.append('../')

from util.s3_method import *
from util.load_s3_data import LoadS3Data
from util.time_method import *
from util.plot_method import easy_plot,timeline_sample_kv, get_plot_diff_data
from util.date import tz, pre_quarter_friday, next_quarter_friday
from util.hedge_log import initlog
from util.recover_depth import recoveryDepth
from util.statistic_method_v2 import describe_series
from dateutil.relativedelta import relativedelta
from util.Future_Load import *
from dateutil.relativedelta import relativedelta
from joblib import load
from tqdm import tqdm
from numpy.lib.stride_tricks import as_strided as stride
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

In [2]:
exchange = 'binance'
symbol = 'btc_usdt'
begin_time = datetime.datetime(2023, 12, 17, 1,tzinfo=TZ_8)
end_time = datetime.datetime(2023, 12, 17, 2,tzinfo=TZ_8)

ticker_data = LoadS3Data.get_cex_ticker(begin_time, end_time, symbol, exchange)
trade_data = LoadS3Data.get_cex_trade(begin_time, end_time, exchange, symbol)
depth_data = LoadS3Data.get_cex_depth_online(begin_time, end_time, exchange, symbol)

In [3]:
import pickle
with open('AllType_Data.pkl', 'wb') as f:
    pickle.dump({'ticker': ticker, 'trade': trade, 'depth': depth},f)

NameError: name 'ticker' is not defined

**Conclusion:**

1. The out-of-sample R-squared for 5-second returns is 10.5%, the direction of the next order is 64% accurate, and the out-of-sample R-squared for predicting the next 10 trade intervals is 9.8%. Important predictors of returns and direction are order book imbalance, recent transaction imbalance,and past trade returns, but the statistic obtained from recent trade volume is more effective for the prediction of duration
2. Transaction data is the most valuable, and tuning the reference and introducing other stocks brings a non-significant improvement in results
3. Predictable span is very short, only predictable before 3min, 2000 transactions, 500K volume

**First: Data Preparation**

The transaction and quote update data are merged based on their timestamps. The best bid and ask information of trades are determined by the most recent quote information as of the time of the transaction.

In [2]:
def get_data(start_date, end_date, exchange, symbol, plot_interval_us):
    trade = FutureData.thread_get(FutureData.get_cex_trade, start_date, end_date, symbol, exchange, plot_interval_us = plot_interval_us)
    ticker = FutureData.thread_get(FutureData.get_cex_ticker, start_date, end_date, symbol, exchange, plot_interval_us = plot_interval_us)
    trade = pd.DataFrame(trade)
    ticker = pd.DataFrame(ticker).rename(columns={'tp':'time','ap':'ask1','aa':'askqty1','bp':'bid1','ba':'bidqty1'})
    trade = trade[['T','p','q','m']].rename(columns = {'p':'price','q':'qty','T':'time','m':'is_buyer_maker'})
    return ticker[['time', 'ask1', 'askqty1', 'bid1', 'bidqty1']], trade

def merge_data(ticker,trade):
    trade = trade.sort_values('time').reset_index(drop = True)
    trade['transaction'] = 1
    trade['volume'] = trade['qty']
    trade[['transaction', 'volume']] = trade[['transaction', 'volume']].cumsum()
    data = pd.concat([ticker, trade], ignore_index = True)
    data = data.sort_values(['time', 'transaction']).reset_index(drop = True)
    del ticker, trade
    data[['ask1', 'bid1', 'price', 'transaction', 'volume', 'is_buyer_maker', 'askqty1', 'bidqty1', 'qty']] = \
        data[['ask1', 'bid1', 'price', 'transaction', 'volume', 'is_buyer_maker', 'askqty1', 'bidqty1', 'qty']].fillna(method = 'ffill')
    data = data.dropna(subset = ['ask1'])
    return data.reset_index(drop = True)


start_date = datetime.datetime(2024,2,5,0,tzinfo=TZ_8)
end_date = datetime.datetime(2024,2,5,1, tzinfo=TZ_8)
exchange, symbol ="binance", "btc_usdt"
plot_interval_us = 100000
ticker,trade = get_data(start_date, end_date, exchange, symbol, plot_interval_us)
# [ticker, trade] = load('data/btc.pkl')
# data = merge_data(ticker, trade)
# data.head()

**Second: Response Variables**

* **Part A:** Time clocks (Achieved)

The author uses three kinds of time clock, which are **calendar clock, transcation clock and volume clock** respectively. With a starting timestamp T, a span ∆ > 0 and a clock mode M ∈ {calendar, transaction, volume}, author deﬁnes the forward looking time interval as:
![Clock Definition](./fig/Jupyterfig/1.png)

* **Part B:** Transaction return (Achieved)

The author defines the average return during one transaction span as:
![Clock Definition](./fig/Jupyterfig/2.png)

* **Part C:** Price direction (Achieved)

In order to figure out whether next price movements will be up or down, author uses average past transaction return of the stock to illustrate:
![Clock Definition](./fig/Jupyterfig/3.png)

* **Part D:** Transaction duration

It may help market makers to make decisions(for example cancel quotes before hit) by measuring the amount of (calendar) time it takes to record transactions. Under M ∈ {transaction, volume}, author defines:
![Clock Definition](./fig/Jupyterfig/4.png)

**Third: Predictor Variables**

As the author didn't analyze problems base on point data, but interval data, he defines lookback spans for three calendar mode:
![Clock Definition](./fig/Jupyterfig/5.png)

* **Part A:** Volume and duration (Achieved)

![Clock Definition](./fig/Jupyterfig/6.png)

* **Part B:** Return and imbalance (Achieved)

![Clock Definition](./fig/Jupyterfig/7.png)

* **Part C:** Speed and cost (Achieved)

![Clock Definition](./fig/Jupyterfig/8.png)

So total varaibles are 117(13 * 9 lookback spans) * 3 (time clocks). Achieved 12 * 9 * 2.

In [3]:
def rolling_factor(df, rely_column, boundary):
    left_bond, right_bond = boundary
    start_index = len(df)-sum(
        df[rely_column]>df[rely_column].values[0]+right_bond)
    column_list = ['breadth', 'immediacy', 'volume_all', 'volume_avg','volume_max',
        'lambda', 'lob_imbalance', 'trn_imbalance', 'past_return',
        'auto_cov', 'quoted_spread', 'effective_spread']
    column_list = [f'{i}_{left_bond}_{right_bond}' for i in column_list] #/1000
    df[column_list] = np.nan
    for i in tqdm(range(start_index, len(df))):
        t0 = df.loc[i, rely_column]
        left_t0 = t0-right_bond
        right_t0 = t0-left_bond
        x = df[(df[rely_column]<=right_t0) & (df[rely_column]>=left_t0)]
        df.loc[i, column_list] = calculate_factor(x, boundary)
    return df

def calculate_factor(x, boundary):
    breadth = len(x)-1
    if breadth > 0:
        immediacy = (boundary[1] - boundary[0])/breadth
        volume_all = np.nansum(x.qty.values[1:])
        volume_avg = volume_all/breadth
        volume_max = np.nanmax(x.qty.values[1:])
        lambda_ = (np.nanmax(x.price.values[1:]) - np.nanmin(x.price.values[1:]))/volume_all
        lob_imbalance = np.nanmean(((x.askqty1-x.bidqty1)/(x.askqty1+x.bidqty1)).values[1:]) if (x.askqty1+x.bidqty1).values[-1] != 0 else 0
        x['direction'] = np.where(x['is_buyer_maker']=='BUY', 1, -1)
        txn_imbalance = np.nansum((x.direction*x.qty).values[1:])/volume_all
        past_return = 1 - np.nanmean(x.price.values[1:])/np.max(x.price.values[1:])
        x['arg_max'] = x['price'].cummax().shift(1)
        x['arg_arg_max'] = x['arg_max'].cummax()
        auto_cov = np.nanmean((np.log(x.price/x['arg_max'])*np.log(x['arg_max']/x['arg_arg_max'])).values[1:])
        quoted_spread = np.nanmean(((x.ask1-x.bid1)/x.price).values[1:])
        effective_spread = np.nansum((np.log(x['arg_max']/x.price)*x.direction*x.qty*x.price).values[1:])/np.nansum((x.qty\
                            *x.price).values[1:])
    else:
        immediacy = volume_all = volume_avg = volume_max = lambda_ = lob_imbalance = txn_imbalance = past_return =\
    auto_cov = quoted_spread = effective_spread = 0
    return breadth, immediacy, volume_all, volume_avg, volume_max, lambda_, lob_imbalance, txn_imbalance, past_return,\
    auto_cov, quoted_spread, effective_spread

def rolling_response(df, rely_column, boundary):
    left_bond, right_bond = boundary
    end_index = len(df)-sum(
        df[rely_column]>df[rely_column].values[-1]-right_bond)
    column_list = ['return', 'direction']
    column_list = [f'{i}_{right_bond}' for i in column_list]
    df[column_list] = np.nan
    for i in tqdm(range(1, end_index)):
        t0 = df.loc[i, rely_column]
        left_t0 = t0+left_bond
        right_t0 = t0+right_bond
        x = df[(df[rely_column]<=right_t0) & (df[rely_column]>=left_t0)]
        return_ = np.nanmean(x.price)
        df.loc[i, column_list[0]] = return_/df.loc[i-1, 'price']
    for i in tqdm(range(1, end_index)):
        t0 = df.loc[i, rely_column]
        left_t0 = t0-right_bond
        right_t0 = t0-left_bond
        x = df[(df[rely_column]<right_t0) & (df[rely_column]>=left_t0)]
        df.loc[i, column_list[1]] = 1 if df.loc[i, column_list[0]] > np.nanmean(x[column_list[0]]) else 0
    return df


In [4]:
time_look_back_window = [[0,1000], [1000,2000], [2000,4000], [4000, 8000], [8000, 16000], [16000, 32000], [32000, 64000], [64000, 128000], [128000, 256000]]
transaction_look_back_window = [[i[0]/1000, i[1]/1000] for i in time_look_back_window]
volume_look_back_window = [[i[0]/10000, i[1]/10000] for i in time_look_back_window]
time_response_window = [[0,5000], [0, 30000]]
transaction_response_window = [[0,10], [0, 200]]
volume_response_window = [[0,0.5], [0, 1]]


# time_data = merge_data(ticker, trade)
# for i in time_response_window:
#     time_data = rolling_response(time_data, 'time', i)
#     time_data.to_pickle('./data/time_data.pkl')
# for i in time_look_back_window:
#     time_data = rolling_factor(time_data, 'time', i)
#     time_data.to_pickle('./data/time_data.pkl')
# print('pickled')

# transaction_data = merge_data(ticker, trade)
# for i in transaction_response_window:
#     transaction_data = rolling_response(transaction_data, 'transaction', i)
#     transaction_data.to_pickle('./data/transaction_data.pkl')
# for i in transaction_look_back_window:
#     transaction_data = rolling_factor(transaction_data, 'transaction', i)
#     transaction_data.to_pickle('./data/transaction_data.pkl')
# print('pickled')

# volume_data = merge_data(ticker, trade)
# for i in volume_response_window:
#     volume_data = rolling_response(volume_data, 'volume', i)
#     volume_data.to_pickle('./data/volume_data.pkl')
# for i in volume_look_back_window:
#     volume_data = rolling_factor(volume_data, 'volume', i)
#     volume_data.to_pickle('./data/volume_data.pkl')
# print('pickled')

**Fourth: Prediction**

* **Part A:** Tuning, traing and testing (Achieved)

The length of each training window is set at 5 trading days. The outer layer of rolling window consists of 40 trading days where the ﬁrst 20 days are used only for tuning hyper-parameters while the next 20 days are used for testing.
![Clock Definition](./fig/Jupyterfig/9.png)

* **Part B:** Prediction results (Partly achieved)
1. Return predict result

Every box plot represents out of sample R suquare:① RF performs better than LASSO; ② prediction becomes worse while horizon grows; ③ TxnImbalance and PastReturn play an important role in prediction 5 seconds return.
![Clock Definition](./fig/Jupyterfig/10.png)
![Clock Definition](./fig/Jupyterfig/11.png)

2. Price direction predict result

① More robust result; ② accuracy 64% for short horizon.
![Clock Definition](./fig/Jupyterfig/12.png)

3. Transaction duration prediction
① Higher accuracy when predicting longer horizon; ② volume related features become more important; ③ more accurate than return.
![Clock Definition](./fig/Jupyterfig/13.png)
![Clock Definition](./fig/Jupyterfig/14.png)

* **Part C:** Prediction consistency over time
1. Results are consistent across the sample;
2. Predictability measures are significantly positive;
3. Volatility increasement(Covid-19) results in a slight decrease in returns and more valatility in duration.
![Clock Definition](./fig/Jupyterfig/15.png)

In [14]:
# prediction evaluation
def prediction_accuracy_rsquare(y, y_hat):
    return 1-np.sum((y-y_hat)**2)/np.sum((y-np.mean(y))**2)

def prediction_accuracy_rmse(y, y_hat):
    return 1-np.mean((y-y_hat)**2)

def prediction_accuracy_direction(y, y_hat):
    return np.mean(np.where(y_hat*y>0, 1, 0))

def tun_train_test(batch, x_columns, y_columns, tun_time = 3):
    '''tun_time = train tun_time times in one tuning_period'''
    tuning_period = len(batch)//2
    tun_len = tuning_period // (tun_time+1)
    tuning_ = [{'train': None, 'test': None} for i in range(tun_time)]

    # split tun data
    tuning_data = batch.iloc[:tuning_period][x_columns+y_columns].values
    window_size = tun_len
    num_windows = tun_time+1
    new_shape = (num_windows, window_size, tuning_data.shape[1])
    new_strides = (tuning_data.strides[0] * window_size, tuning_data.strides[0], tuning_data.strides[1])
    strided_arr = stride(tuning_data, shape=new_shape, strides=new_strides)
    for idx, x in enumerate(strided_arr):
        if idx < tun_time:
            tuning_[idx]['train'] = x
        if idx != 0:
            tuning_[idx-1]['test'] = x
    testing_ =  batch.iloc[tuning_period-tun_len:][x_columns+y_columns].values
    return tuning_, testing_, tun_len+1

def predict_and_evaluate(train, test, apply_func, **kwargs):
    train_x, train_y, test_x, test_y = train[:,:-1], train[:,-1], test[:,:-1], test[:,-1]
    model = apply_func(**kwargs)
    model.fit(train_x, train_y)
    test_y_hat = model.predict(test_x)
    rsquare = prediction_accuracy_rsquare(test_y, test_y_hat)
    rmse = prediction_accuracy_rmse(test_y, test_y_hat)
    direction_rate = prediction_accuracy_direction(test_y, test_y_hat)
    return model, rsquare, rmse, direction_rate

def grid_search(tun_batch, apply_func, grid):
    '''grid = [{'alpha': 0.1}, {'alpha': 0.2}]'''
    best_params = np.nan
    best_evaluation = np.nan
    for params in tqdm(grid):
        evaluation = 0
        for i in range(len(tun_batch)):
            model, rsquare, rmse, direction_rate = predict_and_evaluate(tun_batch[i]['train'], tun_batch[i]['test'], apply_func, **params)
            evaluation += rmse
        evaluation = evaluation/len(tun_batch)
        if ~(best_evaluation > evaluation): 
            best_evaluation = evaluation
            best_params = params
    print(f'After searching, best parameters: {best_params}, MSE: {best_evaluation}')
    return best_params, best_evaluation

def predict_backtest(batch, x_columns, y_columns, apply_func, grid):
    tuning_data, testing_data, train_len = tun_train_test(batch, x_columns, y_columns)
    best_params, best_evaluation = grid_search(tuning_data, apply_func, grid)
    
    params_importance = np.zeros((1, len(x_columns)))
    dim0, dim1 = testing_data.shape
    stride0, stride1 = testing_data.strides
    stride_values = stride(testing_data, (dim0 - (train_len - 1), train_len, dim1), (stride0, stride0, stride1))
    result_values = np.full((dim0, 1), np.nan)
    for idx, values in enumerate(tqdm(stride_values), train_len - 1):
        model = apply_func(**best_params)
        train_x, train_y = values[:-1, :-1], values[:-1, -1]
        model.fit(train_x, train_y)
        test_y_hat = model.predict(values[-2:, :-1])[-1]
        result_values[idx,] = test_y_hat
        try:
            params_importance += (model.coef_!=0).astype(int)
        except:
            params_importance += (model.feature_importances_!=0).astype(int)
    params_importance = params_importance/len(stride_values)
    rsquare = prediction_accuracy_rsquare(testing_data[train_len-1:,-1], result_values[train_len-1:])
    rmse = prediction_accuracy_rmse(testing_data[train_len-1:,-1], result_values[train_len-1:])
    direction_rate = prediction_accuracy_direction(testing_data[train_len-1:,-1], result_values[train_len-1:])
    final_data = pd.DataFrame(testing_data, columns = x_columns+y_columns)
    final_data[f'{y_columns[0]}_predict'] = result_values
    params_importance = pd.DataFrame(params_importance, columns = x_columns, index = ['importance'])
    return params_importance.T, rsquare, rmse, direction_rate

def RF_predict(time_data, transaction_data, volume_data):
    data = {'time': time_data.dropna(axis = 0).reset_index(drop = True)[:len(time_data)//2], 
                 'transaction': transaction_data.dropna(axis = 0).reset_index(drop = True)[:len(transaction_data)//2],
                'volume': volume_data.dropna(axis = 0).reset_index(drop = True)[:len(volume_data)//2]}
    del time_data, transaction_data, volume_data
    lasso_grid = [{'alpha': 10**i, 'fit_intercept':False} for i in range(-4,6)]
    rf_grid = [{'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': i, 'random_state':42} for i in range(3,7)]
    return_result = pd.DataFrame(index = ['RF_MSE', 'RF_Top5ImportantFeature'])
    direction_result = pd.DataFrame(index = ['RF_MSE', 'RF_Top5ImportantFeature'])
    for data_type in ['time', 'transaction', 'volume']:
        x_columns = data[data_type].columns.tolist()[14:]
        for y_ in [10, 12]:
            y_columns = [data[data_type].columns.tolist()[y_]]
            data[data_type][y_columns[0]] = (data[data_type][y_columns[0]]-1)*10000
            print(f'Calculating {y_columns[0]} for {data_type} type data:')
            params_importance, rsquare, rmse, direction_rate = predict_backtest(data[data_type], x_columns, y_columns, RandomForestRegressor, rf_grid)
            return_result.loc['RF_MSE', y_columns[0]] = rmse
            return_result.loc['RF_Top5ImportantFeature', y_columns[0]] = str(params_importance.sort_values('importance').index.to_list()[-5:])
            print(return_result.loc[['RF_MSE', 'RF_Top5ImportantFeature'], y_columns[0]])
        for y__ in [11,13]:
            y_columns = [data[data_type].columns.tolist()[y__]]
            print(f'Calculating {y_columns[0]} for {data_type} type data:')
            params_importance, rsquare, rmse, direction_rate = predict_backtest(data[data_type], x_columns, y_columns, RandomForestRegressor, rf_grid)
            direction_result.loc['RF_MSE', y_columns[0]] = rmse
            direction_result.loc['RF_Top5ImportantFeature', y_columns[0]] = str(params_importance.sort_values('importance').index.to_list()[-5:])
            print(direction_result.loc[['RF_MSE', 'RF_Top5ImportantFeature'], y_columns[0]])
    return return_result, direction_result

def Lasso_predict(time_data, transaction_data, volume_data):
    data = {'time': time_data.dropna(axis = 0).reset_index(drop = True)[:len(time_data)//2], 
                 'transaction': transaction_data.dropna(axis = 0).reset_index(drop = True)[:len(transaction_data)//2],
                'volume': volume_data.dropna(axis = 0).reset_index(drop = True)[:len(volume_data)//2]}
    del time_data, transaction_data, volume_data
    lasso_grid = [{'alpha': 10**i, 'fit_intercept':False} for i in range(-4,6)]
    rf_grid = [{'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': i, 'random_state':42} for i in range(3,7)]
    return_result = pd.DataFrame(index = ['Lasso_MSE', 'Lasso_Top5ImportantFeature'])
    direction_result = pd.DataFrame(index = ['Lasso_MSE', 'Lasso_Top5ImportantFeature'])
    for data_type in ['time', 'transaction', 'volume']:
        x_columns = data[data_type].columns.tolist()[14:]
        for y_ in [10, 12]:
            y_columns = [data[data_type].columns.tolist()[y_]]
            data[data_type][y_columns[0]] = (data[data_type][y_columns[0]]-1)*10000
            print(f'Calculating {y_columns[0]} for {data_type} type data:')
            params_importance, rsquare, rmse, direction_rate = predict_backtest(data[data_type], x_columns, y_columns, Lasso, lasso_grid)
            return_result.loc['Lasso_MSE', y_columns[0]] = rmse
            return_result.loc['Lasso_Top5ImportantFeature', y_columns[0]] = str(params_importance.sort_values('importance').index.to_list()[-5:])
            print(return_result.loc[['Lasso_MSE', 'Lasso_Top5ImportantFeature'], y_columns[0]])       
        for y__ in [11,13]:
            y_columns = [data[data_type].columns.tolist()[y__]]
            print(f'Calculating {y_columns[0]} for {data_type} type data:')
            params_importance, rsquare, rmse, direction_rate = predict_backtest(data[data_type], x_columns, y_columns, Lasso, lasso_grid)
            direction_result.loc['Lasso_MSE', y_columns[0]] = rmse
            direction_result.loc['Lasso_Top5ImportantFeature', y_columns[0]] = str(params_importance.sort_values('importance').index.to_list()[-5:])
            print(direction_result.loc[['Lasso_MSE', 'Lasso_Top5ImportantFeature'], y_columns[0]])
    return return_result, direction_result
    

In [None]:
time_data = pd.read_pickle('./data/time_data.pkl')
transaction_data = pd.read_pickle('./data/transaction_data.pkl')
volume_data = pd.read_pickle('./data/volume_data.pkl')
# ls_return_result, ls_direction_result = Lasso_predict(time_data, transaction_data, volume_data)
rf_return_result, rf_direction_result = RF_predict(time_data, transaction_data, volume_data)

Calculating return_5 for time type data:


100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.7005015436806293


100%|██████████| 9474/9474 [39:00<00:00,  4.05it/s]


RF_MSE                                                              0.692368
RF_Top5ImportantFeature    ['volume_avg_0.0_1.0', 'lob_imbalance_64.0_128...
Name: return_5, dtype: object
Calculating return_30 for time type data:


100%|██████████| 4/4 [00:04<00:00,  1.03s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.11046864825747556


100%|██████████| 9474/9474 [38:02<00:00,  4.15it/s]


RF_MSE                                                              -0.14775
RF_Top5ImportantFeature    ['lambda_128.0_256.0', 'past_return_64.0_128.0...
Name: return_30, dtype: object
Calculating direction_5 for time type data:


100%|██████████| 10/10 [00:01<00:00,  5.84it/s]


After searching, best parameters: {'alpha': 10, 'fit_intercept': False}, MSE: 0.7378163916888809


100%|██████████| 9474/9474 [01:11<00:00, 132.23it/s]
100%|██████████| 4/4 [00:04<00:00,  1.03s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.7306852408043548


100%|██████████| 9474/9474 [38:37<00:00,  4.09it/s]


RF_MSE                                                              0.698033
RF_Top5ImportantFeature    ['volume_avg_0.0_1.0', 'lob_imbalance_32.0_64....
Name: direction_5, dtype: object
Calculating direction_30 for time type data:


100%|██████████| 10/10 [00:01<00:00,  5.61it/s]


After searching, best parameters: {'alpha': 100, 'fit_intercept': False}, MSE: 0.7416326641962593


100%|██████████| 9474/9474 [00:23<00:00, 396.78it/s]
100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 6, 'random_state': 42}, MSE: 0.6866452085199946


100%|██████████| 9474/9474 [57:54<00:00,  2.73it/s]  


RF_MSE                                                              0.544227
RF_Top5ImportantFeature    ['immediacy_0.0_1.0', 'breadth_0.0_1.0', 'volu...
Name: direction_30, dtype: object
Calculating return_10 for transaction type data:


100%|██████████| 4/4 [00:04<00:00,  1.22s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.9918138153135687


100%|██████████| 9474/9474 [45:37<00:00,  3.46it/s]


RF_MSE                                                              0.991834
RF_Top5ImportantFeature    ['effective_spread_0.002_0.004', 'lob_imbalanc...
Name: return_10, dtype: object
Calculating return_200 for transaction type data:


100%|██████████| 4/4 [00:04<00:00,  1.20s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 4, 'random_state': 42}, MSE: 0.36066737147536604


100%|██████████| 9474/9474 [57:48<00:00,  2.73it/s]


RF_MSE                                                              0.457019
RF_Top5ImportantFeature    ['immediacy_0.128_0.256', 'breadth_0.128_0.256...
Name: return_200, dtype: object
Calculating direction_10 for transaction type data:


100%|██████████| 10/10 [00:01<00:00,  8.82it/s]


After searching, best parameters: {'alpha': 0.1, 'fit_intercept': False}, MSE: 0.7624407951742039


100%|██████████| 9474/9474 [01:50<00:00, 86.07it/s] 
100%|██████████| 4/4 [00:04<00:00,  1.15s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 6, 'random_state': 42}, MSE: 0.8250658977330505


100%|██████████| 9474/9474 [1:21:17<00:00,  1.94it/s]


RF_MSE                                                              0.638578
RF_Top5ImportantFeature    ['immediacy_0.0_0.001', 'trn_imbalance_0.0_0.0...
Name: direction_10, dtype: object
Calculating direction_200 for transaction type data:


100%|██████████| 10/10 [00:01<00:00,  8.40it/s]


After searching, best parameters: {'alpha': 10, 'fit_intercept': False}, MSE: 0.744330810815314


100%|██████████| 9474/9474 [00:43<00:00, 218.41it/s]
100%|██████████| 4/4 [00:04<00:00,  1.09s/it]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 5, 'random_state': 42}, MSE: 0.6751945419709529


100%|██████████| 9474/9474 [1:02:30<00:00,  2.53it/s]


RF_MSE                                                              0.558704
RF_Top5ImportantFeature    ['lambda_0.128_0.256', 'trn_imbalance_0.064_0....
Name: direction_200, dtype: object
Calculating return_0.5 for volume type data:


100%|██████████| 4/4 [00:03<00:00,  1.20it/s]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.9675975166839726


100%|██████████| 9474/9474 [30:15<00:00,  5.22it/s]


RF_MSE                                                               0.96593
RF_Top5ImportantFeature    ['volume_all_0.0_0.1', 'lambda_0.0_0.1', 'effe...
Name: return_0.5, dtype: object
Calculating return_1 for volume type data:


100%|██████████| 4/4 [00:03<00:00,  1.20it/s]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.935797969198371


100%|██████████| 9474/9474 [29:56<00:00,  5.27it/s]


RF_MSE                                                              0.928723
RF_Top5ImportantFeature    ['past_return_0.0_0.1', 'lob_imbalance_0.0_0.1...
Name: return_1, dtype: object
Calculating direction_0.5 for volume type data:


100%|██████████| 10/10 [00:01<00:00,  7.93it/s]


After searching, best parameters: {'alpha': 100, 'fit_intercept': False}, MSE: 0.5951768634765264


100%|██████████| 9474/9474 [00:27<00:00, 342.42it/s]
100%|██████████| 4/4 [00:03<00:00,  1.29it/s]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 3, 'random_state': 42}, MSE: 0.7306950718092158


100%|██████████| 9474/9474 [29:16<00:00,  5.39it/s]


RF_MSE                                                              0.708407
RF_Top5ImportantFeature    ['lambda_0.0_0.1', 'lob_imbalance_1.6_3.2', 'v...
Name: direction_0.5, dtype: object
Calculating direction_1 for volume type data:


100%|██████████| 10/10 [00:01<00:00,  8.73it/s]


After searching, best parameters: {'alpha': 100, 'fit_intercept': False}, MSE: 0.5827063860698865


100%|██████████| 9474/9474 [00:30<00:00, 315.60it/s]
100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


After searching, best parameters: {'n_estimators': 10, 'criterion': 'squared_error', 'max_depth': 5, 'random_state': 42}, MSE: 0.7039742364536754


 71%|███████   | 6720/9474 [30:18<11:50,  3.87it/s]

In [16]:
# ls_result = pd.concat([ls_return_result.T,ls_direction_result.T])
# ls_result.to_csv('./data/ls_result.csv')
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
ls_result = pd.read_csv('./data/ls_result.csv')
ls_result.head()

Unnamed: 0,Lasso_MSE,Lasso_Top5ImportantFeature
return_5,0.835151,"['volume_avg_1.0_2.0', 'auto_cov_128.0_256.0',..."
return_30,0.359547,"['volume_avg_1.0_2.0', 'auto_cov_128.0_256.0',..."
return_10,0.993207,"['volume_avg_0.001_0.002', 'auto_cov_0.128_0.2..."
return_200,0.691657,"['volume_avg_0.001_0.002', 'auto_cov_0.128_0.2..."
return_0.5,0.978034,"['volume_avg_0.1_0.2', 'auto_cov_12.8_25.6', '..."


In [None]:
rf_result = pd.concat([rf_return_result.T,rf_direction_result.T])
rf_result.to_csv('./data/rf_result.csv')
rf_result.head()

**Fifth: Cross-Sectional and Time-Series Determinants of Predictability**

* **Part A:** Nominal Share Price Level and Price Discreteness: Lower price, Higher predict precision.
* **Part B:** Stock Trading Liquidity: Higher liquidity(Lower spread or higher volume), Higher predict precision in duration, but Lower predict precision in return and direction.
* **Part C:** Stock-Level Volatility and Jumps: Volatility(std for 15s mid price return) and Jumps(whether daily return falls into the range of 3−4%, 4−5% or greater than 5%) contribute to duration prediction, distribute to return prediction.
* **Part D:** Asset Pricing Characteristics: betas, daily R square and daily idiosyncratic volatilitie influence on prediction

**Sixth: The Value of a Millisecond**

* **Part A:** The Predictability Lifespan: Only predictable before 3min, 2000 transactions, 500K volume.
* **Part B:** The Impact of Delays in Acquiring or Processing Data: import delay to illustrate predictability decrease as data delays

**Seventh: Robustness Checks**

* **Part A:** Comparison and Consistency of Results Across Prediction Methods
* **Part B:** Fine-tuning the Number of Trees in a Random Forest
* **Part C:** Predictability Using Only Subtypes of Data: Trades vs. Quotes, transaction-only timestamp data performces well than quote-update only timestamp, for less noise in transaction data
* **Part D:** Incremental Predictability Using Additional Data From Correlated Stocks: Additional  predictability  can  be  achieved  by  adding  data  derived  from  other  stocks