In [7]:
# load packages
import pandas as pd
import pickle
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, classification_report

import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from utils import *
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33)
# N, D = X_train.shape

In [8]:
# ---------------------------------------------------------------------------
# Load the TXF order-book CSV files and build the prediction target.
# ---------------------------------------------------------------------------
# Directory holding the CSV files. The TXF_DATA_DIR environment variable
# overrides the original hard-coded location, which stays as the default so
# existing setups keep working.
data_path = os.environ.get('TXF_DATA_DIR', '/home/dkongal/data')

# Only files whose name starts with 'TXF' are loaded, in sorted name order.
file_li = sorted(file for file in os.listdir(data_path) if file.startswith('TXF'))
if not file_li:
    # Fail with a readable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"no files starting with 'TXF' found in {data_path!r}")
df_li = [pd.read_csv(os.path.join(data_path, file)) for file in file_li]

# Level-by-level column ordering (ask price/size, bid price/size for levels 1-5).
# Not used in this cell; only referenced by the commented-out lines below.
col_order = [
'askPrice1', 'askSize1', 'bidPrice1', 'bidSize1',
'askPrice2', 'askSize2', 'bidPrice2', 'bidSize2',
'askPrice3', 'askSize3', 'bidPrice3', 'bidSize3',
'askPrice4', 'askSize4', 'bidPrice4', 'bidSize4',
'askPrice5', 'askSize5', 'bidPrice5', 'bidSize5'
]

# Stack all files, parse the date column, order rows chronologically and
# discard the 'symbol' and 'time' columns once the ordering is fixed.
raw_data = pd.concat(df_li, axis = 0)
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data = raw_data.sort_values(['date', 'time']).drop(['symbol', 'time'], axis=1)
# data = data[col_order].values
# data

# Ensure dates remain in order
# (data_set is the sorted list of distinct dates present in raw_data)
data_set = sorted(raw_data['date'].unique())
# Set 'date' as index
raw_data = raw_data.set_index('date')


# Horizon, in rows, used for both the forward and the backward moving average.
pred_range = 480
# calculate the midprice
raw_data['midprice'] = (raw_data['askPrice1'] + raw_data['bidPrice1'])/2
# Mean midprice of the NEXT pred_range rows: shifting by -pred_range and then
# taking a trailing window of pred_range covers rows t+1 .. t+pred_range.
# NOTE(review): shift/rolling run over the whole concatenated frame, so when
# raw_data spans several dates the windows near a date boundary mix rows from
# adjacent dates -- confirm this is intended.
raw_data['forward_ma'] = raw_data['midprice'].shift(-pred_range).rolling(window=pred_range).mean() 
# Trailing mean over the current row and the pred_range-1 rows before it.
# ('backbard_ma' is kept as spelled because it is the runtime column name.)
raw_data['backbard_ma'] = raw_data['midprice'].rolling(window=pred_range).mean()

# Target: relative change from the current midprice to the forward mean.
# raw_data['chg_pct'] = (raw_data['forward_ma'] - raw_data['backbard_ma'])/raw_data['backbard_ma']
raw_data['chg_pct'] = (raw_data['forward_ma'] - raw_data['midprice'])/raw_data['midprice']

In [9]:
# submission
#
# Factor construction: every statement below adds one column to raw_data.
# Relies on raw_data (with 'midprice') and pred_range defined in the loading cell.
#
# The "earlier run" figures quoted beside each factor were recorded by hand
# from a previous execution. They do not match the output currently printed by
# the evaluation cell, so treat them as historical, not as current results.

# --- momentum: trailing mean of the midprice relative to the current midprice
# The numeric suffix is only a label: ma_5 / ma_10 use windows derived from
# pred_range (pred_range/10 and pred_range/5 rows), ma_20 uses a fixed 20 rows.

# earlier run -> drop ratio 0.4882 | Hit Rate 0.5167 | Mean Daily IC 0.0341 | IC std 0.0397 | IR 0.8587
raw_data['ma_5'] = raw_data['midprice'].rolling(window=int(pred_range/10)).mean()
raw_data['momentum_5'] = (raw_data['ma_5'] - raw_data['midprice']) / raw_data['midprice']

# earlier run -> drop ratio 0.2571 | Hit Rate 0.5090 | Mean Daily IC 0.0198 | IC std 0.0423 | IR 0.4689
raw_data['ma_10'] = raw_data['midprice'].rolling(window=int(pred_range/5)).mean()
raw_data['momentum_10'] = (raw_data['ma_10'] - raw_data['midprice']) / raw_data['midprice']

# earlier run -> drop ratio 0.1048 | Hit Rate 0.5033 | Mean Daily IC 0.0102 | IC std 0.0435 | IR 0.2355
raw_data['ma_20'] = raw_data['midprice'].rolling(window=20).mean()
raw_data['momentum_20'] = (raw_data['ma_20'] - raw_data['midprice']) / raw_data['midprice']

# --- order book size stress (level-1 ask size versus bid size)

# Ratio of ask size to bid size. A zero bid size is masked to NaN first so the
# ratio becomes NaN instead of +/-inf; NaN rows are dropped by dropna() later,
# whereas inf would silently survive and distort ranks and means.
# earlier run (labelled "size_stress") -> drop ratio 0.0000 | Hit Rate 0.4901 | Mean Daily IC -0.1073 | IC std 0.0291 | IR -3.6926
raw_data['size_stress_pct'] = raw_data['askSize1'] / raw_data['bidSize1'].where(raw_data['bidSize1'] != 0)

# earlier run -> drop ratio 0.0747 | Hit Rate 0.4429 | Mean Daily IC -0.0905 | IC std 0.0270 | IR -3.3454
raw_data['size_stress_diff'] = (raw_data['askSize1'] - raw_data['bidSize1'])

# Size-weighted average of the level-1 ask and bid prices, and its relative
# offset from the midprice. (0/0 when both sizes are zero already yields NaN.)
# earlier run -> drop ratio 0.0747 | Hit Rate 0.4429 | Mean Daily IC -0.1094 | IC std 0.0293 | IR -3.7411
raw_data['microprice'] = (raw_data['askPrice1'] * raw_data['askSize1'] + raw_data['bidPrice1'] * raw_data['bidSize1'])/(raw_data['askSize1'] + raw_data['bidSize1'])
raw_data['micro_shift_ratio'] = (raw_data['microprice']  - raw_data['midprice'])/raw_data['midprice']

# Ask-minus-bid size summed over levels 1 and 2. This column used to be
# assigned twice with the identical expression; it is now computed once.
# earlier run (labelled "more_level_depth") -> drop ratio 0.0336 | Hit Rate 0.4596 | Mean Daily IC -0.0652 | IC std 0.0263 | IR -2.4850
raw_data['more_stress_diff'] = raw_data['askSize1'] + raw_data['askSize2']  - raw_data['bidSize1'] - raw_data['bidSize2']

# factor calculation

# --- ask side: exponentially weighted mean (com=10) of the one-row size change

# earlier run -> drop ratio 0.0000 | Hit Rate 0.4572 | Mean Daily IC -0.0756 | IC std 0.0200 | IR -3.7897
raw_data['asize1_diff_1_mean_10'] = (raw_data['askSize1'].diff(1)).ewm(com=10).mean()

# earlier run -> drop ratio 0.0000 | Hit Rate 0.4755 | Mean Daily IC -0.0392 | IC std 0.0194 | IR -2.0263
raw_data['asize2_diff_1_mean_10'] = (raw_data['askSize2'].diff(1)).ewm(com=10).mean()

# rolling difference
# Current size minus its 10-row mean, scaled by the 50-row standard deviation.
# earlier run -> drop ratio 0.0461 | Hit Rate 0.4582 | Mean Daily IC -0.0788 | IC std 0.0142 | IR -5.5405
raw_data['asize1_comparative_level'] = (raw_data['askSize1'] - raw_data['askSize1'].rolling(window=10).mean())/raw_data['askSize1'].rolling(window=50).std()

# --- bid side: same constructions as the ask side

# earlier run -> drop ratio 0.0000 | Hit Rate 0.5241 | Mean Daily IC 0.0751 | IC std 0.0180 | IR 4.1611
raw_data['bsize1_diff_1_mean_10'] = (raw_data['bidSize1'].diff(1)).ewm(com=10).mean()

# earlier run -> drop ratio 0.0000 | Hit Rate 0.5067 | Mean Daily IC 0.0387 | IC std 0.0163 | IR 2.3742
raw_data['bsize2_diff_1_mean_10'] = (raw_data['bidSize2'].diff(1)).ewm(com=10).mean()

# earlier run -> drop ratio 0.0453 | Hit Rate 0.5236 | Mean Daily IC 0.0752 | IC std 0.0156 | IR 4.8269
raw_data['bsize1_comparative_level'] = (raw_data['bidSize1'] - raw_data['bidSize1'].rolling(window=10).mean())/raw_data['bidSize1'].rolling(window=50).std()

# combine the size
# Bid-side factor minus the matching ask-side factor.
# earlier run -> drop ratio 0.0000 | Hit Rate 0.5353 | Mean Daily IC 0.0997 | IC std 0.0272 | IR 3.6603
raw_data['size1_diff'] = raw_data['bsize1_diff_1_mean_10'] - raw_data['asize1_diff_1_mean_10']

# earlier run -> drop ratio 0.0048 | Hit Rate 0.5342 | Mean Daily IC 0.1041 | IC std 0.0234 | IR 4.4513
raw_data['size1_comparative_diff'] = raw_data['bsize1_comparative_level'] - raw_data['asize1_comparative_level']

# 20-row rolling correlation between momentum_20 and size_stress_diff,
# multiplied by momentum_20 itself.
# earlier run -> drop ratio 0.1048 | Hit Rate 0.4805 | Mean Daily IC -0.0255 | IC std 0.0330 | IR -0.7728
raw_data['resilience'] = (raw_data['momentum_20']).rolling(window=20).corr(raw_data['size_stress_diff'])  * raw_data['momentum_20']


# Candidate features that are currently disabled:
# raw_data['spread']  = (raw_data['askPrice1'] - raw_data['bidPrice1'])
# raw_data['aprice1_diff_1'] = (raw_data['askPrice1'].diff(1))
# raw_data['asize1_diff_1'] = (raw_data['askSize1'].diff(1))
# raw_data['asize2_diff_1'] = (raw_data['askSize2'].diff(1))
# raw_data['bprice1_diff_1'] = (raw_data['bidPrice1'].diff(1))
# raw_data['bsize1_diff_1'] = (raw_data['bidSize1'].diff(1))

In [12]:
# Display the column labels currently present in raw_data.
raw_data.columns

Index(['askPrice5', 'askPrice4', 'askPrice3', 'askPrice2', 'askPrice1',
       'bidPrice1', 'bidPrice2', 'bidPrice3', 'bidPrice4', 'bidPrice5',
       'askSize5', 'askSize4', 'askSize3', 'askSize2', 'askSize1', 'bidSize1',
       'bidSize2', 'bidSize3', 'bidSize4', 'bidSize5', 'midprice',
       'forward_ma', 'backbard_ma', 'chg_pct', 'ma_5', 'momentum_5', 'ma_10',
       'momentum_10', 'ma_20', 'momentum_20', 'size_stress_pct',
       'size_stress_diff', 'microprice', 'micro_shift_ratio',
       'more_stress_diff', 'asize1_diff_1_mean_10', 'asize2_diff_1_mean_10',
       'asize1_comparative_level', 'bsize1_diff_1_mean_10',
       'bsize2_diff_1_mean_10', 'bsize1_comparative_level', 'size1_diff',
       'size1_comparative_diff', 'resilience'],
      dtype='object')

In [13]:
from scipy.stats import spearmanr
import pandas as pd
import numpy as np

# Factor columns of raw_data to evaluate against the target column.
selected_columns = [ 'momentum_5',  'momentum_10',  'momentum_20', 'micro_shift_ratio', 
       'size_stress_pct','size_stress_diff', 'more_stress_diff',
       'asize1_diff_1_mean_10', 'asize2_diff_1_mean_10', 'asize1_comparative_level', 
       'bsize1_diff_1_mean_10', 'bsize2_diff_1_mean_10', 'bsize1_comparative_level', 
       'size1_diff','size1_comparative_diff',
       'resilience']

target = 'chg_pct'


def evaluate_factor(frame: pd.DataFrame, factor_col: str, target_col: str,
                    date_key: str = 'date') -> dict:
    """Score one factor column against the target column.

    Rows where either column is NaN, or where the factor is exactly 0, are
    excluded before any metric is computed.

    Parameters
    ----------
    frame : DataFrame containing ``factor_col`` and ``target_col``; after
        ``reset_index()`` it must expose ``date_key`` as a column.
    factor_col : name of the factor column.
    target_col : name of the target column.
    date_key : column used to split the rows into per-day groups.

    Returns
    -------
    dict with keys
        ``drop_ratio`` - share of ``frame`` rows excluded,
        ``hit_rate``   - share of kept rows where sign(factor) == sign(target)
                         (a target of exactly 0 therefore counts as a miss),
        ``mean_ic`` / ``std_ic`` - NaN-ignoring mean and population standard
                         deviation of the per-day Spearman correlations,
        ``ir``         - mean_ic / std_ic, or NaN when std_ic is 0.
    """
    # Work on the two relevant columns only; copying the whole frame for each
    # factor is unnecessary because no other column is read.
    pair = frame[[factor_col, target_col]].dropna()
    pair = pair[pair[factor_col] != 0]

    drop_ratio = 1 - pair.shape[0] / frame.shape[0]
    hit_rate = (np.sign(pair[factor_col]) == np.sign(pair[target_col])).mean()

    # One Spearman correlation per day; days where either column is constant
    # are skipped because a rank correlation is undefined there.
    daily_ics = []
    for _, day_rows in pair.reset_index().groupby(date_key):
        if day_rows[factor_col].nunique() > 1 and day_rows[target_col].nunique() > 1:
            ic, _ = spearmanr(day_rows[factor_col], day_rows[target_col])
            daily_ics.append(ic)

    daily_ics = np.array(daily_ics)
    if daily_ics.size:
        mean_ic = np.nanmean(daily_ics)
        std_ic = np.nanstd(daily_ics)
    else:
        # No usable day: report NaN directly rather than triggering numpy's
        # empty-slice warnings (the resulting values are the same).
        mean_ic = std_ic = np.nan
    ir = mean_ic / std_ic if std_ic != 0 else np.nan

    return {
        'drop_ratio': drop_ratio,
        'hit_rate': hit_rate,
        'mean_ic': mean_ic,
        'std_ic': std_ic,
        'ir': ir,
    }


# Evaluate every factor and print the metrics. Results are also kept in
# factor_metrics. The per-factor work no longer assigns to a global named
# `data`, which previously overwrote the `data` module imported from
# torch.utils in the import cell.
factor_metrics = {}
for factor_col in selected_columns:
    metrics = evaluate_factor(raw_data, factor_col, target)
    factor_metrics[factor_col] = metrics

    print(f'use {factor_col} to predict {target}, drop ratio is {metrics["drop_ratio"]:.4f}')
    print(f'Hit Rate: {metrics["hit_rate"]:.4f}')
    print(f'Mean Daily IC: {metrics["mean_ic"]:.4f}')
    print(f'Standard Deviation of ICs: {metrics["std_ic"]:.4f}')
    print(f'Information Ratio (IR): {metrics["ir"]:.4f}')

use momentum_5 to predict chg_pct, drop ratio is 0.0214
Hit Rate: 0.4962
Mean Daily IC: -0.0007
Standard Deviation of ICs: 0.0264
Information Ratio (IR): -0.0266
use momentum_10 to predict chg_pct, drop ratio is 0.0042
Hit Rate: 0.4945
Mean Daily IC: -0.0011
Standard Deviation of ICs: 0.0326
Information Ratio (IR): -0.0340
use momentum_20 to predict chg_pct, drop ratio is 0.1054
Hit Rate: 0.4976
Mean Daily IC: -0.0000
Standard Deviation of ICs: 0.0214
Information Ratio (IR): -0.0007
use micro_shift_ratio to predict chg_pct, drop ratio is 0.0742
Hit Rate: 0.4910
Mean Daily IC: -0.0283
Standard Deviation of ICs: 0.0190
Information Ratio (IR): -1.4853
use size_stress_pct to predict chg_pct, drop ratio is 0.0001
Hit Rate: 0.5000
Mean Daily IC: -0.0268
Standard Deviation of ICs: 0.0185
Information Ratio (IR): -1.4502
use size_stress_diff to predict chg_pct, drop ratio is 0.0742
Hit Rate: 0.4910
Mean Daily IC: -0.0204
Standard Deviation of ICs: 0.0201
Information Ratio (IR): -1.0142
use more

In [14]:
# from scipy.stats import spearmanr
# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np


# selected_columns = [ 'momentum_5',  'momentum_10',  'momentum_20', 'micro_shift_ratio', 
#        'size_stress_pct','size_stress_diff', 'more_stress_diff',
#        'asize1_diff_1_mean_10', 'asize2_diff_1_mean_10', 'asize1_comparative_level', 
#        'bsize1_diff_1_mean_10', 'bsize2_diff_1_mean_10', 'bsize1_comparative_level', 
#        'size1_diff','size1_comparative_diff',
#        'resilience']
# target = 'chg_pct'

# raw_data = raw_data.reset_index()
# # Step 1: Convert 'date' and 'time' to proper datetime
# # NOTE: the loading cell drops the 'time' column, so it has to be kept there before this cell is re-enabled.
# # Zero-pad the 'time' column to ensure it has 9 digits (HHMMSSmmm)
# raw_data['time_str'] = raw_data['time'].astype(str).str.zfill(9)

# # Combine 'date' and 'time_str' into a single datetime column
# raw_data['datetime'] = pd.to_datetime(
#     raw_data['date'].dt.strftime('%Y-%m-%d') + ' ' +
#     raw_data['time_str'].str.slice(0, 2) + ':' + 
#     raw_data['time_str'].str.slice(2, 4) + ':' + 
#     raw_data['time_str'].str.slice(4, 6) + '.' +
#     raw_data['time_str'].str.slice(6, 9),
#     format='%Y-%m-%d %H:%M:%S.%f'
# )

# # Set datetime as index and sort
# raw_data = raw_data.set_index('datetime').sort_index()
# # ----------- Step 2: Resample to Minute-Level -----------
# # Floor to minute
# raw_data['minute'] = raw_data.index.floor('T')
# # Compute mean of factors per minute
# factor_minute = raw_data.groupby('minute')[selected_columns].mean()
# # Take the last 'chg_pct' value in each minute as label
# label_minute = raw_data.groupby('minute')[target].last()
# # Combine
# minute_data = pd.concat([factor_minute, label_minute], axis=1).dropna()
# # Extract date part for grouping
# minute_data['date'] = minute_data.index.date


# # Step 3: Analysis loop
# for factor_col in selected_columns:
#     data = minute_data.dropna(subset=[factor_col, target])
#     data = data[data[factor_col] != 0]

#     print(f'\nUsing {factor_col} to predict {target}, drop ratio: {1 - data.shape[0]/minute_data.shape[0]:.4f}')

#     # Hit Rate
#     correct_direction = np.sign(data[factor_col]) == np.sign(data[target])
#     hit_rate = correct_direction.mean()
#     print(f'Hit Rate: {hit_rate:.4f}')

#     # Daily IC
#     daily_ic_list = []
#     for date, group in data.groupby('date'):
#         if group[factor_col].nunique() > 1 and group[target].nunique() > 1:
#             ic, _ = spearmanr(group[factor_col], group[target])
#             daily_ic_list.append(ic)

#     daily_ic_array = np.array(daily_ic_list)
#     mean_ic = np.nanmean(daily_ic_array)
#     std_ic = np.nanstd(daily_ic_array)
#     ir = mean_ic / std_ic if std_ic != 0 else np.nan

#     print(f'Mean Daily IC: {mean_ic:.4f}')
#     print(f'Standard Deviation of ICs: {std_ic:.4f}')
#     print(f'Information Ratio (IR): {ir:.4f}')