In [1]:
import pandas as pd

train = pd.read_csv("train.csv")

train = train.drop('row_id', axis=1)

train['is_far_price_missing'] = train['far_price'].isnull().astype(int)
train['is_near_price_missing'] = train['near_price'].isnull().astype(int)
train['is_wap_missing'] = train['wap'].isnull().astype(int)

columns_to_fill = ['imbalance_size', 'reference_price', 'matched_size', 
                   'bid_price', 'ask_price', 'wap','far_price','near_price', 'target']

for column in columns_to_fill:
    train[column].fillna(train[column].mean(), inplace=True)

train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,is_far_price_missing,is_near_price_missing,is_wap_missing
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.004805,0.999735,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,1,1,0
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.004805,0.999735,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,1,1,0
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.004805,0.999735,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,1,1,0
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.004805,0.999735,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,1,1,0
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.004805,0.999735,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,1,1,0


In [2]:
def compute_rsi(data, window=14):
    # Calculate daily price changes
    delta = data.diff()

    # Separate the gains and losses
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    # Calculate the average gains and losses over the specified window
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()

    # Calculate the Relative Strength (RS)
    rs = avg_gain / avg_loss

    # Calculate the Relative Strength Index (RSI)
    rsi = 100 - (100 / (1 + rs))

    return rsi

def calculate_fibonacci_retracement(high, low):
    diff = high - low
    level1 = high - 0.236 * diff
    level2 = high - 0.382 * diff
    level3 = high - 0.618 * diff
    return level1, level2, level3

In [4]:
import numpy as np

train['high'] = train.groupby('stock_id')['wap'].rolling(window=20,min_periods=1).max().reset_index(level=0, drop=True)
train['low'] = train.groupby('stock_id')['wap'].rolling(window=20,min_periods=1).min().reset_index(level=0, drop=True)

#fibonacci
train['fib_level_1'], train['fib_level_2'], train['fib_level_3'] = zip(*train.apply(lambda row: calculate_fibonacci_retracement(row['high'], row['low']), axis=1))
train['support'] = train['low'].rolling(window=20, min_periods=1).min()
train['resistance'] = train['high'].rolling(window=20, min_periods=1).max()

# Feature lists for price and size
price_ftrs = ['reference_price', 'bid_price', 'ask_price', 'wap']
size_ftrs = ['imbalance_size', 'matched_size', 'bid_size', 'ask_size']

# Rolled sum for size features
rolled = train[['stock_id'] + size_ftrs].groupby('stock_id').rolling(window=6, min_periods=1).sum()
rolled = rolled.reset_index(level=0, drop=True)
for col in size_ftrs:
    train[f'{col}_rolled_sum'] = rolled[col]

# Rolled standard deviation for price features
rolled = train[['stock_id'] + price_ftrs].groupby('stock_id').rolling(window=6, min_periods=1).std().fillna(0)
rolled = rolled.reset_index(level=0, drop=True)
for col in price_ftrs:
    train[f'{col}_rolled_std'] = rolled[col]

# Weighted average price (wap)
train['wap'] = (train['bid_price'] * train['ask_size'] + train['ask_price'] * train['bid_size']) / (train['bid_size'] + train['ask_size'])

# Time decayed WAP
train['wap_time_decay'] = train.groupby('stock_id')['wap'].transform(lambda x: x.ewm(halflife=3).mean())

# Moving averages
# 為滾動計算指定 min_periods
train['moving_avg_5'] = train['wap'].rolling(window=5, min_periods=1).mean()

train['moving_avg_10'] = train['wap'].rolling(window=10,min_periods=1).mean()
train['ewma_10'] = train['wap'].ewm(span=10, adjust=False).mean()
train['volatility_10'] = train['wap'].rolling(window=10,min_periods=1).std()
# Different window sizes for rolling averages and adjusted exponential weights
train['moving_avg_20'] = train['wap'].rolling(window=20,min_periods=1).mean()
train['ewma_20'] = train['wap'].ewm(span=20, adjust=False).mean()
# MACD (Moving Average Convergence Divergence)
train['macd'] = train['moving_avg_5'] - train['moving_avg_10']

# Rolling measures
train['rolling_std'] = train['wap'].rolling(window=20,min_periods=1).std()
train['rolling_corr'] = train['wap'].rolling(window=20,min_periods=1).corr(train['moving_avg_10'])
# 使用 fillna 填充 NaN 值
train['rolling_std'] = train['wap'].rolling(window=20, min_periods=1).std().fillna(0)

# Relative Strength Index (RSI)
train['rsi_14'] = compute_rsi(train['wap'])

# Skewness and Kurtosis
train['wap_skewness'] = train.groupby('stock_id')['wap'].rolling(window=20,min_periods=1).skew().reset_index(level=0, drop=True)
train['wap_kurtosis'] = train.groupby('stock_id')['wap'].rolling(window=20,min_periods=1).kurt().reset_index(level=0, drop=True)

# Volatility Clustering
train['volatility_clustering'] = train['volatility_10'] * train['volatility_10'].shift(1)

# Imbalance and liquidity features
train['imb_s1'] = (train['bid_size'] - train['ask_size']) / (train['bid_size'] + train['ask_size'])
train['imb_s2'] = (train['imbalance_size'] - train['matched_size']) / (train['matched_size'] + train['imbalance_size'])
train["volume"] = train["ask_size"] + train["bid_size"]
train["mid_price"] = (train["ask_price"] + train["bid_price"]) / 2
train["liquidity_imbalance"] = (train["bid_size"] - train["ask_size"]) / (train["bid_size"] + train["ask_size"])
train["matched_imbalance"] = (train["imbalance_size"] - train["matched_size"]) / (train["matched_size"] + train["imbalance_size"])
train["all_size"] = train["matched_size"] + train["imbalance_size"]
train["imbalance_size_for_buy_sell"] = train["imbalance_size"] * train["imbalance_buy_sell_flag"]
train["price_spread"] = train["ask_price"] - train["bid_price"]
train['price_pressure'] = train['imbalance_size'] * (train['ask_price'] - train['bid_price'])
train['market_urgency'] = train['price_spread'] * train['liquidity_imbalance']

# Cumulative sum of buy and sell imbalances
train['cumsum_buy_imbalance'] = train[train['imbalance_buy_sell_flag'] == 1]['imbalance_size'].rolling(window=10,min_periods=1).sum().fillna(0)
train['cumsum_sell_imbalance'] = train[train['imbalance_buy_sell_flag'] == -1]['imbalance_size'].rolling(window=10,min_periods=1).sum().fillna(0)

# Order flow imbalance
train['order_flow_imbalance'] = train['bid_size'] - train['ask_size']

# Interaction between volume and price
train['volume_price_interaction'] = train['volume'] * train['wap']

# Lagged features
train['wap_lagged'] = train.groupby('stock_id')['wap'].shift(1)

# Price and volume shocks
train['price_shock'] = (train['wap'] - train['wap'].shift(1)).abs() > train['wap'].rolling(window=20,min_periods=1).std() * 2
train['volume_shock'] = (train['volume'] - train['volume'].shift(1)).abs() > train['volume'].rolling(window=20,min_periods=1).std() * 2

features = [
    'wap', 'volatility_10', 'rolling_corr', 'rsi_14', 'wap_skewness', 
    'wap_kurtosis', 'volatility_clustering', 'imb_s1', 'liquidity_imbalance', 
    'market_urgency', 'cumsum_buy_imbalance', 'cumsum_sell_imbalance', 
    'volume_price_interaction', 'wap_lagged'
]

for feature in features:
    train[feature].fillna(train[feature].mean(), inplace=True)
    
nan_columns = train.columns[train.isna().any()].tolist()
nan_columns
train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,price_spread,price_pressure,market_urgency,cumsum_buy_imbalance,cumsum_sell_imbalance,order_flow_imbalance,volume_price_interaction,wap_lagged,price_shock,volume_shock
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.004805,0.999735,0.999812,...,0.000214,680.648976,0.000161,3180603.0,79193040.0,52158.47,69144.510249,1.000059,False,False
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.004805,0.999735,0.999896,...,0.000764,127.285387,-0.000557,75570980.0,166603.9,-17372.05,23838.120877,1.000059,False,False
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.004805,0.999735,0.999403,...,0.000895,271.077484,0.000298,75570980.0,469483.8,18961.0,56950.970873,1.000059,False,False
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.004805,0.999735,0.999999,...,0.000215,2562.301688,-0.000213,75570980.0,12387170.0,-476707.5,481357.318496,1.000059,True,False
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.004805,0.999735,0.999394,...,0.000622,278.376075,0.00059,75570980.0,12834720.0,16051.44,16919.640704,1.000059,False,True


In [5]:
# Transform date_id into cyclical features
train['date_sin'] = np.sin((train['date_id'] / train['date_id'].max()) * 2 * np.pi)
train['date_cos'] = np.cos((train['date_id'] / train['date_id'].max()) * 2 * np.pi)

# Transform seconds_in_bucket into cyclical features
max_seconds = train['seconds_in_bucket'].max()
train['seconds_sin'] = np.sin((train['seconds_in_bucket'] / max_seconds) * 2 * np.pi)
train['seconds_cos'] = np.cos((train['seconds_in_bucket'] / max_seconds) * 2 * np.pi)

# Drop the original columns if they are no longer needed
train = train.drop(['date_id', 'seconds_in_bucket'], axis=1)

In [6]:
from keras.layers import Input, Embedding, Flatten, Dense, Dropout
from keras.models import Model

max_stock_id = int(train['stock_id'].max()) + 1 
embedding_dim_stock_id = 10  
stock_id_input = Input(shape=(1,), name='stock_id_input')
stock_id_embed = Embedding(max_stock_id, embedding_dim_stock_id, input_length=1)(stock_id_input)
stock_id_embed = Flatten()(stock_id_embed)

hidden_1 = Dense(128, activation='relu')(stock_id_embed)
hidden_1 = Dropout(0.3)(hidden_1)  
hidden_2 = Dense(64, activation='relu')(hidden_1)
hidden_2 = Dropout(0.3)(hidden_2)  
hidden_3 = Dense(32, activation='relu')(hidden_2)
output = Dense(1, activation='linear')(hidden_3)

model = Model(inputs=stock_id_input, outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()




Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 stock_id_input (InputLayer  [(None, 1)]               0         
 )                                                               
                                                                 
 embedding (Embedding)       (None, 1, 10)             2000      
                                                                 
 flatten (Flatten)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 128)               1408      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                          

In [7]:
from sklearn.preprocessing import StandardScaler

features_to_scale = [
    'support','resistance',
    'imbalance_size', 'reference_price', 'matched_size', 'far_price', 
    'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap',
    'imbalance_size_rolled_sum', 'matched_size_rolled_sum', 
    'bid_size_rolled_sum', 'ask_size_rolled_sum', 'reference_price_rolled_std',
    'bid_price_rolled_std', 'ask_price_rolled_std', 'wap_rolled_std',
    'wap_time_decay', 'moving_avg_5', 'moving_avg_10', 'ewma_10', 
    'volatility_10', 'moving_avg_20', 'ewma_20', 'macd', 'rolling_std', 
    'rolling_corr', 'rsi_14', 'wap_skewness', 'wap_kurtosis', 
    'volatility_clustering', 'volume', 'mid_price', 'liquidity_imbalance', 
    'matched_imbalance', 'all_size', 'imbalance_size_for_buy_sell', 
    'price_spread', 'price_pressure', 'market_urgency', 'cumsum_buy_imbalance',
    'cumsum_sell_imbalance', 'order_flow_imbalance', 'volume_price_interaction', 
    'wap_lagged', 'price_shock', 'volume_shock'
]

scaler = StandardScaler()

train[features_to_scale] = scaler.fit_transform(train[features_to_scale])

train.head()

Unnamed: 0,stock_id,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,...,cumsum_sell_imbalance,order_flow_imbalance,volume_price_interaction,wap_lagged,price_shock,volume_shock,date_sin,date_cos,seconds_sin,seconds_cos
0,0,-0.121738,1,-0.115157,-0.205215,8.659673e-16,1.709458e-14,-0.001214,0.061469,-0.132149,...,-2.668587e-16,0.306005,-0.188227,-1.029758e-13,-0.430627,-0.422469,0.0,1.0,0.0,1.0
1,1,-0.266311,-1,-0.076658,-0.278663,8.659673e-16,1.709458e-14,0.037813,-0.399984,0.160209,...,-1.415251,-0.076036,-0.39929,-1.029758e-13,-0.430627,-0.422469,0.0,1.0,0.0,1.0
2,2,-0.259774,-1,-0.230194,-0.277554,8.659673e-16,1.709458e-14,-0.19124,-0.120927,-0.006721,...,-1.409827,0.123599,-0.245032,-1.029758e-13,-0.430627,-0.422469,0.0,1.0,0.0,1.0
3,3,0.297354,-1,0.049379,-0.173869,8.659673e-16,1.709458e-14,0.085668,-0.407283,-0.045456,...,-1.196399,-2.599885,1.732091,-1.029758e-13,2.322193,-0.422469,0.0,1.0,0.0,1.0
4,4,-0.252835,-1,-0.243486,-0.17718,8.659673e-16,1.709458e-14,-0.195421,-0.293478,-0.13676,...,-1.188384,0.107612,-0.43152,-1.029758e-13,-0.430627,2.367038,0.0,1.0,0.0,1.0


In [8]:
train.columns

Index(['stock_id', 'imbalance_size', 'imbalance_buy_sell_flag',
       'reference_price', 'matched_size', 'far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target',
       'time_id', 'is_far_price_missing', 'is_near_price_missing',
       'is_wap_missing', 'high', 'low', 'fib_level_1', 'fib_level_2',
       'fib_level_3', 'support', 'resistance', 'imbalance_size_rolled_sum',
       'matched_size_rolled_sum', 'bid_size_rolled_sum', 'ask_size_rolled_sum',
       'reference_price_rolled_std', 'bid_price_rolled_std',
       'ask_price_rolled_std', 'wap_rolled_std', 'wap_time_decay',
       'moving_avg_5', 'moving_avg_10', 'ewma_10', 'volatility_10',
       'moving_avg_20', 'ewma_20', 'macd', 'rolling_std', 'rolling_corr',
       'rsi_14', 'wap_skewness', 'wap_kurtosis', 'volatility_clustering',
       'imb_s1', 'imb_s2', 'volume', 'mid_price', 'liquidity_imbalance',
       'matched_imbalance', 'all_size', 'imbalance_size_for_buy_sell',
       'pric

In [138]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm import tqdm  

X = train.drop('target', axis=1)
y = train['target']

model = RandomForestRegressor(n_estimators=100, max_depth=5, n_jobs=-1)


with tqdm(total=100, desc="Training") as pbar:  
    model.fit(X, y)
    pbar.update(100)  

feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

sorted_feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("the most important 20 features：")
for index, row in sorted_feature_importance_df.head(15).iterrows():
    print(f"{row['Feature']}: {row['Importance']:.4f}")

Training: 100%|██████████| 100/100 [06:57<00:00,  4.17s/it]

the most important 15 features：
market_urgency: 0.7511
near_price: 0.0384
price_spread: 0.0157
price_pressure: 0.0133
reference_price: 0.0125
imbalance_size_for_buy_sell: 0.0122
wap_lagged: 0.0114
ask_price: 0.0111
bid_price: 0.0104
wap_time_decay: 0.0091
mid_price: 0.0077
matched_imbalance: 0.0075
imb_s2: 0.0072
rsi_14: 0.0067
wap: 0.0066





In [205]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = train.drop(columns=["target"])  
y = train["target"]  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_components = X.shape[1]  
pca = PCA(n_components=n_components)
pca.fit(X_scaled)

explained_variance_ratio = pca.explained_variance_ratio_

feature_names = X.columns 
explained_variance_df = pd.DataFrame({'Feature': feature_names, 'Explained Variance Ratio': explained_variance_ratio})

print(explained_variance_df.head(20))

                    Feature  Explained Variance Ratio
0                  stock_id                  0.199686
1            imbalance_size                  0.078950
2   imbalance_buy_sell_flag                  0.071550
3           reference_price                  0.062168
4              matched_size                  0.052176
5                 far_price                  0.046848
6                near_price                  0.043406
7                 bid_price                  0.034377
8                  bid_size                  0.023874
9                 ask_price                  0.023148
10                 ask_size                  0.022195
11                      wap                  0.020959
12                  time_id                  0.018095
13     is_far_price_missing                  0.017438
14    is_near_price_missing                  0.017139
15           is_wap_missing                  0.016689
16                     high                  0.015644
17                      low 

In [206]:
from sklearn.linear_model import Lasso

X = train.drop('target', axis=1)
y = train['target']

lasso = Lasso(alpha=0.01)
lasso.fit(X, y)

feature_coefficients = {}

for i in range(len(X.columns)):
    if lasso.coef_[i] != 0:
        feature_coefficients[X.columns[i]] = lasso.coef_[i]

sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

print("the most important 20 features：")
for feature, coef in sorted_features[:15]:
    print(f"{feature}: {coef:.4f}")

the most important 20 features：
reference_price: 1.8887
market_urgency: -1.6202
bid_price: -0.9532
ask_price: -0.9010
wap: -0.7858
bid_price_rolled_std: 0.5279
ask_price_rolled_std: -0.4878
wap_time_decay: 0.2803
imbalance_size_for_buy_sell: 0.1954
moving_avg_20: 0.1305
seconds_cos: 0.1301
imbalance_buy_sell_flag: 0.1194
near_price: 0.0919
matched_imbalance: 0.0662
date_cos: -0.0657


In [207]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt

X = train.drop('target', axis=1)
y = train['target']

dtrain = xgb.DMatrix(X, label=y)

params = {
    'objective': 'reg:squarederror',  
    'eval_metric': 'rmse',           
    'max_depth': 10,                  
    'n_estimators': 4000,             
    'learning_rate': 0.03          
}

model = xgb.train(params, dtrain)

importance = model.get_fscore()
importance_df = pd.DataFrame({'Feature': list(importance.keys()), 'Importance': list(importance.values())})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

top_15_features = importance_df.head(15)
print(top_15_features)

                        Feature  Importance
0                      stock_id       491.0
1                imbalance_size       282.0
3               reference_price       249.0
4                  matched_size       233.0
46                       imb_s2       219.0
53               market_urgency       173.0
17                          low       162.0
45                       imb_s1       162.0
50  imbalance_size_for_buy_sell       157.0
8                      bid_size       152.0
25          bid_size_rolled_sum       151.0
21                      support       139.0
64                  seconds_cos       132.0
27   reference_price_rolled_std       126.0
12                      time_id       125.0


In [221]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

features = [
    'fib_level_1', 'fib_level_2', 'stock_id', 'volatility_10', 'reference_price',
    'market_urgency', 'price_spread', 'price_pressure', 'imbalance_size_for_buy_sell',
    'imbalance_buy_sell_flag', 'imbalance_size_rolled_sum', 'matched_size_rolled_sum',
    'bid_size_rolled_sum', 'ask_size_rolled_sum', 'wap_lagged', 'ask_price', 'bid_price',
    'far_price', 'bid_size', 'ask_size', 'near_price', 'time_id', 'wap_time_decay',
    'mid_price', 'imb_s2', 'imb_s1', 'rsi_14', 'is_far_price_missing', 'is_near_price_missing',
    'is_wap_missing', 'wap', 'seconds_sin', 'seconds_cos', 'moving_avg_20',
    'matched_imbalance', 'date_cos'
]

df = train[features]

def calculate_vif_sorted(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
    vif_data = vif_data.sort_values(by="VIF", ascending=False)  # 對VIF值進行降序排序
    return vif_data

vif_df_sorted = calculate_vif_sorted(df)
print(vif_df_sorted)

                        Feature           VIF
23                    mid_price           inf
16                    bid_price           inf
5                market_urgency           inf
6                  price_spread           inf
30                          wap           inf
15                    ask_price           inf
24                       imb_s2  2.875898e+06
34            matched_imbalance  5.201342e+05
1                   fib_level_2  8.742007e+01
0                   fib_level_1  6.934040e+01
4               reference_price  6.928146e+01
28        is_near_price_missing  3.720344e+01
22               wap_time_decay  3.380660e+01
27         is_far_price_missing  3.351423e+01
14                   wap_lagged  2.386516e+01
31                  seconds_sin  4.688452e+00
25                       imb_s1  2.206637e+00
12          bid_size_rolled_sum  1.934492e+00
18                     bid_size  1.865750e+00
10    imbalance_size_rolled_sum  1.770447e+00
13          ask_size_rolled_sum  1

In [223]:
features_to_keep = [
'stock_id', 
'volatility_10', 
'price_spread', 
'price_pressure', 
'imbalance_size_for_buy_sell',
'imbalance_buy_sell_flag', 
'imbalance_size_rolled_sum', 
'matched_size_rolled_sum',
'bid_size_rolled_sum', 
'ask_size_rolled_sum', 
'wap_lagged',
'far_price', 
'bid_size', 
'near_price', 
'time_id', 
'mid_price', 
'rsi_14', 
'is_wap_missing',  
'seconds_cos', 
'moving_avg_20',
'date_cos',
'target'
]

train = train[features_to_keep]
train.head()

Unnamed: 0,stock_id,volatility_10,price_spread,price_pressure,imbalance_size_for_buy_sell,imbalance_buy_sell_flag,imbalance_size_rolled_sum,matched_size_rolled_sum,bid_size_rolled_sum,ask_size_rolled_sum,...,bid_size,near_price,time_id,mid_price,rsi_14,is_wap_missing,seconds_cos,moving_avg_20,date_cos,target
0,0,2.136944e-16,-0.571684,-0.093875,0.157286,1,-0.257681,-0.277605,-0.496977,-0.578446,...,0.061469,1.709458e-14,0,-0.067374,0.0,0,1.0,-0.052494,1.0,-3.029704
1,1,-1.566576,0.535598,-0.124936,0.002441,-1,-0.282653,-0.289969,-0.607903,-0.557256,...,-0.399984,1.709458e-14,0,0.099903,-9.741489,0,1.0,-0.052537,1.0,-5.519986
2,2,-1.566532,0.799333,-0.116865,-0.003863,-1,-0.281524,-0.289782,-0.540822,-0.560073,...,-0.120927,1.709458e-14,0,-0.099291,-9.741489,0,1.0,-0.052589,1.0,-8.38995
3,3,-1.566412,-0.569671,0.011744,-0.541174,-1,-0.185289,-0.272329,-0.609658,0.244787,...,-0.407283,1.709458e-14,0,0.019992,4.071781,0,1.0,-0.052494,1.0,-4.0102
4,4,-1.566397,0.249718,-0.116455,-0.010555,-1,-0.280325,-0.272886,-0.582301,-0.592546,...,-0.293478,1.709458e-14,0,-0.167087,4.094982,0,1.0,-0.052436,1.0,-7.349849


In [164]:
#LSTM
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

time_series_features = train[['stock_id',
'volatility_10',
'reference_price',    
'market_urgency',
'price_spread',
'price_pressure',
'imbalance_size_for_buy_sell',
'imbalance_buy_sell_flag',
'imbalance_size_rolled_sum',
'matched_size_rolled_sum',
'bid_size_rolled_sum',
'ask_size_rolled_sum',
'wap_lagged',
'ask_price',
'bid_price',
'far_price',
'bid_size',
'ask_size',
'near_price',
'time_id',
'wap_time_decay',
'mid_price',
'imb_s2',
'imb_s1',
'rsi_14',
'wap',
'seconds_sin',
'seconds_cos',
'moving_avg_20',
'near_price',
'matched_imbalance',
'date_cos']]

target = train['target']

X_train, X_test, y_train, y_test = train_test_split(time_series_features, target, test_size=0.2, random_state=42)

X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model with early stopping
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Root Mean Squared Error: 8.87521376231561
Mean Absolute Error: 5.796060174346662


In [165]:
#CNN
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.callbacks import EarlyStopping

time_series_features = train[['stock_id',
'volatility_10',
'reference_price',    
'market_urgency',
'price_spread',
'price_pressure',
'imbalance_size_for_buy_sell',
'imbalance_buy_sell_flag',
'imbalance_size_rolled_sum',
'matched_size_rolled_sum',
'bid_size_rolled_sum',
'ask_size_rolled_sum',
'wap_lagged',
'ask_price',
'bid_price',
'far_price',
'bid_size',
'ask_size',
'near_price',
'time_id',
'wap_time_decay',
'mid_price',
'imb_s2',
'imb_s1',
'rsi_14',
'wap',
'seconds_sin',
'seconds_cos',
'moving_avg_20',
'near_price',
'matched_imbalance',
'date_cos']]

target = train['target']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(time_series_features.values)

time_steps = 10
samples = len(train) - time_steps
num_features = len(time_series_features.columns)

X = np.zeros((samples, time_steps, num_features))

for i in range(samples):
    X[i] = scaled_features[i:i+time_steps]

Y = target[time_steps:].values
assert len(Y) == len(X), "The lengths of X and Y do not match."

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(time_steps, len(time_series_features.columns))))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))

early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min', restore_best_weights=True)

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, Y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])
model.summary()

test_loss = model.evaluate(X_test, Y_test)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")

mae = mean_absolute_error(Y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: early stopping
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 8, 64)             6592      
                                                                 
 max_pooling1d (MaxPooling1  (None, 4, 64)             0         
 D)                                                              
                                                                 
 flatten_6 (Flatten)         (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 50)                12850     
                                                      

In [210]:
#CNN+LSTM
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
from tensorflow.keras.callbacks import EarlyStopping

time_series_features = train[['stock_id',
'volatility_10',
'reference_price',    
'market_urgency',
'price_spread',
'price_pressure',
'imbalance_size_for_buy_sell',
'imbalance_buy_sell_flag',
'imbalance_size_rolled_sum',
'matched_size_rolled_sum',
'bid_size_rolled_sum',
'ask_size_rolled_sum',
'wap_lagged',
'ask_price',
'bid_price',
'far_price',
'bid_size',
'ask_size',
'near_price',
'time_id',
'wap_time_decay',
'mid_price',
'imb_s2',
'imb_s1',
'rsi_14',
'wap',
'seconds_sin',
'seconds_cos',
'moving_avg_20',
'near_price',
'matched_imbalance',
'date_cos']]

target = train['target']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(time_series_features.values)

time_steps = 10
samples = len(train) - time_steps
num_features = len(time_series_features.columns)
X = np.zeros((samples, time_steps, num_features))

for i in range(samples):
    X[i] = scaled_features[i:i+time_steps]

Y = target[time_steps:].values
assert len(Y) == len(X), "The lengths of X and Y do not match."

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

features = 32

Y_train_seq = Y_train
Y_test_seq = Y_test

assert len(X_train) == len(Y_train_seq), "Mismatch in train sequence lengths."
assert len(X_test) == len(Y_test_seq), "Mismatch in test sequence lengths."

input_layer = Input(shape=(time_steps, features))
lstm_layer = LSTM(50, return_sequences=False)(input_layer)
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(input_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)
cnn_layer = Flatten()(cnn_layer)
combined = Concatenate()([lstm_layer, cnn_layer])
dense_layer = Dense(50, activation='relu')(combined)
output_layer = Dense(1)(dense_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train, Y_train_seq, epochs=100, batch_size=32, 
                    validation_data=(X_test, Y_test_seq), 
                    callbacks=[early_stopping],
                    verbose=1)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test_seq, y_pred))
print(f"Root Mean Squared Error: {rmse}")
mae = mean_absolute_error(Y_test_seq, y_pred)
print(f"Mean Absolute Error: {mae}")

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 10, 32)]             0         []                            
                                                                                                  
 conv1d_2 (Conv1D)           (None, 8, 64)                6208      ['input_2[0][0]']             
                                                                                                  
 max_pooling1d_2 (MaxPoolin  (None, 4, 64)                0         ['conv1d_2[0][0]']            
 g1D)                                                                                             
                                                                                                  
 lstm_5 (LSTM)               (None, 50)                   16600     ['input_2[0][0]']      

In [177]:
#LightGBM
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X = train.drop('target', axis=1)  
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

lgb_params = {
    "objective": "mae",
    "n_estimators": 5000,
    "num_leaves": 256,
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "learning_rate": 0.00871,
    "n_jobs": 4,
    "device": "gpu",
    "verbosity": 1,  
    "importance_type": "gain",
}

def early_stopping_callback(env):
    if env.iteration >= 100 and len(env.evaluation_result_list) >= 101 and \
            env.evaluation_result_list[-1][2] <= env.evaluation_result_list[-101][2]:
        raise Exception("Early stopping")

model = lgb.train(lgb_params, train_data, valid_sets=[train_data, test_data], num_boost_round=5000,
                  callbacks=[early_stopping_callback])

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R^2):", r2)



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6999
[LightGBM] [Info] Number of data points in the train set: 838860, number of used features: 34
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 33 dense feature groups (28.80 MB) transferred to GPU in 0.017288 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.069737
Root Mean Squared Error: 8.909479414434584
Mean Squared Error (MSE): 68.61060498780951
Mean Absolute Error (MAE): 5.357441633545923
R-squared (R^2): 0.14823609658329695


In [224]:
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X = train.drop('target', axis=1)
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = cb.CatBoostRegressor(
    iterations=20000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=200, 
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse}")
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R^2):", r2)

0:	learn: 8.9294882	test: 8.9623235	best: 8.9623235 (0)	total: 41.5ms	remaining: 13m 49s
100:	learn: 8.6737866	test: 8.7802789	best: 8.7802789 (100)	total: 2.73s	remaining: 8m 58s
200:	learn: 8.5728657	test: 8.7229402	best: 8.7229402 (200)	total: 5.36s	remaining: 8m 47s
300:	learn: 8.4948418	test: 8.6746915	best: 8.6746915 (300)	total: 8.22s	remaining: 8m 58s
400:	learn: 8.4336025	test: 8.6388244	best: 8.6388244 (400)	total: 11s	remaining: 8m 56s
500:	learn: 8.3783744	test: 8.6050211	best: 8.6050211 (500)	total: 13.7s	remaining: 8m 51s
600:	learn: 8.3299354	test: 8.5772253	best: 8.5772253 (600)	total: 16.2s	remaining: 8m 44s
700:	learn: 8.2864969	test: 8.5496713	best: 8.5496713 (700)	total: 18.7s	remaining: 8m 34s
800:	learn: 8.2442455	test: 8.5246467	best: 8.5246467 (800)	total: 21.2s	remaining: 8m 29s
900:	learn: 8.2051541	test: 8.5005905	best: 8.5005905 (900)	total: 23.8s	remaining: 8m 24s
1000:	learn: 8.1662937	test: 8.4773516	best: 8.4773516 (1000)	total: 26.4s	remaining: 8m 21s
1

In [230]:
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

X = train.drop('target', axis=1)
y = train['target']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

catboost_reg = CatBoostRegressor(iterations=15000, learning_rate=0.1, depth=3, verbose=100, early_stopping_rounds=100)
catboost_reg.fit(X_train, y_train, eval_set=(X_val, y_val))

lgbm_reg = LGBMRegressor(n_estimators=5000, learning_rate=0.1)
lgbm_reg.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='l1',
)

ensemble_reg = VotingRegressor(estimators=[
    ('catboost', catboost_reg),
    ('lightgbm', lgbm_reg)
])

ensemble_reg.fit(X_train, y_train)
y_pred = ensemble_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print("RMSE:", rmse)
print("MAE:", mae)

0:	learn: 8.9256845	test: 8.9938600	best: 8.9938600 (0)	total: 21.1ms	remaining: 5m 15s
100:	learn: 8.7973636	test: 8.8745814	best: 8.8745814 (100)	total: 1.61s	remaining: 3m 58s
200:	learn: 8.7528215	test: 8.8451261	best: 8.8451261 (200)	total: 3.13s	remaining: 3m 50s
300:	learn: 8.7197124	test: 8.8230832	best: 8.8230832 (300)	total: 4.67s	remaining: 3m 48s
400:	learn: 8.6951105	test: 8.8052595	best: 8.8052595 (400)	total: 6.19s	remaining: 3m 45s
500:	learn: 8.6751524	test: 8.7951475	best: 8.7951475 (500)	total: 7.68s	remaining: 3m 42s
600:	learn: 8.6578284	test: 8.7846800	best: 8.7846800 (600)	total: 9.34s	remaining: 3m 43s
700:	learn: 8.6405027	test: 8.7745495	best: 8.7745495 (700)	total: 10.9s	remaining: 3m 42s
800:	learn: 8.6264825	test: 8.7684566	best: 8.7684566 (800)	total: 12.5s	remaining: 3m 42s
900:	learn: 8.6105975	test: 8.7619951	best: 8.7619951 (900)	total: 14.5s	remaining: 3m 47s
1000:	learn: 8.5967064	test: 8.7521975	best: 8.7521973 (999)	total: 17.4s	remaining: 4m 2s
11