In [11]:
import os
import itertools
from pathlib import Path
from typing import List, Tuple
from joblib import Parallel, delayed


import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer


In [4]:
import torch
torch.cuda.is_available()

True

## Feature Engineering

In [8]:
DATA_DIR = Path(os.getcwd()).parent.joinpath("data")

RVOL_GRID = {
    "wap2_wt": [0, 0.15],
    "ewa_alpha": [0.2, 0.6, 1],
    "shift_size": [1, 3, 5] 
}

In [12]:
def wap_ewa(df, wap2_wt, ewa_alpha):
    df['wap1'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    df['wap2'] = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    df[f'wap_{wap2_wt}_{ewa_alpha}'] = (df['wap1'] * (1 - wap2_wt) + df['wap2'] * wap2_wt).ewm(alpha=ewa_alpha).mean()
    return df

def spread_ratio(df):
    df['spread_ratio'] = ((df['ask_price1'] / df['bid_price1'] - 1) * 10000)
    return df

def volume_imbalance(df):
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    return df

def calc_rvol(x: pd.Series, shift_size: int):
    x = np.log(x).diff(shift_size)
    return np.sqrt(np.sum(x**2))

def create_rvol_calc(shift_size):
    return lambda x: calc_rvol(x, shift_size)

def engineer_book_features(stock_id, train_test):
    file_path = load_data(stock_id, "book", train_test)
    df_book = pd.read_parquet(file_path)

    for wap2_wt, ewa_alpha in itertools.product(*list(RVOL_GRID.values())[:2]):
        df_book = wap_ewa(df_book, wap2_wt, ewa_alpha)

    df_book = spread_ratio(df_book)
    df_book = volume_imbalance(df_book)
    
    agg_dict = {
        'spread_ratio_mean': ('spread_ratio', 'mean'),
        'spread_ratio_std': ('spread_ratio', 'std'),
        'volume_imbalance': ('volume_imbalance', 'mean')
    }

    for wap2_wt, ewa_alpha, shift_size in itertools.product(*RVOL_GRID.values()):
        agg_dict[f'rvol_{wap2_wt}_{ewa_alpha}_{shift_size}'] = (f"wap_{wap2_wt}_{ewa_alpha}", create_rvol_calc(shift_size))

    df_book_features = df_book.groupby('time_id').agg(**agg_dict).reset_index()
    df_book_features['stock_id'] = stock_id
    
    return df_book_features

def engineer_trade_features(stock_id, train_test):
    file_path = load_data(stock_id, "trade", train_test)
    df_trade = pd.read_parquet(file_path)

    df_trade_features = df_trade.groupby('time_id').agg(
        trade_volume = ('size', 'sum'),
        trade_count = ('order_count', 'sum'),
    ).reset_index()
    df_trade_features['stock_id'] = stock_id

    return df_trade_features

def load_data(stock_id: int, data_type: str, train_test: str):
    file_dir = DATA_DIR.joinpath(f"{data_type}_{train_test}.parquet").joinpath(f"stock_id={stock_id}")
    file_name = os.listdir(file_dir)[0]
    full_path = file_dir.joinpath(file_name)
    return full_path


def for_joblib(stock_id, train_test):
    df_bk = engineer_book_features(stock_id, train_test)
    df_tr = engineer_trade_features(stock_id, train_test)
    
    df_tmp = pd.merge(df_bk, df_tr, on=['stock_id', 'time_id'], how='left')

    return df_tmp

def engineer_features(stock_ids: List[int], train_test: str):
    dfs = Parallel(n_jobs=-1, verbose=1)(delayed(for_joblib)(stock_id, train_test) for stock_id in stock_ids)
    df = pd.concat(dfs, ignore_index=True)
    df["row_id"] = df["stock_id"].astype(str) + "-" + df["time_id"].astype(str)
    return df

def engineer_data(load_local: bool = False, save: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if load_local:
        df_train = pd.read_parquet(DATA_DIR.joinpath("train_features.parquet"))
        df_test = pd.read_parquet(DATA_DIR.joinpath("test_features.parquet"))
        return df_train, df_test
    
    df_train_targets = pd.read_csv(DATA_DIR.joinpath("train.csv"))
    train_ids = df_train_targets["stock_id"].unique()
    df_train = engineer_features(train_ids, "train")
    df_train = pd.merge(df_train, df_train_targets, on=['stock_id', 'time_id'], how='inner')

    test_df = pd.read_csv(DATA_DIR.joinpath("test.csv"))
    test_ids = test_df["stock_id"].unique()
    df_test = engineer_features(test_ids, "test")

    if save:
        df_train.to_parquet(DATA_DIR.joinpath("train_features.parquet"))
        df_test.to_parquet(DATA_DIR.joinpath("test_features.parquet"))

    return df_train, df_test

In [None]:
df_train, df_test = engineer_data(load_local=True)

(1, 25)

## Modelling

In [None]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

def feval_rmspe(y_pred, lgb_train: lgb.Dataset):
    y_true = lgb_train.get_label()
    return 'RMSPE', round(rmspe(y_true, y_pred),8), False

rmspe_scorer = make_scorer(rmspe, greater_is_better=False)

def calc_model_importance(model, feature_names=None, importance_type='gain'):
    importance_df = pd.DataFrame(model.feature_importance(importance_type=importance_type),
                                 index=feature_names,
                                 columns=['importance']).sort_values('importance')
    return importance_df

def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(8, 12)):
    _, ax = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=ax)
    if title:
        plt.title(title)
    plt.tight_layout()
    if save_filepath is None:
        plt.show()
    else:
        plt.savefig(save_filepath)
    plt.close()


In [161]:
train, valid = train_test_split(df_train, test_size=0.2, random_state=42)

X_train = train.drop(columns=['row_id', 'target', 'time_id']).values
y_train = train['target'].values

X_valid = valid.drop(columns=['row_id', 'target', 'time_id']).values
y_valid = valid['target'].values


In [None]:
from typing import Dict, Any, Tuple
from dataclasses import dataclass
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm


@dataclass
class ModelResult:
    model: lgb.Booster
    params: Dict[str, Any]
    train_rmspe: float
    valid_rmspe: float


@dataclass
class GridSearchResult:
    search_results: pd.DataFrame
    best_model: ModelResult


def mlp_trainer(X_train, y_train, X_valid, y_valid, params) -> ModelResult:
    model = MLPRegressor(random_state=42, **params)
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_rmspe = rmspe(y_train, y_train_pred)
    valid_rmspe = rmspe(y_valid, y_valid_pred)
    
    return ModelResult(model, params, train_rmspe, valid_rmspe)

def lr_trainer(X_train, y_train, X_valid, y_valid, params) -> ModelResult:
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_rmspe = rmspe(y_train, y_train_pred)
    valid_rmspe = rmspe(y_valid, y_valid_pred)

    return ModelResult(model, params, train_rmspe, valid_rmspe)

def lgbm_trainer(X_train, y_train, X_valid, y_valid, params) -> ModelResult:
    X_train_lgb = lgb.Dataset(X_train, y_train, categorical_feature=["stock_id"], weight=1/np.square(y_train))
    X_valid_lgb = lgb.Dataset(X_valid, y_valid, categorical_feature=["stock_id"], weight=1/np.square(y_valid))

    model = lgb.train(
        params, 
        X_train_lgb, 
        valid_sets=[X_train_lgb, X_valid_lgb], 
        num_boost_round=1000, 
        feval=feval_rmspe
    )
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_rmspe = rmspe(y_train, y_train_pred)
    valid_rmspe = rmspe(y_valid, y_valid_pred)

    return ModelResult(model, params, train_rmspe, valid_rmspe)


def cv_train(
    _model_trainer: callable,
    kf: KFold, 
    X: pd.DataFrame, 
    y: pd.DataFrame,
    params: Dict[str, Any]
) -> Tuple[Dict[str, Any], ModelResult]:
    """Cross Validation Run"""
    cv_models: List[ModelResult] = []

    for train_idx, valid_idx in tqdm(kf.split(X)):
        X_train, y_train = X.loc[train_idx], y[train_idx]
        X_valid, y_valid = X.loc[valid_idx], y[valid_idx]

        model_res = _model_trainer(X_train, y_train, X_valid, y_valid, params)
        cv_models.append(model_res)

    cv_results = params.copy()
    cv_results.update({
        "train_rmspe": np.mean([m.train_rmspe for m in cv_models]),
        "valid_rmspe": np.mean([m.valid_rmspe for m in cv_models]),
    })
    best_cv_model = cv_models[np.argmax([m.valid_rmspe for m in cv_models])]
    
    return GridSearchResult(pd.DataFrame([cv_results]), best_cv_model)


def grid_search(
    _model_trainer: callable,
    X: pd.DataFrame, 
    y: pd.DataFrame,
    search_grid: Dict[str, List[Any]],
    n_iter: int, 
    cv: int = 1, 
    randomise: bool = True,
    n_jobs: int = -1
) -> GridSearchResult:
    """ Generalized Grid Searcher"""
    
    # Construct the search grid
    param_combinations = [
        dict(zip(search_grid.keys(), p))
        for p in itertools.product(*search_grid.values())
    ]
    if randomise:
        param_combinations = np.random.choice(param_combinations, n_iter)

    # Setup the cross validation
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Run the grid search
    results: List[GridSearchResult] = Parallel(n_jobs=-n_jobs)(delayed(cv_train)(_model_trainer, kf, X, y, params) for params in tqdm(param_combinations))

    # Collect the results
    search_results = pd.concat([r.search_results for r in results], ignore_index=True)
    best_model = results[np.argmax([r.best_model.valid_rmspe for r in results])].best_model

    return GridSearchResult(search_results, best_model)


In [None]:
df_train = pd.read_parquet(DATA_DIR.joinpath("train_features.parquet"))
df_test = pd.read_parquet(DATA_DIR.joinpath("test_features.parquet"))

In [16]:
df_train

Unnamed: 0,time_id,spread_ratio_mean,spread_ratio_std,volume_imbalance,rvol_0_0.2_1,rvol_0_0.2_3,rvol_0_0.2_5,rvol_0_0.6_1,rvol_0_0.6_3,rvol_0_0.6_5,...,rvol_0.15_0.6_3,rvol_0.15_0.6_5,rvol_0.15_1_1,rvol_0.15_1_3,rvol_0.15_1_5,stock_id,trade_volume,trade_count,row_id,target
0,5,8.522233,2.115166,134.894040,0.001141,0.002906,0.004391,0.002651,0.005089,0.006629,...,0.004688,0.006222,0.003942,0.005861,0.007139,0,3179.0,110.0,0-5,0.004136
1,11,3.942811,1.572313,142.050000,0.001013,0.002489,0.003464,0.001204,0.002011,0.002148,...,0.001892,0.002001,0.001020,0.001454,0.001489,0,1289.0,57.0,0-11,0.001445
2,16,7.254260,1.636399,141.414894,0.000622,0.001629,0.002526,0.001356,0.002629,0.003570,...,0.002450,0.003421,0.002165,0.003070,0.003893,0,2161.0,68.0,0-16,0.002168
3,31,8.608392,2.802506,146.216667,0.001037,0.002614,0.003759,0.001764,0.003556,0.004645,...,0.003452,0.004520,0.002463,0.004044,0.004972,0,1962.0,59.0,0-31,0.002195
4,62,3.972501,1.300607,123.846591,0.000718,0.001767,0.002465,0.001238,0.002352,0.002963,...,0.002140,0.002689,0.001692,0.002420,0.002888,0,1791.0,89.0,0-62,0.001747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,32751,8.783802,2.352351,161.638710,0.000957,0.002344,0.003396,0.002232,0.004236,0.005424,...,0.003852,0.004991,0.003188,0.004795,0.005820,126,2570.0,103.0,126-32751,0.003461
428928,32753,7.058147,2.279731,150.578475,0.000981,0.002330,0.003395,0.002312,0.004043,0.005178,...,0.003764,0.004859,0.003622,0.004824,0.005783,126,2323.0,147.0,126-32753,0.003113
428929,32758,7.392257,2.399711,254.406250,0.001371,0.003340,0.004678,0.002179,0.003865,0.004649,...,0.003668,0.004424,0.002990,0.004149,0.004785,126,3740.0,98.0,126-32758,0.004070
428930,32763,5.301715,1.718243,145.654135,0.000983,0.002440,0.003596,0.002229,0.004270,0.005619,...,0.004083,0.005462,0.003281,0.004994,0.006290,126,9389.0,234.0,126-32763,0.003357


In [202]:
search_grid = {
    "objective": ["rmse"], 
    "metric": ["rmse"], 
    'early_stopping_rounds': [75],
    "num_leaves": [500, 1000, 5000],
    "max_depth": [50, 100],
    "learning_rate": [0.001, 0.01, 0.1],
    "reg_alpha": [0, 0.01],
}

x_data = df_train.drop(columns=['row_id', 'target', 'time_id'])
y_data = df_train['target']


gs_res = grid_search(
    lgbm_trainer, x_data, y_data, search_grid, n_iter=32, cv=4, randomise=True
)
gs_res.search_results

  0%|          | 0/32 [00:00<?, ?it/s]

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.000645878	training's RMSPE: 0.299269	valid_1's rmse: 0.000660417	valid_1's RMSPE: 0.304579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Did not meet early stopping

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[453]	training's rmse: 0.000441867	training's RMSPE: 0.20474	valid_1's rmse: 0.000530482	valid_1's RMSPE: 0.244654
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[359]	t

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[27]	training's rmse: 0.000387375	training's RMSPE: 0.179491	valid_1's rmse: 0.000537508	valid_1's RMSPE: 0.247894
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't 

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[514]	training's rmse: 0.000463402	training's RMSPE: 0.214718	valid_1's rmse: 0.000528107	valid_1's RMSPE: 0.243558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[417]	

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[488]	training's rmse: 0.000465528	training's RMSPE: 0.215703	valid_1's rmse: 0.000528392	valid_1's RMSPE: 0.24369
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't 

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[275]	training's rmse: 0.000385475	training's RMSPE: 0.178611	valid_1's rmse: 0.000534703	valid_1's RMSPE: 0.246601
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[256]	

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[514]	training's rmse: 0.000463402	training's RMSPE: 0.214718	valid_1's rmse: 0.000528107	valid_1's RMSPE: 0.243558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[417]	

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009874 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[49]	training's rmse: 0.000465971	training's RMSPE: 0.215908	valid_1's rmse: 0.000532043	valid_1's RMSPE: 0.245374
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Early stopping, best iteration is:
[34]	tr

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 75 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.000604577	training's RMSPE: 0.280132	valid_1's rmse: 0.00064821	valid_1's RMSPE: 0.298949
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5977
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 24
[LightGBM] [Info] Start training from score 0.001804
Training until validation scores don't improve for 75 rounds
Did not meet early stopping.

KeyboardInterrupt: 

## Evaluation