In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import h5py
import numpy as np
from typing import List, Tuple
from pathlib import Path
import lightgbm as lgb
import pickle

from alber.load_data import read_order_book, read_trades, read_target
from alber.feature_generation import (
    book_preprocessor, 
    get_features_zscore, 
    get_features_ma, 
    get_features_stoch,
    retime_trades,
    decrease_mem_consuming
)
from alber.wf_splitting_data import create_oot, walk_forward_splitting, sample_dates

# 0. Parameters

In [2]:
# Location of the HDF5 store that holds the raw order book and trades.
data_path = Path('../../Storage/alber/data.h5')
# Directory that contains the HDF5 files. The vitrine builders in this notebook
# take this directory and append 'data.h5' / 'result.h5' themselves.
base = data_path.parent

In [3]:
def get_only_time_features_vitrine(base: Path, n_rows: int = 1_000) -> pd.DataFrame:
    """Build a feature table keyed by ``time`` from the order book and trades.

    Both tables are read from ``base / 'data.h5'``, truncated to their first
    ``n_rows`` rows, passed through the project's preprocessing and feature
    helpers, and finally inner-joined on ``time``.

    Args:
        base: Directory that contains ``data.h5``.
        n_rows: Number of leading rows kept from each raw table. The default
            of 1_000 is the cap that used to be hard-coded; pass a larger
            value to build the table from more data.

    Returns:
        The merged feature frame with ``time`` cast to ``int``, duplicate
        ``time`` rows removed (first occurrence kept) and a fresh index,
        after ``decrease_mem_consuming`` has been applied to it.
    """
    h5_file = base / 'data.h5'
    ob = read_order_book(h5_file).head(n_rows)
    trades = read_trades(h5_file).head(n_rows)
    print(f'ob.shape == {ob.shape}, trades.shape == {trades.shape}')

    ob = book_preprocessor(ob)

    # Prepare trade features.
    # `assign` builds a new frame rather than writing into the object returned
    # by `head`, so the frame produced by `read_trades` is never touched.
    trades = trades.assign(id=0)
    trades = get_features_zscore(trades)
    trades = get_features_ma(trades)
    trades = get_features_stoch(trades)
    trades = retime_trades(trades, ob)

    # Preprocess result features vitrine: `id` is dropped from both sides so
    # that `time` is the only shared key of the join.
    ob = ob.drop(['id'], axis=1)
    trades = trades.drop(['id'], axis=1)
    features = pd.merge(ob, trades, on=['time'])
    features = features.astype({'time': int})
    features = features.drop_duplicates(['time']).reset_index(drop=True)
    features = decrease_mem_consuming(features, ['time'])

    return features


def get_vitrine_for_train(base: Path, list_of_features: List[str]) -> pd.DataFrame:
    """Assemble the training table: ``time``, ``target`` and the chosen features.

    Args:
        base: Directory that contains ``data.h5`` and ``result.h5``.
        list_of_features: Names of the feature columns to keep.

    Returns:
        A frame with the columns ``['time', 'target'] + list_of_features``,
        restricted to the ``time`` values present in both the feature table
        and the target table (inner join).

    Raises:
        KeyError: If a requested feature column is missing from the merged frame.
    """
    # The feature table has to be built first; without this call the merge
    # below would read `features` before it is assigned.
    features = get_only_time_features_vitrine(base)

    target = read_target(base / Path('result.h5'))
    features = pd.merge(features, target, on=['time'])

    return features[['time', 'target'] + list_of_features]

In [4]:
# Build the training table with a single feature column and display its
# (rows, columns) shape as the cell output.
train_vitrine = get_vitrine_for_train(base, ['stoch_k_price_21_1'])
train_vitrine.shape

  trades = pd.merge(trades, sum_w, on=['time'])
  trades = pd.merge(trades, sum_w, on=['time'])


ob.shape == (1000, 9), trades.shape == (1000, 4)
Ret
Sprd
Sprd_Up
Sprd_Down
price
size
order_count
Money


(1000, 3)

In [8]:
# Display the last 200 rows of the training table.
train_vitrine.tail(200)

Unnamed: 0,time,target,stoch_k_price_21_1
800,520263,0.000893,20.971563
801,520723,0.000895,34.597157
802,521324,0.000837,34.597157
803,521819,0.000692,38.441559
804,522689,0.000437,41.558441
...,...,...,...
995,665657,-0.000116,80.451126
996,666443,0.000233,80.451126
997,666676,0.000400,80.451126
998,667192,0.000447,80.451126


In [18]:
def prepare_dataset_for_train(train_vitrine: pd.DataFrame, dates: List[int]):
    """Wrap the rows whose ``time`` is in ``dates`` into a LightGBM dataset.

    Args:
        train_vitrine: Frame with a ``time`` column, a ``target`` column and
            one column per feature.
        dates: ``time`` values to keep.

    Returns:
        An ``lgb.Dataset`` whose data are all columns except ``time`` and
        ``target`` and whose label is the ``target`` column. No column is
        declared categorical.
    """
    # A boolean mask is used instead of formatting the list into a query
    # string: the mask does not depend on how the elements of `dates` happen
    # to print, and it avoids parsing a long expression.
    selected = train_vitrine[train_vitrine['time'].isin(dates)].reset_index(drop=True)

    return lgb.Dataset(
        selected.drop(['time', 'target'], axis=1),
        selected['target'],
        categorical_feature=[],
    )



def train(
    train_vitrine: pd.DataFrame, 
    name: str, 
    train_val_ratio: float = 0.1,
    save_path: Path = Path('../saved_models'),
) -> None:
    """Fit a LightGBM regressor on ``train_vitrine`` and pickle it to disk.

    The distinct ``time`` values are split into a training part and a
    validation part with ``sample_dates``; a booster is trained with early
    stopping and written to ``save_path / (name + '.pkl')``.

    Args:
        train_vitrine: Frame with ``time``, ``target`` and feature columns.
        name: File stem of the pickled model.
        train_val_ratio: Fraction of the distinct ``time`` values reserved
            for validation (rounded down).
        save_path: Directory the model is written to; created if missing,
            including missing parent directories.
    """
    curr_setting = {
        "verbose_eval": 50,
        "num_boost_round": 500,
        "early_stopping_rounds": 50,
        "params": {
            # NOTE(review): with max_depth == 6 a tree cannot have more than
            # 2**6 == 64 leaves, so this value looks like it only acts as
            # "no extra limit" - confirm that is intended.
            "num_leaves": 131_072,
            "max_bin": 256,
            "learning_rate": 0.01,
            "objective": "regression",
            "metric": "rmse",
            "max_depth": 6,
            "feature_fraction": 1.0,
            "feature_fraction_bynode": 0.6,
            "bagging_fraction": 1.0
        },
    }

    # Splitting train_vitrine into train/val.
    # NOTE(review): the negative size presumably asks `sample_dates` for the
    # trailing dates as the validation part - confirm against its definition.
    all_dates = sorted(set(train_vitrine.time))
    train_val_size = int(train_val_ratio * len(all_dates))
    train_dates, val_dates = sample_dates(all_dates, -train_val_size)
    print(f'len_train_dates == {len(train_dates)}, len_val_dates == {len(val_dates)}')

    # Named `train_set` / `val_set` so the local does not shadow this function.
    train_set = prepare_dataset_for_train(train_vitrine, train_dates)
    val_set = prepare_dataset_for_train(train_vitrine, val_dates)
    # Drops only this function's reference; the frame is released early only
    # when the caller does not hold another reference to it.
    del train_vitrine

    # Train the model.
    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer ones expect the
    # `lgb.log_evaluation` / `lgb.early_stopping` callbacks instead - confirm
    # the installed version before upgrading.
    model = lgb.train(
        curr_setting["params"],
        train_set,
        valid_sets=[train_set, val_set],
        verbose_eval=curr_setting["verbose_eval"],
        num_boost_round=curr_setting["num_boost_round"],
        early_stopping_rounds=curr_setting["early_stopping_rounds"],
    )

    # Save the model. `parents=True` lets a nested `save_path` be created in
    # one go instead of failing when an intermediate directory is missing.
    save_path.mkdir(parents=True, exist_ok=True)
    with open(save_path / Path(name + ".pkl"), "wb") as f:
        pickle.dump(model, f, protocol=2)

In [19]:
# Fit the model on the training table and store it under the name 'baseline'.
train(train_vitrine, 'baseline')

len_train_dates == 900, len_val_dates == 100
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 900, number of used features: 1
[LightGBM] [Info] Start training from score 0.000179




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000931164	valid_1's rmse: 0.000915257
[100]	training's rmse: 0.000896753	valid_1's rmse: 0.000883211


[150]	training's rmse: 0.000878313	valid_1's rmse: 0.000867795
[200]	training's rmse: 0.000866673	valid_1's rmse: 0.000860534


[250]	training's rmse: 0.000858863	valid_1's rmse: 0.000857204
[300]	training's rmse: 0.000853087	valid_1's rmse: 0.000854327


[350]	training's rmse: 0.000848513	valid_1's rmse: 0.000852609
[400]	training's rmse: 0.000843544	valid_1's rmse: 0.000850323


[450]	training's rmse: 0.000838753	valid_1's rmse: 0.000847892
[500]	training's rmse: 0.00083408	valid_1's rmse: 0.000844904
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.00083408	valid_1's rmse: 0.000844904
