# meta model

In [1]:
import pandas as pd

df_feature = pd.read_parquet("data/features_d002.parquet")
df_label = pd.read_parquet("data/label_d002.parquet")
df_label.head()

Unnamed: 0_level_0,close,ret,trgt,bin,side
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00+00:00,7177.93,,,0.0,0.0
2020-01-01 00:03:00+00:00,7175.61,,,0.0,0.0
2020-01-01 00:06:00+00:00,7182.06,,,0.0,0.0
2020-01-01 00:09:00+00:00,7183.07,,,0.0,0.0
2020-01-01 00:12:00+00:00,7176.26,,,0.0,0.0


In [2]:
from custom_indicators.selection import META_1M, META_3M, META_15M

invalid_len = 300
date = "2024-09-01"
train_mask = (df_label.index <= date).tolist()
test_mask = (df_label.index > date).tolist()

meta_features_col = META_1M + META_3M + META_15M

train_feature = df_feature[train_mask][meta_features_col].iloc[invalid_len:]
train_label = df_label[train_mask].iloc[invalid_len:]
test_feature = df_feature[test_mask][meta_features_col].iloc[invalid_len:]
test_label = df_label[test_mask].iloc[invalid_len:]

print(train_feature.shape)
print(train_label.shape)
print(test_feature.shape)
print(test_label.shape)

valid_train = train_label["ret"].notna().tolist()
valid_test = test_label["ret"].notna().tolist()

train_feature = train_feature[valid_train]
train_label = train_label[valid_train]["bin"].astype(int)
test_feature = test_feature[valid_test]
test_label = test_label[valid_test]["bin"].astype(int)

print(train_feature.shape)
print(train_label.shape)
print(test_feature.shape)
print(test_label.shape)

test_label.value_counts()

(818101, 696)
(818101, 5)
(72659, 696)
(72659, 5)
(613257, 696)
(613257,)
(54596, 696)
(54596,)


bin
1    29437
0    25159
Name: count, dtype: int64

In [5]:
import lightgbm as lgb
import optuna
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "num_threads": -1,
        "verbose": -1,
        "boosting": trial.suggest_categorical("boosting", ["gbdt", "dart"]),
        "num_leaves": trial.suggest_int("num_leaves", 100, 200),
        "max_depth": trial.suggest_int("max_depth", 30, 50),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 1e-5, 0.01),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-3, 1),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-3, 1),
    }
    dtrain = lgb.Dataset(train_feature, train_label)
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=trial.suggest_int("num_boost_round", 100, 800),
    )
    pred_proba = model.predict(test_feature)
    pred_label = (pred_proba > 0.5).astype(int)
    precision = precision_score(test_label, pred_label)
    recall = recall_score(test_label, pred_label)
    f1 = f1_score(test_label, pred_label)
    auc = roc_auc_score(test_label, pred_proba)
    print(f"precision: {precision}, recall: {recall}, f1: {f1}")
    return auc


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-02-18 17:47:03,469] A new study created in memory with name: no-name-1109d29f-5b8f-4068-ac91-d71f7005426b
Best trial: 0. Best value: 0.684156:   1%|          | 1/100 [01:46<2:56:19, 106.86s/it]

precision: 0.6591320702145445, recall: 0.6428983931786527, f1: 0.6509140311957213
[I 2025-02-18 17:48:50,331] Trial 0 finished with value: 0.684156277033666 and parameters: {'boosting': 'gbdt', 'num_leaves': 200, 'max_depth': 48, 'min_gain_to_split': 0.0069560631957667745, 'min_data_in_leaf': 129, 'lambda_l1': 0.7164359202041282, 'lambda_l2': 0.2870025078125416, 'num_boost_round': 301}. Best is trial 0 with value: 0.684156277033666.


Best trial: 0. Best value: 0.684156:   2%|▏         | 2/100 [04:55<4:12:59, 154.89s/it]

precision: 0.6622769142199194, recall: 0.6252675204674389, f1: 0.6432403152247986
[I 2025-02-18 17:51:58,849] Trial 1 finished with value: 0.6791700339058927 and parameters: {'boosting': 'gbdt', 'num_leaves': 181, 'max_depth': 44, 'min_gain_to_split': 0.002435871599578141, 'min_data_in_leaf': 108, 'lambda_l1': 0.6825842144948354, 'lambda_l2': 0.64153410801934, 'num_boost_round': 658}. Best is trial 0 with value: 0.684156277033666.


Best trial: 2. Best value: 0.694159:   3%|▎         | 3/100 [11:40<7:15:20, 269.28s/it]

precision: 0.6748167169067024, recall: 0.6159934775962225, f1: 0.6440647865312211
[I 2025-02-18 17:58:44,249] Trial 2 finished with value: 0.6941585848345657 and parameters: {'boosting': 'dart', 'num_leaves': 127, 'max_depth': 32, 'min_gain_to_split': 0.006371696316617711, 'min_data_in_leaf': 118, 'lambda_l1': 0.1933451198817051, 'lambda_l2': 0.5703534127302019, 'num_boost_round': 670}. Best is trial 2 with value: 0.6941585848345657.


Best trial: 2. Best value: 0.694159:   4%|▍         | 4/100 [14:51<6:21:24, 238.38s/it]

precision: 0.6654861440724379, recall: 0.6191867377789856, f1: 0.6415021293070073
[I 2025-02-18 18:01:55,253] Trial 3 finished with value: 0.6811853497984433 and parameters: {'boosting': 'gbdt', 'num_leaves': 181, 'max_depth': 39, 'min_gain_to_split': 0.00886013783675626, 'min_data_in_leaf': 69, 'lambda_l1': 0.5703580199340439, 'lambda_l2': 0.4727228410073518, 'num_boost_round': 677}. Best is trial 2 with value: 0.6941585848345657.


Best trial: 2. Best value: 0.694159:   4%|▍         | 4/100 [15:33<6:13:17, 233.31s/it]

[W 2025-02-18 18:02:36,692] Trial 4 failed with parameters: {'boosting': 'dart', 'num_leaves': 120, 'max_depth': 37, 'min_gain_to_split': 0.006961492647097616, 'min_data_in_leaf': 120, 'lambda_l1': 0.4208679419830419, 'lambda_l2': 0.8720583495578544, 'num_boost_round': 319} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/jesse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/h9/d_xpszfn6yz9r8zww450m44w0000gn/T/ipykernel_25218/1035672812.py", line 21, in objective
    model = lgb.train(
            ^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniforge/base/envs/jesse/lib/python3.11/site-packages/lightgbm/engine.py", line 307, in train
    booster.update(fobj=fobj)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/jesse/lib/python3.11/site-packages/lightgbm/basic.py", line 4136, i




KeyboardInterrupt: 

# side models

In [7]:
from custom_indicators.selection import SIDE_1M, SIDE_3M, SIDE_15M

invalid_len = 300
date = "2024-09-01"
train_mask = (df_label.index <= date).tolist()
test_mask = (df_label.index > date).tolist()

side_features_col = SIDE_1M + SIDE_3M + SIDE_15M

train_feature = df_feature[train_mask][side_features_col].iloc[invalid_len:]
train_label = df_label[train_mask].iloc[invalid_len:]
test_feature = df_feature[test_mask][side_features_col].iloc[invalid_len:]
test_label = df_label[test_mask].iloc[invalid_len:]

print(train_feature.shape)
print(train_label.shape)
print(test_feature.shape)
print(test_label.shape)

valid_train = (train_label["side"].astype(int) != 0).tolist()
valid_test = (test_label["side"].astype(int) != 0).tolist()

train_feature = train_feature[valid_train]
train_label = (train_label[valid_train]["side"].astype(int) == 1).astype(int)
test_feature = test_feature[valid_test]
test_label = (test_label[valid_test]["side"].astype(int) == 1).astype(int)

print(train_feature.shape)
print(train_label.shape)
print(test_feature.shape)
print(test_label.shape)

test_label.value_counts()

(818101, 374)
(818101, 5)
(72659, 374)
(72659, 5)
(318731, 374)
(318731,)
(29437, 374)
(29437,)


side
1    14791
0    14646
Name: count, dtype: int64

In [None]:
# TODO: optuna lightgbm