# model L4

In [2]:
import pandas as pd

# Read the feature table stored for the L4 model and report its (rows, columns) shape.
features_path = "data/feat_hard_L4.parquet"
df_features = pd.read_parquet(features_path)
print(df_features.shape)

# Missing-value count per column, largest first, as a quick sanity check.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(5440, 23507)


bar_duration                    0
acp_pwr_17_curv20_lag3          0
acp_pwr_17_curv20_lag1          0
acp_pwr_16_curv20_lag5          0
acp_pwr_16_curv20_lag4          0
                               ..
comb_spectrum_30_mean20_lag2    0
comb_spectrum_30_mean20_lag1    0
comb_spectrum_29_mean20_lag5    0
comb_spectrum_29_mean20_lag4    0
vmd_w256_2_ddt_lag5             0
Length: 23507, dtype: int64

In [3]:
# Show only the first row: enough to eyeball the column layout without rendering the whole frame.
df_features.iloc[:1]

Unnamed: 0_level_0,bar_duration,adx_7,adx_14,aroon_diff,ac_0,ac_1,ac_2,ac_3,ac_4,ac_5,...,vmd_w256_1_ddt_lag1,vmd_w256_1_ddt_lag2,vmd_w256_1_ddt_lag3,vmd_w256_1_ddt_lag4,vmd_w256_1_ddt_lag5,vmd_w256_2_ddt_lag1,vmd_w256_2_ddt_lag2,vmd_w256_2_ddt_lag3,vmd_w256_2_ddt_lag4,vmd_w256_2_ddt_lag5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1661518620000,90.0,20.178637,17.321815,42.857143,0.932462,0.871685,0.812335,0.761803,0.722974,0.696197,...,-44.052622,56.388157,31.109218,-111.849236,66.938945,27.155345,-20.571826,3.643325,7.348517,-4.117138


In [4]:
import json

# Persist the names of the fractional-difference columns under the "fracdiff" key.
# The filter uses the same 'frac_' / '_diff' predicates as the cell that sizes the
# SSM models (obs_dim=len(features)), so the saved list cannot drift from the
# columns the models were actually fitted on.
fracdiff_columns = [
    col
    for col in df_features.columns
    if col.startswith("frac_") and col.endswith("_diff")
]
feature_info = {"fracdiff": fracdiff_columns}

# NOTE: this rewrites the file from scratch; the per-model keys are added by later cells.
with open("strategies/BinanceBtcDeapV1Voting/models/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

In [5]:
import numpy as np

# Load the label array saved for the L4 model and print its shape
# (expected to have one entry per feature row — verified implicitly by the masking below).
label_path = "data/label_hard_L4.npy"
label = np.load(label_path)

print(label.shape)

(5440,)


In [6]:
from jesse.helpers import date_to_timestamp

# Chronological split: rows whose index value is below the cutoff go to train,
# rows at or after the cutoff go to test.
split_ts = date_to_timestamp('2025-03-01')
index_values = df_features.index.to_numpy()

train_mask = index_values < split_ts
test_mask = index_values >= split_ts

# The same boolean masks slice both the feature frame and the label array.
train_x, test_x = df_features[train_mask], df_features[test_mask]
train_y, test_y = label[train_mask], label[test_mask]

print(train_x.shape, test_x.shape, sep="\n")

  import pkg_resources


(4632, 23507)
(808, 23507)


In [7]:
# train deep ssm model
from src.models.deep_ssm import DeepSSMConfig, DeepSSM

# Input columns for the state-space models: names that start with 'frac_' and end with '_diff'.
features = [
    col
    for col in df_features.columns
    if col.startswith('frac_') and col.endswith('_diff')
]
assert features

# The observation dimension is the number of selected input columns.
deep_ssm_config = DeepSSMConfig(obs_dim=len(features))
deep_ssm_model = DeepSSM(config=deep_ssm_config)

# Fit on the training rows only.
deep_ssm_inputs = train_x[features]
deep_ssm_model.fit(deep_ssm_inputs)

Epoch 10/50 | Train Loss: 109.3612
Epoch 20/50 | Train Loss: 101.4140
Epoch 30/50 | Train Loss: 95.8559
Epoch 40/50 | Train Loss: 87.7005
Epoch 50/50 | Train Loss: 76.4519


<src.models.deep_ssm.deep_ssm.DeepSSM at 0x309a0bb10>

In [8]:
# Store the fitted DeepSSM next to the strategy so later cells can reload it by this path stem.
deep_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/deep_ssm"
deep_ssm_model.save(deep_ssm_path)

Model saved to strategies/BinanceBtcDeapV1Voting/models/deep_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/deep_ssm.json


In [9]:
# Run the fitted DeepSSM over the training inputs and check the shape of what it returns.
deep_ssm_inputs = train_x[features]
feat_deep_ssm = deep_ssm_model.transform(deep_ssm_inputs)
feat_deep_ssm.shape

(4632, 5)

In [10]:
# train lg ssm
from src.models.lgssm import LGSSM, LGSSMConfig

# LGSSM sized to the same input columns as the DeepSSM above.
lg_ssm_config = LGSSMConfig(obs_dim=len(features))
lg_ssm_model = LGSSM(config=lg_ssm_config)

# Fit on the training rows only.
lg_ssm_inputs = train_x[features]
lg_ssm_model.fit(lg_ssm_inputs)

Epoch 10/50 | ELBO: -268.5125
Epoch 20/50 | ELBO: -182.3199
Epoch 30/50 | ELBO: -145.2410
Epoch 40/50 | ELBO: -125.7715
Epoch 50/50 | ELBO: -108.7511


LGSSM(
  (kalman_filter): KalmanFilter()
)

In [11]:
# Store the fitted LGSSM next to the strategy so later cells can reload it by this path stem.
lg_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/lg_ssm"
lg_ssm_model.save(lg_ssm_path)

Model saved to strategies/BinanceBtcDeapV1Voting/models/lg_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/lg_ssm.json


In [12]:
# Run the fitted LGSSM over the training inputs and check the shape of what it returns.
lg_ssm_inputs = train_x[features]
feat_lg_ssm = lg_ssm_model.predict(lg_ssm_inputs)
feat_lg_ssm.shape

(4632, 5)

In [13]:
# Wrap both SSM outputs as DataFrames aligned to the training index.
deep_ssm_cols = [f"deep_ssm_{i}" for i in range(feat_deep_ssm.shape[1])]
# Fix: size the lg_ssm column names from feat_lg_ssm itself. The original reused
# feat_deep_ssm.shape[1], which only works while both models emit the same number
# of columns and otherwise makes the DataFrame constructor fail.
lg_ssm_cols = [f"lg_ssm_{i}" for i in range(feat_lg_ssm.shape[1])]

df_deep_ssm = pd.DataFrame(feat_deep_ssm, columns=deep_ssm_cols, index=train_x.index)
df_lg_ssm = pd.DataFrame(feat_lg_ssm, columns=lg_ssm_cols, index=train_x.index)

# Candidate pool for feature selection: SSM outputs first, then every original column.
df_feat_mix = pd.concat([df_deep_ssm, df_lg_ssm, train_x], axis=1)
df_feat_mix.head(1)

Unnamed: 0_level_0,deep_ssm_0,deep_ssm_1,deep_ssm_2,deep_ssm_3,deep_ssm_4,lg_ssm_0,lg_ssm_1,lg_ssm_2,lg_ssm_3,lg_ssm_4,...,vmd_w256_1_ddt_lag1,vmd_w256_1_ddt_lag2,vmd_w256_1_ddt_lag3,vmd_w256_1_ddt_lag4,vmd_w256_1_ddt_lag5,vmd_w256_2_ddt_lag1,vmd_w256_2_ddt_lag2,vmd_w256_2_ddt_lag3,vmd_w256_2_ddt_lag4,vmd_w256_2_ddt_lag5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1661518620000,-0.002385,0.032907,0.03649,-0.01428,0.043822,-0.731821,0.640844,-0.841441,-1.194446,-1.401191,...,-44.052622,56.388157,31.109218,-111.849236,66.938945,27.155345,-20.571826,3.643325,7.348517,-4.117138


In [14]:
from src.features.feature_selection.rfcq_selector import RFCQSelector

# Upper bound on the number of features the selector is asked to keep.
MAX_SELECTED_FEATURES = 3000

selector = RFCQSelector(max_features=MAX_SELECTED_FEATURES)
selector.fit(df_feat_mix, train_y)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: cmma (最大重要性: 0.0145)
➤ 计算特征冗余度...
➤ 总计选择3000个特征 (已选择1个，还需选择2999个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                          | 0/2999 [00:00<?, ?特征/s]


✅ 特征选择完成：从23517个特征中选择了3000个，舍弃了20517个
✅ 选择的特征: ['cmma', 'conv_0_dt', 'dft_36_mean20_lag3', 'fisher_ddt', 'ehlers_early_onset_trend_ddt', 'voss_0', 'vwap_dt', 'frac_c_h5_diff', 'forecast_oscillator_mean20', 'adx_7_dt', 'frac_c_l3_diff', 'trendflex_dt', 'frac_c_o1_diff_mean20', 'cmma_curv20_lag5', 'vwap_dt_lag1', 'williams_r', 'frac_c_h3_diff', 'frac_c_h1_diff_mean20', 'fisher_dt', 'frac_c_l5_diff', 'frac_c_l3_diff_mean20', 'vmd_w256_0_dt', 'frac_l_h3_diff', 'lg_ssm_0', 'vmd_w128_0', 'frac_c_l4_diff', 'evenbetter_sinewave_short_dt', 'frac_c_l1_diff_mean20', 'frac_l_l5_diff', 'stc_dt', 'frac_c_o3_diff', 'roofing_filter_ddt_lag1', 'frac_c_c1_diff_mean20', 'frac_l_l1_diff_mean20', 'frac_h_l5_diff', 'frac_c_o2_diff_mean20', 'frac_h_l3_diff', 'frac_c_h2_diff_mean20', 'frac_l_c5_diff', 'frac_c_h5_diff_lag1', 'frac_c_h4_diff', 'reactivity', 'frac_c_c2_diff_mean20', 'frac_c_c3_diff', 'conv_2_dt', 'frac_c_o4_diff', 'frac_l_l2_diff_mean20', 'cmma_dt', 'frac_l_o5_diff_lag1', 'frac_o_h4_diff', 'fra

<src.features.feature_selection.rfcq_selector.RFCQSelector at 0x3301c1750>

In [15]:
# Relevance score per candidate variable, highest first.
# NOTE(review): this ranks every variable the selector saw rather than only the
# subset it selected — confirm that filtering on relevance alone is intended.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = relevance_by_feature.sort_values(ascending=False)

# How many variables carry strictly positive relevance.
n_positive = (side_res_long > 0).sum()
print(n_positive)
side_res_long

2179


cmma                          0.014531
voss_0                        0.012341
frac_c_h5_diff                0.012316
trendflex_dt                  0.012310
vwap_dt                       0.011890
                                ...   
frac_h_c5_diff_mean20_lag3    0.000000
frac_h_c5_diff_mean20_lag2    0.000000
frac_h_c5_diff_mean20_lag1    0.000000
frac_h_c4_diff_mean20_lag5    0.000000
vmd_w256_2_ddt_lag5           0.000000
Length: 23517, dtype: float64

In [16]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcDeapV1Voting/models/feature_info.json")

# Start from what is already on disk so keys written by other cells survive.
# When the file is absent, fall back to an empty mapping instead of relying on
# whatever `feature_info` happens to be left in the kernel (or a NameError).
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {}

# Keep only variables with strictly positive relevance, in side_res_long's order.
feature_long = side_res_long[side_res_long > 0].index.tolist()
feature_info["L4"] = feature_long

with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)

# model L5

In [17]:
import pandas as pd

# Read the feature table stored for the L5 model and report its (rows, columns) shape.
features_path = "data/feat_hard_L5.parquet"
df_features = pd.read_parquet(features_path)
print(df_features.shape)

# Missing-value count per column, largest first, as a quick sanity check.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(5439, 23507)


bar_duration                    0
acp_pwr_17_curv20_lag3          0
acp_pwr_17_curv20_lag1          0
acp_pwr_16_curv20_lag5          0
acp_pwr_16_curv20_lag4          0
                               ..
comb_spectrum_30_mean20_lag2    0
comb_spectrum_30_mean20_lag1    0
comb_spectrum_29_mean20_lag5    0
comb_spectrum_29_mean20_lag4    0
vmd_w256_2_ddt_lag5             0
Length: 23507, dtype: int64

In [18]:
import numpy as np

# Load the label array saved for the L5 model and print its shape.
label_path = "data/label_hard_L5.npy"
label = np.load(label_path)

print(label.shape)

(5439,)


In [19]:
from jesse.helpers import date_to_timestamp

# Chronological split: rows whose index value is below the cutoff go to train,
# rows at or after the cutoff go to test.
split_ts = date_to_timestamp('2025-03-01')
index_values = df_features.index.to_numpy()

train_mask = index_values < split_ts
test_mask = index_values >= split_ts

# The same boolean masks slice both the feature frame and the label array.
train_x, test_x = df_features[train_mask], df_features[test_mask]
train_y, test_y = label[train_mask], label[test_mask]

print(train_x.shape, test_x.shape, sep="\n")

(4632, 23507)
(807, 23507)


In [21]:
# Input columns for the state-space models: names that start with 'frac_' and end with '_diff'.
features = [
    col
    for col in df_features.columns
    if col.startswith('frac_') and col.endswith('_diff')
]
assert features

# Reuse the DeepSSM saved earlier in this notebook instead of refitting it.
deep_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/deep_ssm"
deep_ssm_model = DeepSSM.load(deep_ssm_path)

deep_ssm_inputs = train_x[features]
feat_deep_ssm = deep_ssm_model.transform(deep_ssm_inputs)
feat_deep_ssm.shape

Model loaded from strategies/BinanceBtcDeapV1Voting/models/deep_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/deep_ssm.json


(4632, 5)

In [22]:
# Reuse the LGSSM saved earlier in this notebook instead of refitting it.
lg_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/lg_ssm"
lg_ssm_model = LGSSM.load(lg_ssm_path)

lg_ssm_inputs = train_x[features]
feat_lg_ssm = lg_ssm_model.predict(lg_ssm_inputs)
feat_lg_ssm.shape

Model loaded from strategies/BinanceBtcDeapV1Voting/models/lg_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/lg_ssm.json


(4632, 5)

In [23]:
# Wrap both SSM outputs as DataFrames aligned to the training index.
deep_ssm_cols = [f"deep_ssm_{i}" for i in range(feat_deep_ssm.shape[1])]
# Fix: size the lg_ssm column names from feat_lg_ssm itself. The original reused
# feat_deep_ssm.shape[1], which only works while both models emit the same number
# of columns and otherwise makes the DataFrame constructor fail.
lg_ssm_cols = [f"lg_ssm_{i}" for i in range(feat_lg_ssm.shape[1])]

df_deep_ssm = pd.DataFrame(feat_deep_ssm, columns=deep_ssm_cols, index=train_x.index)
df_lg_ssm = pd.DataFrame(feat_lg_ssm, columns=lg_ssm_cols, index=train_x.index)

# Candidate pool for feature selection: SSM outputs first, then every original column.
df_feat_mix = pd.concat([df_deep_ssm, df_lg_ssm, train_x], axis=1)
df_feat_mix.head(1)

Unnamed: 0_level_0,deep_ssm_0,deep_ssm_1,deep_ssm_2,deep_ssm_3,deep_ssm_4,lg_ssm_0,lg_ssm_1,lg_ssm_2,lg_ssm_3,lg_ssm_4,...,vmd_w256_1_ddt_lag1,vmd_w256_1_ddt_lag2,vmd_w256_1_ddt_lag3,vmd_w256_1_ddt_lag4,vmd_w256_1_ddt_lag5,vmd_w256_2_ddt_lag1,vmd_w256_2_ddt_lag2,vmd_w256_2_ddt_lag3,vmd_w256_2_ddt_lag4,vmd_w256_2_ddt_lag5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1661518620000,-0.002385,0.032907,0.03649,-0.01428,0.043822,-0.731821,0.640844,-0.841441,-1.194446,-1.401191,...,-44.052622,56.388157,31.109218,-111.849236,66.938945,27.155345,-20.571826,3.643325,7.348517,-4.117138


In [24]:
from src.features.feature_selection.rfcq_selector import RFCQSelector

# Upper bound on the number of features the selector is asked to keep.
MAX_SELECTED_FEATURES = 3000

selector = RFCQSelector(max_features=MAX_SELECTED_FEATURES)
selector.fit(df_feat_mix, train_y)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: frac_c_h5_diff (最大重要性: 0.0127)
➤ 计算特征冗余度...
➤ 总计选择3000个特征 (已选择1个，还需选择2999个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                          | 0/2999 [00:00<?, ?特征/s]


✅ 特征选择完成：从23517个特征中选择了3000个，舍弃了20517个
✅ 选择的特征: ['frac_c_h5_diff', 'cwt_w256_8_curv20_lag5', 'williams_r_lag1', 'ehlers_early_onset_trend_ddt', 'frac_o_l1_diff_dt_lag1', 'voss_0', 'vwap_dt', 'cmma', 'frac_c_l3_diff', 'vwap_dt_lag1', 'forecast_oscillator_mean20', 'trendflex_dt', 'frac_c_l5_diff', 'adx_7_dt', 'williams_r', 'frac_l_c5_diff', 'dft_9_mean20_lag3', 'frac_c_o1_diff_mean20', 'frac_c_h5_diff_lag1', 'frac_c_o3_diff', 'roofing_filter_ddt_lag1', 'lg_ssm_0', 'conv_22_hurst20', 'frac_c_h1_diff_mean20', 'frac_l_l5_diff', 'stc_dt', 'conv_2_dt', 'fisher_dt', 'frac_c_c4_diff', 'conv_40_phent20', 'frac_l_h3_diff', 'frac_l_o5_diff_lag1', 'vmd_w128_0', 'frac_c_l4_diff', 'fisher', 'frac_c_h4_diff', 'frac_c_l1_diff_mean20', 'evenbetter_sinewave_short_dt', 'frac_c_c2_diff_mean20', 'frac_c_o5_diff', 'acp_pwr_35_ddt_lag3', 'frac_c_o4_diff', 'frac_l_l3_diff_mean20', 'frac_o_h4_diff', 'ac_23_std20_lag3', 'cmma_dt', 'frac_c_h2_diff_mean20', 'fisher_dt_lag1', 'cwt_w64_14_dt', 'frac_h_l5_diff', 'fra

<src.features.feature_selection.rfcq_selector.RFCQSelector at 0x338813590>

In [25]:
# Relevance score per candidate variable, highest first.
# NOTE(review): this ranks every variable the selector saw rather than only the
# subset it selected — confirm that filtering on relevance alone is intended.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = relevance_by_feature.sort_values(ascending=False)

# How many variables carry strictly positive relevance.
n_positive = (side_res_long > 0).sum()
print(n_positive)
side_res_long

2507


frac_c_h5_diff                   0.012711
voss_0                           0.012688
trendflex_dt                     0.012044
cmma                             0.011909
vwap_dt                          0.010559
                                   ...   
williams_r_mean20_lag3           0.000000
williams_r_mean20_lag2           0.000000
williams_r_mean20_lag1           0.000000
vwap_mean20_lag5                 0.000000
swamicharts_rsi_17_std20_lag2    0.000000
Length: 23517, dtype: float64

In [26]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcDeapV1Voting/models/feature_info.json")

# Start from what is already on disk so keys written by other cells survive.
# When the file is absent, fall back to an empty mapping instead of relying on
# whatever `feature_info` happens to be left in the kernel (or a NameError).
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {}

# Keep only variables with strictly positive relevance, in side_res_long's order.
feature_long = side_res_long[side_res_long > 0].index.tolist()
feature_info["L5"] = feature_long

with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)

# model L6

In [27]:
import pandas as pd

# Read the feature table stored for the L6 model and report its (rows, columns) shape.
features_path = "data/feat_hard_L6.parquet"
df_features = pd.read_parquet(features_path)
print(df_features.shape)

# Missing-value count per column, largest first, as a quick sanity check.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(5438, 23507)


bar_duration                    0
acp_pwr_17_curv20_lag3          0
acp_pwr_17_curv20_lag1          0
acp_pwr_16_curv20_lag5          0
acp_pwr_16_curv20_lag4          0
                               ..
comb_spectrum_30_mean20_lag2    0
comb_spectrum_30_mean20_lag1    0
comb_spectrum_29_mean20_lag5    0
comb_spectrum_29_mean20_lag4    0
vmd_w256_2_ddt_lag5             0
Length: 23507, dtype: int64

In [28]:
import numpy as np

# Load the label array saved for the L6 model and print its shape.
label_path = "data/label_hard_L6.npy"
label = np.load(label_path)

print(label.shape)

(5438,)


In [29]:
from jesse.helpers import date_to_timestamp

# Chronological split: rows whose index value is below the cutoff go to train,
# rows at or after the cutoff go to test.
split_ts = date_to_timestamp('2025-03-01')
index_values = df_features.index.to_numpy()

train_mask = index_values < split_ts
test_mask = index_values >= split_ts

# The same boolean masks slice both the feature frame and the label array.
train_x, test_x = df_features[train_mask], df_features[test_mask]
train_y, test_y = label[train_mask], label[test_mask]

print(train_x.shape, test_x.shape, sep="\n")

(4632, 23507)
(806, 23507)


In [31]:
# Input columns for the state-space models: names that start with 'frac_' and end with '_diff'.
features = [
    col
    for col in df_features.columns
    if col.startswith('frac_') and col.endswith('_diff')
]
assert features

# Reuse the DeepSSM saved earlier in this notebook instead of refitting it.
deep_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/deep_ssm"
deep_ssm_model = DeepSSM.load(deep_ssm_path)

deep_ssm_inputs = train_x[features]
feat_deep_ssm = deep_ssm_model.transform(deep_ssm_inputs)
feat_deep_ssm.shape

Model loaded from strategies/BinanceBtcDeapV1Voting/models/deep_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/deep_ssm.json


(4632, 5)

In [32]:
# Reuse the LGSSM saved earlier in this notebook instead of refitting it.
lg_ssm_path = "strategies/BinanceBtcDeapV1Voting/models/lg_ssm"
lg_ssm_model = LGSSM.load(lg_ssm_path)

lg_ssm_inputs = train_x[features]
feat_lg_ssm = lg_ssm_model.predict(lg_ssm_inputs)
feat_lg_ssm.shape

Model loaded from strategies/BinanceBtcDeapV1Voting/models/lg_ssm.safetensors and strategies/BinanceBtcDeapV1Voting/models/lg_ssm.json


(4632, 5)

In [33]:
# Wrap both SSM outputs as DataFrames aligned to the training index.
deep_ssm_cols = [f"deep_ssm_{i}" for i in range(feat_deep_ssm.shape[1])]
# Fix: size the lg_ssm column names from feat_lg_ssm itself. The original reused
# feat_deep_ssm.shape[1], which only works while both models emit the same number
# of columns and otherwise makes the DataFrame constructor fail.
lg_ssm_cols = [f"lg_ssm_{i}" for i in range(feat_lg_ssm.shape[1])]

df_deep_ssm = pd.DataFrame(feat_deep_ssm, columns=deep_ssm_cols, index=train_x.index)
df_lg_ssm = pd.DataFrame(feat_lg_ssm, columns=lg_ssm_cols, index=train_x.index)

# Candidate pool for feature selection: SSM outputs first, then every original column.
df_feat_mix = pd.concat([df_deep_ssm, df_lg_ssm, train_x], axis=1)
df_feat_mix.head(1)

Unnamed: 0_level_0,deep_ssm_0,deep_ssm_1,deep_ssm_2,deep_ssm_3,deep_ssm_4,lg_ssm_0,lg_ssm_1,lg_ssm_2,lg_ssm_3,lg_ssm_4,...,vmd_w256_1_ddt_lag1,vmd_w256_1_ddt_lag2,vmd_w256_1_ddt_lag3,vmd_w256_1_ddt_lag4,vmd_w256_1_ddt_lag5,vmd_w256_2_ddt_lag1,vmd_w256_2_ddt_lag2,vmd_w256_2_ddt_lag3,vmd_w256_2_ddt_lag4,vmd_w256_2_ddt_lag5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1661518620000,-0.002385,0.032907,0.03649,-0.01428,0.043822,-0.731821,0.640844,-0.841441,-1.194446,-1.401191,...,-44.052622,56.388157,31.109218,-111.849236,66.938945,27.155345,-20.571826,3.643325,7.348517,-4.117138


In [34]:
from src.features.feature_selection.rfcq_selector import RFCQSelector

# Upper bound on the number of features the selector is asked to keep.
MAX_SELECTED_FEATURES = 3000

selector = RFCQSelector(max_features=MAX_SELECTED_FEATURES)
selector.fit(df_feat_mix, train_y)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: cmma (最大重要性: 0.0171)
➤ 计算特征冗余度...
➤ 总计选择3000个特征 (已选择1个，还需选择2999个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                          | 0/2999 [00:00<?, ?特征/s]


✅ 特征选择完成：从23517个特征中选择了3000个，舍弃了20517个
✅ 选择的特征: ['cmma', 'comb_spectrum_17_lag1', 'fisher_ddt', 'ehlers_early_onset_trend_ddt', 'voss_0', 'amihud_lambda_lag3', 'frac_c_l3_diff', 'vwap_dt', 'frac_c_h5_diff', 'frac_o_h4_diff', 'conv_0_lag1', 'forecast_oscillator_mean20', 'frac_h_l3_diff', 'fisher_curv20', 'cmma_dt', 'evenbetter_sinewave_short_dt', 'frac_c_o1_diff_mean20', 'frac_c_h3_diff', 'roofing_filter_ddt_lag1', 'frac_l_l5_diff', 'acp_pwr_28_ddt', 'williams_r', 'deep_ssm_2', 'frac_c_c4_diff', 'ac_14_std20_lag2', 'frac_c_l5_diff', 'frac_c_h5_diff_lag1', 'frac_l_c4_diff', 'frac_l_o5_diff_lag1', 'swamicharts_rsi_16_std20_lag4', 'cwt_w64_14_dt', 'fisher_dt', 'frac_c_c1_diff_mean20', 'cwt_w32_2_curv20_lag4', 'swamicharts_stochastic_25_std20_lag4', 'frac_l_l1_diff_mean20', 'cwt_w32_11_dt', 'cwt_w256_16_dt', 'fisher_dt_lag1', 'frac_c_o3_diff', 'adx_14_std20_lag2', 'frac_l_h3_diff', 'ac_1_mean20_lag4', 'frac_c_h1_diff_mean20', 'trendflex_dt', 'frac_o_o4_diff', 'cwt_w32_17_dt', 'vwap_dt_lag1'

<src.features.feature_selection.rfcq_selector.RFCQSelector at 0x3332b8a50>

In [35]:
# Relevance score per candidate variable, highest first.
# NOTE(review): this ranks every variable the selector saw rather than only the
# subset it selected — confirm that filtering on relevance alone is intended.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = relevance_by_feature.sort_values(ascending=False)

# How many variables carry strictly positive relevance.
n_positive = (side_res_long > 0).sum()
print(n_positive)
side_res_long

458


cmma                   0.017118
voss_0                 0.016881
frac_c_l3_diff         0.015146
frac_c_h5_diff         0.013944
vwap_dt                0.013017
                         ...   
conv_34_mean20_lag3    0.000000
conv_34_mean20_lag2    0.000000
conv_34_mean20_lag1    0.000000
conv_33_mean20_lag5    0.000000
vmd_w256_2_ddt_lag5    0.000000
Length: 23517, dtype: float64

In [36]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcDeapV1Voting/models/feature_info.json")

# Start from what is already on disk so keys written by other cells survive.
# When the file is absent, fall back to an empty mapping instead of relying on
# whatever `feature_info` happens to be left in the kernel (or a NameError).
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {}

# Keep only variables with strictly positive relevance, in side_res_long's order.
feature_long = side_res_long[side_res_long > 0].index.tolist()
feature_info["L6"] = feature_long

with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import numpy as np
import pandas as pd

# Load the feature table and the label array used for meta-model feature selection.
features_path = "data/features.parquet"
meta_label_path = "data/label_meta.npy"

df_features = pd.read_parquet(features_path)
meta_label = np.load(meta_label_path)
print(df_features.shape, meta_label.shape, sep="\n")

# Missing-value count per column, largest first, as a quick sanity check.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(13219, 12127)
(13219,)


adx_7                     0
cwt_win1024_12_dt_lag5    0
cwt_win1024_3_dt_lag5     0
cwt_win1024_4_dt_lag5     0
cwt_win1024_5_dt_lag5     0
                         ..
cwt_win128_20_lag12       0
cwt_win128_0_lag13        0
cwt_win128_1_lag13        0
cwt_win128_2_lag13        0
williams_r_ddt_lag19      0
Length: 12127, dtype: int64

In [2]:
from strategies.BinanceBtcEntropyBarV1.config import (
    SIDE,
    get_side_model,
)

# Obtain the side model from the strategy config and predict on the SIDE columns.
side_model = get_side_model(False)
side_inputs = df_features[SIDE]
side_res = side_model.predict(side_inputs)

# One prediction per feature row is required before attaching it as a column.
assert len(side_res) == df_features.shape[0]

# The prediction becomes an extra candidate feature named "model".
df_features["model"] = side_res

In [3]:
# meta feature selection
from src.features.feature_selection import CatFCQSelector

# Fit the selector on every candidate column (including "model") against the meta label.
selector = CatFCQSelector()
selector.fit(df_features, meta_label)

# Relevance score per candidate variable, highest first.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
meta_res = relevance_by_feature.sort_values(ascending=False)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: model (最大重要性: 6.7817)
➤ 计算特征冗余度...
➤ 总计选择2425个特征 (已选择1个，还需选择2424个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                          | 0/2424 [00:00<?, ?特征/s]


✅ 特征选择完成：从12128个特征中选择了2425个，舍弃了9703个
✅ 选择的特征: ['model', 'voss_filt_ddt_lag7', 'approximate_entropy_win128_spot', 'dft_spectrum_13', 'cwt_win1024_0_ddt_lag18', 'sample_entropy_win256_array', 'stc_ddt_lag19', 'ac_29', 'vmd_win1024_0_lag8', 'phase_accumulation_ddt_lag19', 'sample_entropy_win256_spot', 'cmma', 'fti_ddt_lag6', 'cwt_win512_0_ddt_lag13', 'comb_spectrum_pwr_4', 'bekker_parkinson_vol_ddt_lag14', 'stc_ddt_lag18', 'sample_entropy_win64_array', 'hurst_coef_fast_ddt_lag19', 'approximate_entropy_win256_spot', 'vmd_win32_0_lag1', 'dft_spectrum_9', 'hurst_coef_slow_ddt_lag11', 'comb_spectrum_dom_cycle_ddt_lag5', 'dft_spectrum_3', 'price_change_oscillator_dt_lag1', 'williams_r', 'vmd_win128_1_ddt_lag8', 'fti_dt_lag18', 'entropy_for_jesse_lag10', 'phase_accumulation_dt_lag19', 'dual_diff_lag5', 'iqr_ratio_lag8', 'ac_37', 'stc_dt', 'phase_accumulation_lag18', 'cwt_win32_7_ddt_lag14', 'acp_pwr_0', 'homodyne_lag17', 'stc_ddt_lag17', 'norm_on_balance_volume_lag2', 'cwt_win32_4_ddt_lag8', '

In [6]:
# Number of variables with strictly positive relevance.
meta_res.gt(0).sum()

2664

In [7]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcEntropyBarV1/feature_info.json")

# Start from what is already on disk so unrelated keys survive. When the file is
# absent, fall back to an empty mapping instead of hitting a NameError on a fresh kernel.
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {}

# Create the "meta" section on first use rather than assuming it already exists.
meta_section = feature_info.setdefault("meta", {})

# Variables with strictly positive relevance, in meta_res's order. The "model"
# column is stored under its own key; everything else goes under "meta".
meta_features = meta_res[meta_res > 0].index.tolist()
meta_section["meta"] = [k for k in meta_features if k != "model"]
meta_section["model"] = [k for k in meta_features if k == "model"]

# Write back to the same path that was read (previously a duplicated string literal).
with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)