# side model feature selection

In [1]:
import pandas as pd

# Load the precomputed feature matrix and report its dimensions.
df_features = pd.read_parquet("data/features.parquet")
print(df_features.shape)

# Per-column count of missing values, columns with the most NaNs listed first.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(34487, 4887)


adx_7                       0
hurst_coef_fast_dt_lag4     0
hurst_coef_fast_dt_lag11    0
hurst_coef_fast_dt_lag10    0
hurst_coef_fast_dt_lag9     0
                           ..
cwt_6_dt_lag5               0
cwt_6_lag5                  0
cwt_6_ddt_lag4              0
cwt_6_dt_lag4               0
williams_r_ddt_lag19        0
Length: 4887, dtype: int64

In [2]:
import numpy as np

# Load the side-model labels and show their raw length.
label = np.load("data/side_label.npy")
print(label.shape)

# The label array can be longer than the feature matrix; the surplus rows are
# dropped from the front so that both end on the same row (presumably the
# features lose leading rows to indicator warm-up — TODO confirm).
len_gap = len(label) - len(df_features)
if len_gap < 0:
    # A negative gap would make label[len_gap:] silently keep only the last
    # |len_gap| labels instead of failing, so reject it explicitly.
    raise ValueError(
        f"label has {len(label)} rows but df_features has {len(df_features)}; "
        "cannot align by trimming labels"
    )
label = label[len_gap:]

# Invariant required by the selector fit: one label per feature row.
assert len(label) == len(df_features)
print(label.shape)

(34551,)
(34487,)


In [3]:
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

# Fit the selector on the full feature matrix against the side labels.
# Per the log printed by fit(), relevance is computed with a random forest
# and features are then picked by an iterative MRMR procedure.
# NOTE(review): no random seed is passed here; if the selector's forest is
# stochastic the chosen features may differ between runs — confirm.
selector = RFCQSelector()
# Left as the last expression so the fitted selector's repr is displayed.
selector.fit(df_features, label)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: williams_r (最大重要性: 0.0409)
➤ 计算特征冗余度...
➤ 总计选择977个特征 (已选择1个，还需选择976个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/976 [00:00<?, ?特征/s]


✅ 特征选择完成：从4887个特征中选择了977个，舍弃了3910个
✅ 选择的特征: ['williams_r', 'cwt_13_ddt_lag3', 'ehlers_early_onset_trend_ddt_lag1', 'vwap_dt', 'vmd_1_ddt_lag4', 'vwap_dt_lag1', 'vmd_0_dt', 'evenbetter_sinewave_short_dt', 'cmma', 'conv_0', 'evenbetter_sinewave_long_dt', 'fisher_dt', 'evenbetter_sinewave_long', 'vmd_0_dt_lag1', 'ehlers_early_onset_trend_ddt', 'vmd_1_dt_lag2', 'reactivity', 'stc_dt_lag1', 'vmd_2_lag4', 'vmd_0_dt_lag3', 'williams_r_dt', 'fisher_dt_lag1', 'vwap_dt_lag2', 'stc_dt', 'acr', 'cmma_lag1', 'vmd_1_dt_lag1', 'vmd_0_dt_lag2', 'vmd_1_ddt_lag3', 'adx_7_dt_lag1', 'fisher', 'vmd_2_dt', 'vmd_1_dt_lag3', 'norm_on_balance_volume', 'stc', 'vmd_1_ddt_lag5', 'acc_swing_index_dt', 'voss', 'vmd_1_lag9', 'cmma_lag2', 'trendflex_dt', 'vmd_0_dt_lag4', 'roofing_filter_ddt', 'williams_r_lag1', 'vmd_1_lag6', 'vmd_0_lag13', 'vmd_2_ddt_lag2', 'evenbetter_sinewave_short_ddt', 'vmd_1_dt', 'vmd_0_dt_lag6', 'vmd_2_lag3', 'trendflex', 'vmd_1_ddt_lag7', 'bekker_parkinson_vol_ddt_lag1', 'vmd_0_lag16', 'willi

<custom_indicators.toolbox.feature_selection.rfcq_selector.RFCQSelector at 0x3816dc0d0>

In [4]:
# Relevance score of every candidate feature, keyed by feature name.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = relevance_by_feature.sort_values(ascending=False)

# Number of features whose relevance is strictly positive.
print(side_res_long.gt(0).sum())
side_res_long

454


williams_r              0.040890
vwap_dt                 0.038528
fisher_dt               0.026936
cmma                    0.025467
vmd_0_dt                0.024445
                          ...   
cwt_7_lag8              0.000000
cwt_7_ddt_lag7          0.000000
cwt_7_dt_lag7           0.000000
cwt_7_lag7              0.000000
williams_r_ddt_lag19    0.000000
Length: 4887, dtype: float64

In [5]:
import json

# Keep only the features that received a strictly positive relevance score.
feature_long = side_res_long[side_res_long > 0].index.tolist()

# The file is rewritten from scratch: the side list is filled in and both
# meta lists are reset to empty.
feature_info = {
    "side": {"side": feature_long},
    "meta": {"meta": [], "model": []},
}

with open("strategies/BinanceBtcEntropyBarV1/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import numpy as np
import pandas as pd

# Load the feature matrix together with the meta-model labels.
df_features = pd.read_parquet("data/features.parquet")
meta_label = np.load("data/label_meta.npy")

# Print both shapes so a row-count mismatch is visible at a glance.
for loaded in (df_features, meta_label):
    print(loaded.shape)

# Per-column count of missing values, columns with the most NaNs listed first.
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

(34487, 4887)
(34487,)


adx_7                       0
hurst_coef_fast_dt_lag4     0
hurst_coef_fast_dt_lag11    0
hurst_coef_fast_dt_lag10    0
hurst_coef_fast_dt_lag9     0
                           ..
cwt_6_dt_lag5               0
cwt_6_lag5                  0
cwt_6_ddt_lag4              0
cwt_6_dt_lag4               0
williams_r_ddt_lag19        0
Length: 4887, dtype: int64

In [2]:
from strategies.BinanceBtcEntropyBarV1.config import (
    SIDE,
    get_side_model,
)

# NOTE(review): the meaning of the positional False flag is defined by
# get_side_model in the strategy config — confirm before changing it.
side_model = get_side_model(False)

# Score every row using only the columns listed in SIDE.
side_inputs = df_features[SIDE]
side_res = side_model.predict(side_inputs)

# There must be exactly one prediction per feature row.
assert len(side_res) == df_features.shape[0]

# Attach the side model's output as the "model" column so the meta feature
# selection can rank it alongside the other features.
df_features["model"] = side_res

In [3]:
# meta feature selection
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

# Fit the selector on the features (including the "model" column) against
# the meta labels.
selector = RFCQSelector()
selector.fit(df_features, meta_label)

# Relevance score per feature, highest first.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
meta_res = relevance_by_feature.sort_values(ascending=False)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: model (最大重要性: 0.0390)
➤ 计算特征冗余度...
➤ 总计选择977个特征 (已选择1个，还需选择976个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/976 [00:00<?, ?特征/s]


✅ 特征选择完成：从4888个特征中选择了977个，舍弃了3911个
✅ 选择的特征: ['model', 'cwt_6_ddt_lag3', 'cwt_16_ddt_lag15', 'cwt_11_ddt_lag2', 'amihud_lambda_dt', 'bekker_parkinson_vol_ddt_lag1', 'cwt_19_ddt_lag6', 'cwt_7_ddt_lag6', 'cwt_4_ddt', 'cwt_1_ddt', 'cwt_10_ddt_lag2', 'adx_7', 'cwt_15_ddt_lag6', 'adx_7_dt', 'cwt_11_ddt_lag1', 'cwt_0_ddt_lag3', 'cwt_2_ddt_lag2', 'hurst_coef_slow', 'acc_swing_index_ddt_lag2', 'cwt_17_ddt_lag1', 'williams_r', 'cwt_8_ddt', 'corwin_schultz_estimator_lag1', 'cwt_14_ddt', 'cwt_17_ddt_lag6', 'conv_0', 'cwt_6_ddt_lag2', 'cwt_13_ddt', 'price_variance_ratio_dt', 'cwt_10_dt_lag2', 'roll_impact_dt', 'pfe_dt_lag13', 'cmma', 'cwt_5_ddt_lag4', 'cwt_7_ddt_lag5', 'fti_dt', 'bekker_parkinson_vol_dt_lag2', 'cwt_18_ddt_lag2', 'cwt_1_ddt_lag3', 'vmd_0_ddt', 'adx_7_dt_lag1', 'cwt_19_dt_lag13', 'cwt_10_ddt_lag1', 'cwt_5_ddt_lag2', 'evenbetter_sinewave_long_ddt_lag15', 'bekker_parkinson_vol_ddt', 'bekker_parkinson_vol_lag2', 'cwt_10_ddt_lag9', 'ac_46', 'cwt_2_ddt_lag4', 'vmd_2_lag2', 'cwt_0_ddt_lag

In [4]:
# Number of features whose relevance is strictly positive.
meta_res.gt(0).sum()

916

In [5]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcEntropyBarV1/feature_info.json")

# Start from whatever is already saved so the side-model feature list is kept.
# Fall back to an empty skeleton when the file does not exist yet; otherwise
# feature_info would be undefined on a fresh kernel and the lines below would
# raise NameError.
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {"side": {"side": []}}

# Tolerate a file that has no "meta" section yet, and reset both lists so
# re-running this cell never accumulates duplicates.
meta_section = feature_info.setdefault("meta", {})
meta_section["meta"] = []
meta_section["model"] = []

# Keep only features with strictly positive relevance. The "model" column
# (the side model's prediction) is stored in its own list, separate from the
# ordinary features.
meta_features = meta_res[meta_res > 0].index.tolist()
for k in meta_features:
    target_key = "model" if k == "model" else "meta"
    meta_section[target_key].append(k)

# Write back to the same path that was read above.
with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)