# side model feature selection

In [1]:
import pandas as pd

# Load the precomputed feature table and report its (rows, columns) shape.
df_features = pd.read_parquet("data/features.parquet")
print(df_features.shape)

# Per-column count of missing values, largest first. DataFrame.sum() reduces
# over rows (axis=0) by default, so each feature column gets one count.
df_features.isna().sum().sort_values(ascending=False)

(70362, 4887)


adx_7                       0
hurst_coef_fast_dt_lag4     0
hurst_coef_fast_dt_lag11    0
hurst_coef_fast_dt_lag10    0
hurst_coef_fast_dt_lag9     0
                           ..
cwt_6_dt_lag5               0
cwt_6_lag5                  0
cwt_6_ddt_lag4              0
cwt_6_dt_lag4               0
williams_r_ddt_lag19        0
Length: 4887, dtype: int64

In [2]:
import numpy as np

# Load the side labels and trim them to the length of the feature table.
label = np.load("data/side_label.npy")

print(label.shape)

# The label array may be longer than the feature table; the surplus is removed
# from the FRONT of the label array.
# NOTE(review): this assumes the feature table is missing its leading rows
# (e.g. indicator warm-up) rather than trailing ones — confirm.
len_gap = len(label) - len(df_features)

# A negative gap would turn `label[len_gap:]` into "keep the last |gap| labels",
# which silently produces a tiny, misaligned target. Fail loudly instead.
assert len_gap >= 0, (
    f"label has fewer rows ({len(label)}) than df_features ({len(df_features)})"
)

label = label[len_gap:]

# After trimming, every feature row must have exactly one label.
assert len(label) == len(df_features), (
    f"label/feature length mismatch: {len(label)} vs {len(df_features)}"
)

print(label.shape)

(70426,)
(70362,)


In [3]:
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

# Fit the project's RFCQ selector on the full feature table against the side
# labels. Per the fit log below, it scores relevance with a random forest and
# then runs an iterative MRMR selection loop.
# NOTE(review): no seed is passed here — confirm whether RFCQSelector fixes its
# own random state; otherwise the selected set may vary between runs.
selector = RFCQSelector()
# Kept as the cell's last expression, so the value returned by fit() is what
# the notebook displays as this cell's output.
selector.fit(df_features, label)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: williams_r (最大重要性: 0.0433)
➤ 计算特征冗余度...
➤ 总计选择977个特征 (已选择1个，还需选择976个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/976 [00:00<?, ?特征/s]


✅ 特征选择完成：从4887个特征中选择了977个，舍弃了3910个
✅ 选择的特征: ['williams_r', 'cwt_20_dt_lag19', 'highpass_bp_dt_lag1', 'vwap_dt', 'ehlers_early_onset_trend_ddt', 'vmd_0_dt', 'evenbetter_sinewave_short_dt', 'vwap_dt_lag1', 'cmma', 'evenbetter_sinewave_long_dt', 'vmd_2_lag3', 'fisher_dt', 'conv_0', 'vmd_0_dt_lag1', 'evenbetter_sinewave_long', 'vmd_1_dt_lag2', 'stc_dt', 'reactivity', 'vmd_0_dt_lag3', 'vmd_2_dt_lag1', 'stc_dt_lag1', 'ehlers_early_onset_trend_ddt_lag1', 'williams_r_dt', 'cwt_11_ddt_lag2', 'fisher_dt_lag1', 'cmma_lag2', 'vmd_2_lag4', 'vmd_0_dt_lag2', 'vmd_1_dt_lag1', 'cmma_lag1', 'vmd_1_dt_lag3', 'acc_swing_index_dt', 'vmd_1_ddt_lag4', 'fisher', 'vmd_2_dt', 'vmd_0_dt_lag4', 'vmd_1_lag9', 'stc', 'voss', 'norm_on_balance_volume', 'bekker_parkinson_vol_dt_lag1', 'trendflex_dt', 'williams_r_lag1', 'roofing_filter_ddt', 'adx_7_dt_lag1', 'vmd_2_ddt_lag3', 'vmd_1_ddt_lag3', 'vmd_1_ddt_lag7', 'vmd_2_dt_lag8', 'vmd_1_ddt_lag5', 'evenbetter_sinewave_short_ddt', 'vwap_dt_lag2', 'trendflex', 'vmd_1_dt',

<custom_indicators.toolbox.feature_selection.rfcq_selector.RFCQSelector at 0x3f5ab8bd0>

In [4]:
# Pair each candidate variable with its relevance score, highest score first.
side_res_long = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = side_res_long.sort_values(ascending=False)

# Number of features with strictly positive relevance.
print(side_res_long.gt(0).sum())
side_res_long

434


williams_r                      0.043272
fisher_dt                       0.027345
cmma                            0.027024
vwap_dt                         0.026660
evenbetter_sinewave_short_dt    0.026367
                                  ...   
cwt_7_ddt_lag5                  0.000000
cwt_7_lag5                      0.000000
cwt_7_ddt_lag4                  0.000000
cwt_7_dt_lag4                   0.000000
williams_r_ddt_lag19            0.000000
Length: 4887, dtype: float64

In [5]:
import json

# Keep only the features whose relevance is strictly positive, preserving the
# descending-relevance order of `side_res_long`.
feature_long = side_res_long.loc[side_res_long > 0].index.tolist()

# Layout of feature_info.json: the side section is filled here, while both
# meta lists are written out empty.
feature_info = {
    "side": {"side": feature_long},
    "meta": {"meta": [], "model": []},
}

# Note: this rewrites the whole file from scratch.
with open("strategies/BinanceBtcEntropyBarV1/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import numpy as np
import pandas as pd

# Inputs for the meta model: the shared feature table and the meta labels.
df_features = pd.read_parquet("data/features.parquet")
meta_label = np.load("data/label_meta.npy")

# One shape per line, features first.
print(df_features.shape, meta_label.shape, sep="\n")

# Per-column count of missing values, largest first (sum() defaults to axis=0).
df_features.isna().sum().sort_values(ascending=False)

(70362, 4887)
(70362,)


adx_7                       0
hurst_coef_fast_dt_lag4     0
hurst_coef_fast_dt_lag11    0
hurst_coef_fast_dt_lag10    0
hurst_coef_fast_dt_lag9     0
                           ..
cwt_6_dt_lag5               0
cwt_6_lag5                  0
cwt_6_ddt_lag4              0
cwt_6_dt_lag4               0
williams_r_ddt_lag19        0
Length: 4887, dtype: int64

In [2]:
from strategies.BinanceBtcEntropyBarV1.config import SIDE, get_side_model

# Run the side model on its own feature subset (the SIDE columns) and expose
# its prediction to the meta selector as an extra "model" column.
# NOTE(review): the meaning of the positional `False` flag is defined in the
# strategy config, not here — confirm before changing it.
side_model = get_side_model(False)
side_res = side_model.predict(df_features[SIDE])

# One prediction per feature row is required before attaching the column.
assert len(df_features) == len(side_res)

df_features["model"] = side_res

In [3]:
# Meta feature selection: fit the RFCQ selector on the feature table (which now
# includes the side model's "model" column) against the meta labels.
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

selector = RFCQSelector()
selector.fit(df_features, meta_label)

# Relevance score per candidate variable, highest first.
meta_res = pd.Series(selector.relevance_, index=selector.variables_)
meta_res = meta_res.sort_values(ascending=False)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: acc_swing_index_lag9 (最大重要性: 0.0588)
➤ 计算特征冗余度...
➤ 总计选择977个特征 (已选择1个，还需选择976个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/976 [00:00<?, ?特征/s]


✅ 特征选择完成：从4888个特征中选择了977个，舍弃了3911个
✅ 选择的特征: ['acc_swing_index_lag9', 'evenbetter_sinewave_long_ddt_lag17', 'cwt_14_ddt_lag13', 'cmma_ddt_lag9', 'kyle_lambda_lag11', 'acc_swing_index_lag16', 'price_change_oscillator_lag15', 'vwap_lag8', 'acc_swing_index_lag13', 'vwap_lag19', 'hasbrouck_lambda_ddt_lag3', 'acc_swing_index_lag11', 'vwap_lag3', 'model', 'voss_lag13', 'acc_swing_index_lag4', 'vwap_lag6', 'vwap_lag11', 'acc_swing_index', 'acc_swing_index_lag7', 'vwap_lag16', 'amihud_lambda_dt_lag9', 'acc_swing_index_lag18', 'cwt_3_ddt_lag12', 'kyle_lambda_lag1', 'vwap_lag18', 'fti_best_period_dt_lag7', 'acc_swing_index_lag12', 'kyle_lambda_lag18', 'cwt_10_ddt_lag13', 'kyle_lambda_lag4', 'acc_swing_index_lag3', 'vwap_lag2', 'hurst_coef_slow_dt_lag10', 'cwt_9_ddt', 'vwap_lag9', 'kyle_lambda_ddt_lag7', 'kyle_lambda_lag17', 'acc_swing_index_lag2', 'corwin_schultz_estimator_ddt_lag15', 'vwap_lag13', 'natr_ddt', 'kyle_lambda_lag19', 'voss_filt_lag7', 'comb_spectrum_dom_cycle_ddt_lag18', 'vwap_lag1

In [4]:
# Number of features with strictly positive relevance.
meta_res.gt(0).sum()

576

In [6]:
import json
from pathlib import Path

feature_info_path = Path("strategies/BinanceBtcEntropyBarV1/feature_info.json")

# Start from the existing file so entries outside the "meta" section are
# preserved. When the file is missing, fall back to an empty skeleton instead
# of leaving `feature_info` unbound (which raised NameError on a fresh kernel).
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {"side": {"side": []}}

# Tolerate a file that has no "meta" section yet.
meta_section = feature_info.setdefault("meta", {})

# Keep only features with strictly positive relevance, in descending order.
meta_features = meta_res[meta_res > 0].index.tolist()

# The side model's own prediction column ("model") is stored separately from
# the ordinary feature columns. Both lists are rebuilt from scratch each run.
meta_section["meta"] = [k for k in meta_features if k != "model"]
meta_section["model"] = [k for k in meta_features if k == "model"]

# Write back to the same path that was read above.
with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)