# side model feature selection

In [1]:
import pandas as pd

# Load the precomputed feature matrix used for side-model feature selection.
df_features = pd.read_parquet("data/features.parquet")
print(df_features.shape)

# Missing-value count per column, largest first, as a quick sanity check
# before handing the frame to the selector.
null_counts = df_features.isna().sum(axis=0)
null_counts.sort_values(ascending=False)

(15677, 3447)


adx_7                               0
mod_rsi_lag17                       0
mod_rsi_lag19                       0
mod_rsi_dt_lag1                     0
mod_rsi_dt_lag2                     0
                                   ..
comb_spectrum_dom_cycle_ddt_lag5    0
comb_spectrum_dom_cycle_ddt_lag6    0
comb_spectrum_dom_cycle_ddt_lag7    0
comb_spectrum_dom_cycle_ddt_lag8    0
williams_r_ddt_lag19                0
Length: 3447, dtype: int64

In [2]:
import numpy as np

# Load the side labels and align them with the feature matrix.
label = np.load("data/side_label.npy")
print(label.shape)

# The label array is longer than the feature frame; the surplus is dropped
# from the FRONT of the labels.
# NOTE(review): presumably the feature pipeline discards its first rows
# (indicator warm-up) — confirm that trimming the head is the right alignment.
len_gap = len(label) - len(df_features)
if len_gap < 0:
    # A negative gap would make `label[len_gap:]` silently keep only the last
    # |len_gap| labels, so fail loudly instead.
    raise ValueError(
        f"features have {len(df_features)} rows but only {len(label)} labels were loaded"
    )

label = label[len_gap:]
assert len(label) == len(df_features), "label/feature row counts differ after trimming"

print(label.shape)


(15741,)
(15677,)


In [12]:
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

# Fit the project's RFCQ selector on the full feature frame against the side label.
# NOTE(review): no seed is passed; the selector's log mentions a random forest, so
# relevance scores may differ between runs — confirm and pin a seed if supported.
selector = RFCQSelector()
# Kept as the cell's last expression, so the fitted selector's repr is displayed.
selector.fit(df_features, label)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: williams_r (最大重要性: 0.0465)
➤ 计算特征冗余度...
➤ 总计选择689个特征 (已选择1个，还需选择688个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/688 [00:00<?, ?特征/s]


✅ 特征选择完成：从3447个特征中选择了689个，舍弃了2758个
✅ 选择的特征: ['williams_r', 'adx_7_ddt', 'adaptive_stochastic_ddt', 'conv_0', 'vwap_dt', 'ehlers_early_onset_trend_ddt', 'cmma', 'ehlers_early_onset_trend_ddt_lag1', 'evenbetter_sinewave_short_dt', 'vwap_dt_lag1', 'fisher_dt', 'cmma_lag1', 'roofing_filter_ddt', 'stc_dt', 'norm_on_balance_volume', 'trendflex_dt', 'stc', 'williams_r_dt', 'acr', 'trendflex_dt_lag1', 'adx_14_dt', 'fisher_dt_lag1', 'acc_swing_index_dt', 'evenbetter_sinewave_long_dt', 'roofing_filter_dt', 'reactivity', 'voss_filt_dt', 'fisher_ddt', 'fisher_ddt_lag1', 'evenbetter_sinewave_short_dt_lag1', 'cmma_dt', 'iqr_ratio_dt', 'roofing_filter_ddt_lag1', 'williams_r_lag1', 'evenbetter_sinewave_short', 'stc_dt_lag1', 'adx_7_dt_lag1', 'voss', 'trendflex_ddt_lag1', 'natr_dt', 'aroon_diff', 'voss_dt_lag1', 'reactivity_dt', 'evenbetter_sinewave_long', 'adaptive_cci_dt', 'homodyne_ddt', 'ehlers_early_onset_trend_dt', 'stc_dt_lag2', 'highpass_bp_ddt_lag2', 'bandpass_dt', 'fisher', 'adaptive_stochas

<custom_indicators.toolbox.feature_selection.rfcq_selector.RFCQSelector at 0x137685ed0>

In [15]:
# Pair every relevance score with its feature name, most relevant first.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
side_res_long = relevance_by_feature.sort_values(ascending=False)

# How many features received a strictly positive relevance score.
n_relevant = (side_res_long > 0).sum()
print(n_relevant)
side_res_long

495


williams_r                      0.046539
cmma                            0.038528
vwap_dt                         0.032372
fisher_dt                       0.032203
evenbetter_sinewave_short_dt    0.030919
                                  ...   
conv_20                         0.000000
conv_21                         0.000000
conv_22                         0.000000
conv_23                         0.000000
williams_r_ddt_lag19            0.000000
Length: 3447, dtype: float64

In [17]:
import json

# Keep only the features whose relevance score is strictly positive.
feature_long = side_res_long[side_res_long > 0].index.tolist()

# Fresh feature_info document: the side section is populated here, the meta
# section starts out empty.
feature_info = {
    "side": {"side": feature_long},
    "meta": {"meta": [], "model": []},
}

# This overwrites any existing file at this path.
with open("strategies/BinanceBtcEntropyBarV1/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import numpy as np
import pandas as pd

# Inputs for meta-model feature selection: feature matrix and meta labels.
df_features = pd.read_parquet("data/features.parquet")
meta_label = np.load("data/label_meta.npy")

# Show both shapes so a row-count mismatch is visible at a glance.
for loaded in (df_features, meta_label):
    print(loaded.shape)

# Missing-value count per column, largest first.
null_counts = df_features.isna().sum(axis=0)
null_counts.sort_values(ascending=False)

(7996, 3447)
(7996,)


adx_7                               0
mod_rsi_lag17                       0
mod_rsi_lag19                       0
mod_rsi_dt_lag1                     0
mod_rsi_dt_lag2                     0
                                   ..
comb_spectrum_dom_cycle_ddt_lag5    0
comb_spectrum_dom_cycle_ddt_lag6    0
comb_spectrum_dom_cycle_ddt_lag7    0
comb_spectrum_dom_cycle_ddt_lag8    0
williams_r_ddt_lag19                0
Length: 3447, dtype: int64

In [2]:
from strategies.BinanceBtcEntropyBarV1.config import (
    SIDE_LONG,
    SIDE_SHORT,
    get_side_model,
)

# Fetch the long and short side models (same arguments and order as before).
model_long, model_short = (get_side_model(False, side) for side in ("long", "short"))

# Each side model predicts from its own feature subset.
res_long = model_long.predict(df_features[SIDE_LONG])
res_short = model_short.predict(df_features[SIDE_SHORT])

# Both prediction vectors must have one entry per feature row.
n_rows = df_features.shape[0]
assert n_rows == len(res_long) == len(res_short)

# Expose the side-model outputs as extra candidate features for the meta model.
for column, predictions in (("model_long", res_long), ("model_short", res_short)):
    df_features[column] = predictions

  import pkg_resources


In [3]:
# meta feature selection
from custom_indicators.toolbox.feature_selection.rfcq_selector import RFCQSelector

# Fit the selector on the feature frame (the model_long / model_short columns
# are only present if the side-model prediction cell ran first).
selector = RFCQSelector()
selector.fit(df_features, meta_label)

# Relevance score per feature, most relevant first.
relevance_by_feature = pd.Series(selector.relevance_, index=selector.variables_)
meta_res = relevance_by_feature.sort_values(ascending=False)

➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性(使用随机森林)...
✓ 选择第1个特征: adx_7_dt (最大重要性: 0.0400)
➤ 计算特征冗余度...
➤ 总计选择689个特征 (已选择1个，还需选择688个)...
➤ 开始MRMR迭代选择过程...


选择特征:   0%|                                                           | 0/688 [00:00<?, ?特征/s]


✅ 特征选择完成：从3449个特征中选择了689个，舍弃了2760个
✅ 选择的特征: ['adx_7_dt', 'acc_swing_index_lag17', 'forecast_oscillator_lag3', 'cmma_dt', 'fti_ddt', 'bekker_parkinson_vol_ddt_lag2', 'reactivity', 'price_variance_ratio_dt', 'amihud_lambda_dt_lag2', 'corwin_schultz_estimator_lag1', 'hasbrouck_lambda_lag7', 'hasbrouck_lambda_ddt', 'kyle_lambda_lag10', 'adx_14_dt_lag1', 'cmma', 'trendflex_ddt', 'natr_lag2', 'price_variance_ratio_dt_lag2', 'price_change_oscillator_dt', 'adx_14_dt', 'acc_swing_index_lag18', 'corwin_schultz_estimator', 'bekker_parkinson_vol_dt_lag2', 'model_short', 'acc_swing_index_lag19', 'iqr_ratio_dt', 'natr_lag16', 'trendflex_dt_lag1', 'price_variance_ratio_dt_lag1', 'evenbetter_sinewave_long', 'natr_lag14', 'hasbrouck_lambda_lag2', 'bekker_parkinson_vol', 'trendflex_dt', 'natr_lag13', 'natr_lag1', 'adx_7_dt_lag1', 'cmma_lag1', 'natr_lag7', 'bekker_parkinson_vol_lag1', 'vwap_lag19', 'model_long', 'vwap_lag7', 'vwap_lag1', 'roll_impact_lag3', 'amihud_lambda_lag16', 'amihud_lambda_lag15', 

In [4]:
# Number of features with strictly positive relevance for the meta label.
meta_res.gt(0).sum()

76

In [5]:
import json
from pathlib import Path

# Path and document are kept in separate names: previously `feature_info` was a
# Path that only became a dict when the file existed, so a missing file crashed
# on the first subscript.
feature_info_path = Path("strategies/BinanceBtcEntropyBarV1/feature_info.json")

if feature_info_path.exists():
    # Preserve whatever is already stored (e.g. the side-model section).
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    # No file yet: start from an empty skeleton.
    feature_info = {"side": {"side": []}, "meta": {}}

# Reset the meta-related lists; setdefault tolerates a file without a "meta" key.
meta_section = feature_info.setdefault("meta", {})
meta_section["meta"] = []
meta_section["model_long"] = []
meta_section["model_short"] = []

# Keep only features with strictly positive relevance, then route the two
# side-model prediction columns into their own lists.
meta_features = meta_res[meta_res > 0].index.tolist()
for k in meta_features:
    if k == "model_long":
        meta_section["model_long"].append(k)
    elif k == "model_short":
        meta_section["model_short"].append(k)
    else:
        meta_section["meta"].append(k)


with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)