# side model feature selection

In [None]:
import pandas as pd

# Read the feature table stored in features_25m.parquet and report its dimensions.
df_features = pd.read_parquet("data/features_25m.parquet")
print(df_features.shape)

# Missing-value count per column, worst columns first (this is the cell output).
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

In [None]:
import numpy as np
from jesse import utils

# Columns 0-5 of the saved array are converted by jesse's candle helper and
# indexed by "date"; column 6 is attached as the raw side label.
label = np.load("data/label_side_25m.npy")
df_label = (
    utils.numpy_candles_to_dataframe(label[:, :6])
    .set_index("date")
    .assign(side_label=label[:, 6])
)
df_label.head(1)

In [None]:
# The first 808 rows are dropped from both the features and the labels.
# NOTE(review): presumably the indicator warm-up window -- 808 is also the
# largest per-column NaN count reported for this feature file; confirm.
side_features = df_features.iloc[808:]

# Binary target: 1 where the raw label equals 1, otherwise 0.
# It is derived directly from the column instead of assigning a new column
# into the `df_label.iloc[808:]` slice, which is the chained-assignment
# pattern pandas flags with SettingWithCopyWarning.
side_label = (
    (df_label["side_label"].iloc[808:].astype(int) == 1).astype(int).to_numpy()
)

print(side_features.shape)
print(side_label.shape)

# Features and target must stay row-aligned before they reach the selector.
assert side_features.shape[0] == side_label.shape[0]

# Remaining missing values per column after trimming, worst columns first.
side_features.isna().sum(axis=0).sort_values(ascending=False)

In [None]:
from custom_indicators.mrmr_accelerated import FCQSelector

# Fit the project's FCQ selector on the trimmed features and the binary side
# target. `fit` is the last expression of the cell, so whatever it returns is
# what the notebook displays.
selector = FCQSelector()
selector.fit(side_features, side_label)

In [None]:
# Pair each variable seen by the selector with its relevance score,
# then rank them from most to least relevant.
side_relevance = pd.Series(selector.relevance_, index=selector.variables_)
side_res = side_relevance.sort_values(ascending=False)
side_res

In [10]:
import json

# Persist the ranked relevance scores; the file name carries the selector's
# class name so results from different selectors do not overwrite each other.
side_scores_path = f"data/side_features_{selector.__class__.__name__}.json"
side_scores = side_res.to_dict()

with open(side_scores_path, "w") as f:
    json.dump(side_scores, f, indent=4)

In [11]:
import json

import pandas as pd

# Reload the saved side-model relevance scores so the cells below can run
# without repeating the selector fit. The path is a plain string: the previous
# f-string had no placeholders, and this now matches the meta-model loader.
with open("data/side_features_FCQSelector.json", "r") as f:
    side_res = pd.Series(json.load(f))

In [12]:
SHORT_TERM = "10m"
MEDIUM_TERM = "25m"
LONG_TERM = "2h"

# Only the first fifth of the ranking is kept (side_res is stored best-first).
top_side_names = list(side_res[: len(side_res) // 5].to_dict())

feature_info = {
    # Bucket the kept names by their timeframe prefix; each bucket is
    # de-duplicated and sorted alphabetically. Names without one of the three
    # prefixes are ignored for the side model.
    "side": {
        timeframe: sorted(
            {name for name in top_side_names if name.startswith(f"{timeframe}_")}
        )
        for timeframe in (SHORT_TERM, MEDIUM_TERM, LONG_TERM)
    },
    # Meta-model features are not selected in this cell; their lists are
    # written out empty.
    "meta": {SHORT_TERM: [], MEDIUM_TERM: [], LONG_TERM: [], "model_res": []},
}

with open("custom_indicators/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [8]:
import lightgbm as lgb
import numpy as np
import pandas as pd

from custom_indicators.config import SIDE_ALL

df_features = pd.read_parquet("data/features_25m.parquet")
meta_label = np.load("data/label_meta_25m.npy")

# Run the saved side model on its SIDE_ALL input columns and expose the
# prediction as an extra candidate feature for the meta model.
side_model = lgb.Booster(model_file="custom_indicators/models/model_side.txt")
side_inputs = df_features[SIDE_ALL]
side_model_pred_prob = side_model.predict(side_inputs)
df_features["side_model_res"] = side_model_pred_prob

# Sanity checks: one label row per feature row, and the new column is present.
assert len(df_features) == len(meta_label)
assert "side_model_res" in df_features.columns

# Missing-value count per column, worst columns first (this is the cell output).
nan_counts = df_features.isna().sum(axis=0)
nan_counts.sort_values(ascending=False)

2h_voss_ddt_lag9                   808
2h_adaptive_stochastic_ddt_lag5    808
2h_adaptive_stochastic_ddt_lag3    808
2h_adaptive_stochastic_ddt_lag2    808
2h_adaptive_stochastic_ddt_lag1    808
                                  ... 
25m_comb_spectrum_pwr_14             0
25m_comb_spectrum_pwr_13             0
25m_comb_spectrum_pwr_12             0
25m_comb_spectrum_pwr_11             0
side_model_res                       0
Length: 11842, dtype: int64

In [9]:
# Drop the first 808 rows from the features and from the label array alike.
meta_features = df_features[808:]

# Column 6 of the label array, cast to int and aligned to the feature index.
# Note that `meta_label` is rebound from the raw array to a Series here, so the
# loading cell has to be re-run before this cell can be executed again.
meta_label = pd.Series(meta_label[808:, 6].astype(int), index=meta_features.index)

print(meta_features.shape)
print(meta_label.shape)

# Class balance of the meta target.
meta_label.value_counts()

(105366, 11842)
(105366,)


1    84431
0    20935
Name: count, dtype: int64

In [10]:
# Meta feature selection. The recorded run below took roughly 2h16m, so the
# scores are saved to JSON afterwards and reloaded rather than recomputed.
from custom_indicators.mrmr_accelerated import FCQSelector

selector = FCQSelector()
selector.fit(meta_features, meta_label)

# Relevance score per variable, ranked from most to least relevant.
meta_relevance = pd.Series(selector.relevance_, index=selector.variables_)
meta_res = meta_relevance.sort_values(ascending=False)

  from .autonotebook import tqdm as notebook_tqdm


➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性...
✓ 选择第1个特征: 25m_natr_dt (最大F值: 370.9560)
➤ 计算特征冗余度...
➤ 总计选择2368个特征 (已选择1个，还需选择2367个)...
➤ 开始MRMR迭代选择过程...


选择特征: 100%|█████████████████████████████████████████████| 2367/2367 [2:16:00<00:00,  3.45s/特征]



✅ 特征选择完成：从11842个特征中选择了2368个，舍弃了9474个
✅ 选择的特征: ['25m_natr_dt', '2h_bandpass_dt_lag17', '25m_dft_dom_cycle_lag27', '2h_acc_swing_index_lag31', '25m_evenbetter_sinewave_long_ddt_lag15', '25m_ac_0', '2h_williams_r_dt_lag12', '25m_vwap_ddt_lag5', '25m_ehlers_early_onset_trend_ddt', '25m_natr', '10m_dft_dom_cycle_dt_lag14', '25m_adaptive_bp_lead_ddt_lag36', '10m_williams_r_ddt_lag27', '10m_homodyne_dt_lag31', '25m_natr_ddt', '25m_adaptive_rsi_ddt_lag25', '25m_hurst_coef_slow_ddt_lag17', '25m_dft_dom_cycle_ddt_lag7', '10m_ehlers_early_onset_trend_ddt_lag34', '10m_natr', '25m_dual_diff_ddt_lag22', '25m_ac_2', '10m_dual_diff_ddt_lag8', '10m_evenbetter_sinewave_long_dt_lag15', '25m_comb_spectrum_dom_cycle_ddt_lag32', '25m_natr_lag39', '10m_natr_dt', '2h_comb_spectrum_dom_cycle_dt_lag22', '25m_stc_dt', '25m_vwap_ddt_lag19', '10m_vwap_ddt_lag11', '25m_natr_lag1', '2h_comb_spectrum_dom_cycle_ddt_lag19', '25m_homodyne_ddt', '25m_natr_lag2', '25m_comb_spectrum_dom_cycle_ddt_lag17', '10m_pfe', '10m_d

In [11]:
import json

# Persist the ranked meta relevance scores; the file name carries the
# selector's class name.
meta_scores_path = f"data/meta_features_{selector.__class__.__name__}.json"
meta_scores = meta_res.to_dict()

with open(meta_scores_path, "w") as f:
    json.dump(meta_scores, f, indent=4)

In [17]:
import json

import pandas as pd

# Reload the saved meta relevance scores so the cells below do not need the
# selector fit to be repeated.
with open("data/meta_features_FCQSelector.json", "r") as f:
    meta_scores = json.load(f)

meta_res = pd.Series(meta_scores)

In [19]:
# Relevance score recorded for the side model's own prediction column.
meta_res.to_dict()["side_model_res"]

10.965977286582234

In [22]:
import json
from pathlib import Path

SHORT_TERM = "10m"
MEDIUM_TERM = "25m"
LONG_TERM = "2h"
TIMEFRAMES = (SHORT_TERM, MEDIUM_TERM, LONG_TERM)

# Keep the path and the loaded content under separate names. Previously one
# name held the Path and was only replaced by the dict when the file existed,
# so a missing file crashed on the first subscript with a TypeError.
feature_info_path = Path("custom_indicators/feature_info.json")
if feature_info_path.exists():
    with open(feature_info_path, "r") as f:
        feature_info = json.load(f)
else:
    feature_info = {}

# Existing entries are updated in place, so side-model selections already in
# the file, and any other keys, survive this rewrite.
side_info = feature_info.setdefault("side", {})
meta_info = feature_info.setdefault("meta", {})

# Only the first fifth of the ranking is kept (meta_res is stored best-first).
top_meta_names = list(meta_res[: len(meta_res) // 5].to_dict())
timeframe_prefixes = tuple(f"{timeframe}_" for timeframe in TIMEFRAMES)

for timeframe in TIMEFRAMES:
    # Side lists are only normalised (de-duplicated and sorted), never re-selected.
    side_info[timeframe] = sorted(set(side_info.get(timeframe, [])))
    # Meta lists are rebuilt from scratch from the current ranking.
    meta_info[timeframe] = sorted(
        {name for name in top_meta_names if name.startswith(f"{timeframe}_")}
    )

# Anything without a timeframe prefix (e.g. "side_model_res") is filed under
# "model_res".
meta_info["model_res"] = sorted(
    {name for name in top_meta_names if not name.startswith(timeframe_prefixes)}
)

with open(feature_info_path, "w") as f:
    json.dump(feature_info, f, indent=4)