# side model feature selection

In [1]:
import pandas as pd

df_features = pd.read_parquet("data/features_90m.parquet")
print(df_features.shape)
df_features.isna().sum(axis=0).sort_values(ascending=False)

(29741, 11841)


5h_dft_dom_cycle_ddt_lag39    300
5h_dft_dom_cycle_ddt_lag38    296
5h_dft_dom_cycle_dt_lag39     296
5h_dft_dom_cycle_lag39        293
5h_dft_dom_cycle_ddt_lag37    293
                             ... 
90m_ac_3                        0
90m_conv_4                      0
90m_conv_3                      0
90m_conv_2                      0
90m_ac_36                       0
Length: 11841, dtype: int64

In [2]:
import numpy as np
from jesse import utils

label = np.load("data/label_side.npy")
df_label = utils.numpy_candles_to_dataframe(label[:, :6]).set_index("date")
df_label["side_label"] = label[:, 6]
df_label.head(1)

Unnamed: 0_level_0,open,close,high,low,volume,side_label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 02:56:00,7234.15,7111.0,7260.43,7101.0,110097.989,1.0


In [3]:
side_features = df_features.iloc[300:]
side_label = df_label.iloc[300:]

side_label["side_label"] = (side_label["side_label"].astype(int) == 1).astype(int)
side_label = side_label["side_label"].to_numpy()

print(side_features.shape)
print(side_label.shape)

side_features.isna().sum(axis=0).sort_values(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  side_label["side_label"] = (side_label["side_label"].astype(int) == 1).astype(int)


(29441, 11841)
(29441,)


5h_ac_0                     0
25m_ac_4                    0
90m_williams_r_ddt_lag35    0
90m_williams_r_ddt_lag36    0
90m_williams_r_ddt_lag37    0
                           ..
90m_ac_2                    0
90m_ac_3                    0
90m_ac_4                    0
90m_ac_5                    0
25m_williams_r_ddt_lag39    0
Length: 11841, dtype: int64

In [5]:
from custom_indicators.toolbox.feature_selection.fcq_selector import FCQSelector

selector = FCQSelector(max_features=side_features.shape[1] // 4)
selector.fit(side_features, side_label)

  from .autonotebook import tqdm as notebook_tqdm


➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性...
✓ 选择第1个特征: 90m_williams_r (最大F值: 14113.9619)
➤ 计算特征冗余度...
➤ 总计选择2960个特征 (已选择1个，还需选择2959个)...
➤ 开始MRMR迭代选择过程...


选择特征: 100%|███████████████████████████████████████████████| 2959/2959 [29:45<00:00,  1.66特征/s]



✅ 特征选择完成：从11841个特征中选择了2960个，舍弃了8881个
✅ 选择的特征: ['90m_williams_r', '5h_evenbetter_sinewave_long_lag6', '25m_dft_spectrum_10', '90m_williams_r_lag1', '90m_acc_swing_index_dt', '90m_fisher_lag12', '90m_fisher', '25m_trendflex', '90m_trendflex', '5h_williams_r', '25m_williams_r', '90m_evenbetter_sinewave_short', '90m_williams_r_lag2', '90m_trendflex_dt', '90m_evenbetter_sinewave_long', '5h_ac_23', '5h_acc_swing_index_dt', '90m_fisher_lag1', '25m_williams_r_lag1', '90m_trendflex_lag1', '90m_natr_dt', '25m_evenbetter_sinewave_long', '90m_stc', '25m_fisher', '25m_trendflex_lag1', '90m_evenbetter_sinewave_long_lag1', '90m_voss', '90m_williams_r_lag3', '90m_acc_swing_index_dt_lag1', '90m_evenbetter_sinewave_short_lag1', '25m_williams_r_lag2', '25m_trendflex_lag2', '90m_fisher_lag2', '25m_evenbetter_sinewave_long_lag1', '90m_vwap_dt', '90m_fisher_dt', '5h_adaptive_bp_lead_lag28', '90m_trendflex_lag2', '25m_fisher_lag1', '5h_williams_r_dt', '5h_phase_accumulation_ddt_lag25', '25m_evenbetter_sinew

<custom_indicators.toolbox.feature_selection.fcq_selector.FCQSelector at 0x158b61210>

In [8]:
side_res = pd.Series(selector.relevance_, index=selector.variables_).sort_values(
    ascending=False
)
side_res

90m_williams_r                   1.411396e+04
90m_williams_r_lag1              8.453459e+03
90m_fisher                       6.899939e+03
90m_trendflex                    6.514529e+03
5h_williams_r                    6.105675e+03
                                     ...     
25m_mod_rsi_ddt_lag34            1.221067e-06
90m_adaptive_bp_lag27            8.027391e-07
90m_voss_ddt_lag13               4.287504e-07
90m_adaptive_bp_lead_ddt_lag5    2.214192e-07
25m_adaptive_rsi_ddt_lag37       1.402542e-07
Length: 11841, dtype: float64

In [9]:
import json

with open(f"data/side_features_{selector.__class__.__name__}.json", "w") as f:
    json.dump(side_res.to_dict(), f, indent=4)

In [10]:
import json

import pandas as pd

with open(f"data/side_features_FCQSelector.json", "r") as f:
    side_res = pd.Series(json.load(f))

In [11]:
DOLLAR_BAR_SHORT_TERM = "25m"
DOLLAR_BAR_MEDIUM_TERM = "90m"
DOLLAR_BAR_LONG_TERM = "5h"

feature_info = {
    "side": {
        DOLLAR_BAR_SHORT_TERM: [],
        DOLLAR_BAR_MEDIUM_TERM: [],
        DOLLAR_BAR_LONG_TERM: [],
    },
    "meta": {
        DOLLAR_BAR_SHORT_TERM: [],
        DOLLAR_BAR_MEDIUM_TERM: [],
        DOLLAR_BAR_LONG_TERM: [],
        "model_res": [],
    },
}

for k, v in side_res[: len(side_res) // 4].to_dict().items():
    if k.startswith(f"{DOLLAR_BAR_SHORT_TERM}_"):
        feature_info["side"][DOLLAR_BAR_SHORT_TERM].append(k)
    elif k.startswith(f"{DOLLAR_BAR_MEDIUM_TERM}_"):
        feature_info["side"][DOLLAR_BAR_MEDIUM_TERM].append(k)
    elif k.startswith(f"{DOLLAR_BAR_LONG_TERM}_"):
        feature_info["side"][DOLLAR_BAR_LONG_TERM].append(k)

feature_info["side"][DOLLAR_BAR_SHORT_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_SHORT_TERM])),
)
feature_info["side"][DOLLAR_BAR_MEDIUM_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_MEDIUM_TERM])),
)
feature_info["side"][DOLLAR_BAR_LONG_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_LONG_TERM])),
)

feature_info["meta"][DOLLAR_BAR_SHORT_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_SHORT_TERM])),
)
feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM])),
)
feature_info["meta"][DOLLAR_BAR_LONG_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_LONG_TERM])),
)
feature_info["meta"]["model_res"] = sorted(
    list(set(feature_info["meta"]["model_res"])),
)

with open("model/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd

from model.config import SIDE_ALL

df_features = pd.read_parquet("data/features_90m.parquet")
meta_label = np.load("data/label_meta.npy")

side_model = lgb.Booster(model_file="model/model_side.txt")
side_model_pred_prob = side_model.predict(df_features[SIDE_ALL])
df_features["side_model_res"] = side_model_pred_prob

assert df_features.shape[0] == meta_label.shape[0]
assert "side_model_res" in df_features.columns

df_features.isna().sum(axis=0).sort_values(ascending=False)

5h_dft_dom_cycle_ddt_lag39    300
5h_dft_dom_cycle_ddt_lag38    296
5h_dft_dom_cycle_dt_lag39     296
5h_dft_dom_cycle_lag39        293
5h_dft_dom_cycle_ddt_lag37    293
                             ... 
90m_comb_spectrum_pwr_14        0
90m_comb_spectrum_pwr_13        0
90m_comb_spectrum_pwr_12        0
90m_comb_spectrum_pwr_11        0
side_model_res                  0
Length: 11842, dtype: int64

In [2]:
meta_features = df_features[300:]
meta_label = meta_label[300:]

meta_label = pd.Series(meta_label[:, 6].astype(int), index=meta_features.index)

# meta_features = meta_features[meta_label["ret"].notna()]
# meta_label = meta_label[meta_label["ret"].notna()]["bin"]

print(meta_features.shape)
print(meta_label.shape)

meta_features.isna().sum(axis=0).sort_values(ascending=False)

(29441, 11842)
(29441,)


5h_ac_0                     0
25m_ac_4                    0
90m_williams_r_ddt_lag35    0
90m_williams_r_ddt_lag36    0
90m_williams_r_ddt_lag37    0
                           ..
90m_ac_2                    0
90m_ac_3                    0
90m_ac_4                    0
90m_ac_5                    0
side_model_res              0
Length: 11842, dtype: int64

In [3]:
# meta feature selection
from custom_indicators.toolbox.feature_selection.fcq_selector import FCQSelector

selector = FCQSelector(max_features=meta_features.shape[1] // 4)
selector.fit(meta_features, meta_label)
meta_res = pd.Series(selector.relevance_, index=selector.variables_).sort_values(
    ascending=False
)

  from .autonotebook import tqdm as notebook_tqdm


➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性...
✓ 选择第1个特征: 5h_vwap_lag1 (最大F值: 68.0548)
➤ 计算特征冗余度...
➤ 总计选择2960个特征 (已选择1个，还需选择2959个)...
➤ 开始MRMR迭代选择过程...


选择特征: 100%|███████████████████████████████████████████████| 2959/2959 [30:28<00:00,  1.62特征/s]



✅ 特征选择完成：从11842个特征中选择了2960个，舍弃了8882个
✅ 选择的特征: ['5h_vwap_lag1', '90m_adaptive_cci_dt_lag20', '25m_hurst_coef_slow_dt_lag21', '90m_natr_ddt', '5h_voss_filt_dt_lag21', '25m_comb_spectrum_dom_cycle_dt_lag13', '25m_adaptive_cci_dt_lag25', '25m_hurst_coef_fast', '5h_dft_dom_cycle_dt_lag12', '5h_acp_pwr_5', '5h_voss_filt_ddt_lag30', '5h_hurst_coef_fast_lag34', '5h_evenbetter_sinewave_long_ddt_lag6', '25m_homodyne_dt_lag25', '90m_homodyne_ddt_lag1', '90m_dft_dom_cycle_dt_lag33', '25m_adaptive_bp_lead_dt_lag11', '5h_williams_r_dt_lag27', '5h_hurst_coef_slow_ddt_lag9', '90m_vwap_dt_lag16', '5h_comb_spectrum_pwr_18', '5h_vwap_ddt_lag18', '90m_vwap_ddt_lag9', '25m_evenbetter_sinewave_long', '90m_dual_diff_ddt_lag24', '25m_vwap_ddt_lag24', '25m_vwap', '90m_dft_dom_cycle_dt_lag19', '5h_dft_dom_cycle_ddt_lag7', '5h_pfe_lag34', '90m_natr_dt', '5h_comb_spectrum_dom_cycle_dt', '90m_stc_ddt_lag39', '5h_adaptive_bp_ddt_lag15', '25m_dual_diff_ddt_lag3', '25m_acp_pwr_2', '90m_ehlers_early_onset_trend_ddt_l

In [4]:
import json

with open(f"data/meta_features_{selector.__class__.__name__}.json", "w") as f:
    json.dump(meta_res.to_dict(), f, indent=4)

In [5]:
import json

import pandas as pd

with open("data/meta_features_FCQSelector.json", "r") as f:
    meta_res = pd.Series(json.load(f))

In [6]:
meta_res.to_dict()["side_model_res"]

17.72146717089685

In [7]:
from pathlib import Path

feature_info = Path("model/feature_info.json")
if feature_info.exists():
    with open(feature_info, "r") as f:
        feature_info = json.load(f)

DOLLAR_BAR_SHORT_TERM = "25m"
DOLLAR_BAR_MEDIUM_TERM = "90m"
DOLLAR_BAR_LONG_TERM = "5h"


feature_info["meta"][DOLLAR_BAR_SHORT_TERM] = []
feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM] = []
feature_info["meta"][DOLLAR_BAR_LONG_TERM] = []
feature_info["meta"]["model_res"] = []

for k, v in meta_res[: len(meta_res) // 4].to_dict().items():
    if k.startswith(f"{DOLLAR_BAR_SHORT_TERM}_"):
        feature_info["meta"][DOLLAR_BAR_SHORT_TERM].append(k)
    elif k.startswith(f"{DOLLAR_BAR_MEDIUM_TERM}_"):
        feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM].append(k)
    elif k.startswith(f"{DOLLAR_BAR_LONG_TERM}_"):
        feature_info["meta"][DOLLAR_BAR_LONG_TERM].append(k)
    else:
        feature_info["meta"]["model_res"].append(k)

feature_info["side"][DOLLAR_BAR_SHORT_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_SHORT_TERM])),
)
feature_info["side"][DOLLAR_BAR_MEDIUM_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_MEDIUM_TERM])),
)
feature_info["side"][DOLLAR_BAR_LONG_TERM] = sorted(
    list(set(feature_info["side"][DOLLAR_BAR_LONG_TERM])),
)

feature_info["meta"][DOLLAR_BAR_SHORT_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_SHORT_TERM])),
)
feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_MEDIUM_TERM])),
)
feature_info["meta"][DOLLAR_BAR_LONG_TERM] = sorted(
    list(set(feature_info["meta"][DOLLAR_BAR_LONG_TERM])),
)
feature_info["meta"]["model_res"] = sorted(
    list(set(feature_info["meta"]["model_res"])),
)

with open("model/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)