# side model feature selection

In [1]:
import pandas as pd

df_features = pd.read_parquet("data/features_1h.parquet")
print(df_features.shape)
df_features.isna().sum(axis=0).sort_values(ascending=False)

(44867, 11841)


4h_dft_dom_cycle_ddt_lag39    364
4h_dft_dom_cycle_ddt_lag38    360
4h_dft_dom_cycle_dt_lag39     360
4h_dft_dom_cycle_ddt_lag37    356
4h_dft_dom_cycle_lag39        356
                             ... 
1h_ac_23                        0
1h_ac_22                        0
1h_mod_stochastic               0
1h_trendflex                    0
1h_ac_31                        0
Length: 11841, dtype: int64

In [2]:
import numpy as np
from jesse import utils

label = np.load("data/label_side_1h.npy")
df_label = utils.numpy_candles_to_dataframe(label[:, :6]).set_index("date")
df_label["side_label"] = label[:, 6]
df_label.head(1)

Unnamed: 0_level_0,open,close,high,low,volume,side_label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 18:02:00,7235.27,7134.15,7242.0,7101.0,70686.353,-1.0


In [3]:
side_features = df_features.iloc[364:]
side_label = df_label.iloc[364:]

side_label["side_label"] = (side_label["side_label"].astype(int) == 1).astype(int)
side_label = side_label["side_label"].to_numpy()

print(side_features.shape)
print(side_label.shape)

side_features.isna().sum(axis=0).sort_values(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  side_label["side_label"] = (side_label["side_label"].astype(int) == 1).astype(int)


(44503, 11841)
(44503,)


15m_ac_0                   0
4h_ac_4                    0
1h_williams_r_ddt_lag35    0
1h_williams_r_ddt_lag36    0
1h_williams_r_ddt_lag37    0
                          ..
1h_ac_2                    0
1h_ac_3                    0
1h_ac_4                    0
1h_ac_5                    0
4h_williams_r_ddt_lag39    0
Length: 11841, dtype: int64

In [4]:
from custom_indicators.toolbox.feature_selection.fcq_selector import (
    FCQSelector,
)

selector = FCQSelector()
selector.fit(side_features, side_label)

  from .autonotebook import tqdm as notebook_tqdm


➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性...
✓ 选择第1个特征: 1h_williams_r (最大F值: 18058.8125)
➤ 计算特征冗余度...
➤ 总计选择2368个特征 (已选择1个，还需选择2367个)...
➤ 开始MRMR迭代选择过程...


选择特征:  33%|███████████████▏                              | 784/2367 [53:08<1:47:17,  4.07s/特征]


KeyboardInterrupt: 

In [5]:
side_res = pd.Series(selector.relevance_, index=selector.variables_).sort_values(
    ascending=False
)
side_res

1h_williams_r                       2.167859e+04
1h_acc_swing_index_dt               1.120475e+04
1h_trendflex_dt                     9.125613e+03
1h_williams_r_lag1                  8.152869e+03
15m_williams_r                      8.096263e+03
                                        ...     
15m_adaptive_stochastic_dt_lag25    3.684977e-06
1h_hurst_coef_slow_dt_lag37         3.143688e-06
15m_adaptive_stochastic_dt_lag32    1.456168e-06
15m_hurst_coef_fast_lag34           9.868833e-07
15m_hurst_coef_slow_lag4            5.224809e-07
Length: 11841, dtype: float64

In [6]:
import json

with open(f"data/side_features_{selector.__class__.__name__}.json", "w") as f:
    json.dump(side_res.to_dict(), f, indent=4)

In [7]:
import json

import pandas as pd

with open(f"data/side_features_FCQSelector.json", "r") as f:
    side_res = pd.Series(json.load(f))

In [8]:
SHORT_TERM = "15m"
MEDIUM_TERM = "1h"
LONG_TERM = "4h"

feature_info = {
    "side": {SHORT_TERM: [], MEDIUM_TERM: [], LONG_TERM: []},
    "meta": {SHORT_TERM: [], MEDIUM_TERM: [], LONG_TERM: [], "model_res": []},
}

for k, v in side_res[: len(side_res) // 5].to_dict().items():
    if k.startswith(f"{SHORT_TERM}_"):
        feature_info["side"][SHORT_TERM].append(k)
    elif k.startswith(f"{MEDIUM_TERM}_"):
        feature_info["side"][MEDIUM_TERM].append(k)
    elif k.startswith(f"{LONG_TERM}_"):
        feature_info["side"][LONG_TERM].append(k)

feature_info["side"][SHORT_TERM] = sorted(list(set(feature_info["side"][SHORT_TERM])))
feature_info["side"][MEDIUM_TERM] = sorted(list(set(feature_info["side"][MEDIUM_TERM])))
feature_info["side"][LONG_TERM] = sorted(list(set(feature_info["side"][LONG_TERM])))

feature_info["meta"][SHORT_TERM] = sorted(list(set(feature_info["meta"][SHORT_TERM])))
feature_info["meta"][MEDIUM_TERM] = sorted(list(set(feature_info["meta"][MEDIUM_TERM])))
feature_info["meta"][LONG_TERM] = sorted(list(set(feature_info["meta"][LONG_TERM])))
feature_info["meta"]["model_res"] = sorted(list(set(feature_info["meta"]["model_res"])))

with open("custom_indicators/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)

# meta model feature selection

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd

from custom_indicators.config import SIDE_ALL

df_features = pd.read_parquet("data/features_1h.parquet")
meta_label = np.load("data/label_meta_1h.npy")

side_model = lgb.Booster(model_file="custom_indicators/models/model_side.txt")
side_model_pred_prob = side_model.predict(df_features[SIDE_ALL])
df_features["side_model_res"] = side_model_pred_prob

assert df_features.shape[0] == meta_label.shape[0]
assert "side_model_res" in df_features.columns

df_features.isna().sum(axis=0).sort_values(ascending=False)

4h_dft_dom_cycle_ddt_lag39    364
4h_dft_dom_cycle_ddt_lag38    360
4h_dft_dom_cycle_dt_lag39     360
4h_dft_dom_cycle_ddt_lag37    356
4h_dft_dom_cycle_lag39        356
                             ... 
1h_comb_spectrum_pwr_14         0
1h_comb_spectrum_pwr_13         0
1h_comb_spectrum_pwr_12         0
1h_comb_spectrum_pwr_11         0
side_model_res                  0
Length: 11842, dtype: int64

In [2]:
meta_features = df_features[364:]
meta_label = meta_label[364:]

meta_label = pd.Series(meta_label[:, 6].astype(int), index=meta_features.index)

# meta_features = meta_features[meta_label["ret"].notna()]
# meta_label = meta_label[meta_label["ret"].notna()]["bin"]

print(meta_features.shape)
print(meta_label.shape)

meta_label.value_counts()

(44503, 11842)
(44503,)


1    36948
0     7555
Name: count, dtype: int64

In [3]:
# meta feature selection
from custom_indicators.toolbox.feature_selection.fcq_selector_cython import (
    CythonFCQSelector,
)

selector = CythonFCQSelector()
selector.fit(meta_features, meta_label)
meta_res = pd.Series(selector.relevance_, index=selector.variables_).sort_values(
    ascending=False
)

  from .autonotebook import tqdm as notebook_tqdm


➤ 识别数值型变量...
➤ 计算特征与目标变量的相关性...
✓ 选择第1个特征: 1h_natr_dt (最大F值: 174.7976)
➤ 计算特征冗余度...
➤ 总计选择2368个特征 (已选择1个，还需选择2367个)...
➤ 开始MRMR迭代选择过程...


选择特征: 100%|█████████████████████████████████████████████| 2367/2367 [1:51:24<00:00,  2.82s/特征]



✅ 特征选择完成：从11842个特征中选择了2368个，舍弃了9474个
✅ 选择的特征: ['1h_natr_dt', '1h_adaptive_rsi_ddt_lag32', '1h_acc_swing_index', '15m_mod_rsi_ddt_lag21', '4h_evenbetter_sinewave_short_lag15', '4h_hurst_coef_slow_ddt_lag8', '1h_evenbetter_sinewave_long_dt_lag16', '1h_ac_0', '1h_hurst_coef_slow_ddt_lag35', '15m_forecast_oscillator_lag3', '4h_homodyne_ddt', '15m_hurst_coef_fast_dt_lag37', '15m_evenbetter_sinewave_short_ddt_lag7', '4h_acc_swing_index_ddt_lag38', '1h_williams_r_dt_lag16', '4h_hurst_coef_fast_ddt_lag39', '15m_williams_r_ddt_lag37', '1h_comb_spectrum_dom_cycle_dt_lag10', '1h_natr', '1h_natr_ddt', '15m_stc_ddt_lag30', '4h_dft_dom_cycle_ddt_lag8', '15m_conv_40', '4h_comb_spectrum_dom_cycle_dt_lag20', '15m_vwap', '15m_vwap_dt_lag15', '1h_stc_ddt_lag24', '4h_natr_ddt_lag20', '15m_voss_dt_lag18', '1h_dual_diff_dt_lag17', '4h_homodyne_ddt_lag27', '15m_comb_spectrum_dom_cycle_dt_lag37', '4h_williams_r_ddt_lag13', '15m_natr_dt', '1h_vwap_ddt_lag36', '1h_acc_swing_index_lag13', '4h_williams_r_ddt_lag

In [4]:
import json

with open(f"data/meta_features_{selector.__class__.__name__}.json", "w") as f:
    json.dump(meta_res.to_dict(), f, indent=4)

In [5]:
import json

import pandas as pd

with open("data/meta_features_FCQSelector.json", "r") as f:
    meta_res = pd.Series(json.load(f))

In [6]:
meta_res.to_dict()["side_model_res"]

67.03656945037658

In [7]:
from pathlib import Path

feature_info = Path("custom_indicators/feature_info.json")
if feature_info.exists():
    with open(feature_info, "r") as f:
        feature_info = json.load(f)

SHORT_TERM = "15m"
MEDIUM_TERM = "1h"
LONG_TERM = "4h"

feature_info["meta"][SHORT_TERM] = []
feature_info["meta"][MEDIUM_TERM] = []
feature_info["meta"][LONG_TERM] = []
feature_info["meta"]["model_res"] = []

for k, v in meta_res[: len(meta_res) // 5].to_dict().items():
    if k.startswith(f"{SHORT_TERM}_"):
        feature_info["meta"][SHORT_TERM].append(k)
    elif k.startswith(f"{MEDIUM_TERM}_"):
        feature_info["meta"][MEDIUM_TERM].append(k)
    elif k.startswith(f"{LONG_TERM}_"):
        feature_info["meta"][LONG_TERM].append(k)
    else:
        feature_info["meta"]["model_res"].append(k)

feature_info["side"][SHORT_TERM] = sorted(list(set(feature_info["side"][SHORT_TERM])))
feature_info["side"][MEDIUM_TERM] = sorted(list(set(feature_info["side"][MEDIUM_TERM])))
feature_info["side"][LONG_TERM] = sorted(list(set(feature_info["side"][LONG_TERM])))

feature_info["meta"][SHORT_TERM] = sorted(list(set(feature_info["meta"][SHORT_TERM])))
feature_info["meta"][MEDIUM_TERM] = sorted(list(set(feature_info["meta"][MEDIUM_TERM])))
feature_info["meta"][LONG_TERM] = sorted(list(set(feature_info["meta"][LONG_TERM])))
feature_info["meta"]["model_res"] = sorted(list(set(feature_info["meta"]["model_res"])))

with open("custom_indicators/feature_info.json", "w") as f:
    json.dump(feature_info, f, indent=4)