# meta model

In [6]:
import pandas as pd

df_feature = pd.read_parquet("data/features_d002.parquet")
df_label = pd.read_parquet("data/label_d002.parquet")
df_label.head()

Unnamed: 0_level_0,close,ret,trgt,bin,side
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01 00:00:00+00:00,29016.23,,,0.0,0.0
2021-01-01 00:03:00+00:00,28965.02,,,0.0,0.0
2021-01-01 00:06:00+00:00,28928.06,,,0.0,0.0
2021-01-01 00:09:00+00:00,28851.82,,,0.0,0.0
2021-01-01 00:12:00+00:00,28786.75,,,0.0,0.0


In [27]:
import json

with open("data/mrmr_meta_features.json", "r") as f:
    mrmr_meta_features = json.load(f)

meta_features_col = [k for k, v in mrmr_meta_features.items() if v > 50]
len(meta_features_col)

408

In [12]:
invalid_len = 300
train_split_point = int(len(df_feature) * 0.8)
all_feature_col = list(mrmr_meta_features.keys())

df_feature = df_feature.iloc[invalid_len:]
df_label = df_label.iloc[invalid_len:]

meta_train_feature = df_feature.iloc[:train_split_point][all_feature_col]
meta_train_label = df_label.iloc[:train_split_point]
meta_train_feature = meta_train_feature[meta_train_label["ret"].notna().tolist()]
meta_train_label = meta_train_label[meta_train_label["ret"].notna().tolist()][
    "bin"
].astype(int)
assert meta_train_feature.shape[0] == meta_train_label.shape[0]
print(meta_train_feature.shape)

meta_test_feature = df_feature.iloc[train_split_point:][all_feature_col]
meta_test_label = df_label.iloc[train_split_point:]
meta_test_feature = meta_test_feature[meta_test_label["ret"].notna().tolist()]
meta_test_label = meta_test_label[meta_test_label["ret"].notna().tolist()][
    "bin"
].astype(int)
assert meta_test_feature.shape[0] == meta_test_label.shape[0]
print(meta_test_feature.shape)

meta_test_label.value_counts()

(427061, 1263)
(106659, 1263)


bin
1    57018
0    49641
Name: count, dtype: int64

In [28]:
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

AUC = 0
for idx, col in enumerate(mrmr_meta_features.keys()):
    if col in meta_features_col:
        continue
    else:
        meta_features_col.append(col)
    train_data = lgb.Dataset(meta_train_feature[meta_features_col], meta_train_label)
    test_data = lgb.Dataset(meta_test_feature[meta_features_col], meta_test_label)
    params = {
        "objective": "binary",
        "metric": "auc",
        "num_threads": -1,
        "num_leaves": 200,
        "max_depth": 30,
        "early_stopping_round": 50,
        "verbose": -1,
    }
    model = lgb.train(params, train_data, valid_sets=test_data, num_boost_round=500)
    pred_proba = model.predict(meta_test_feature[meta_features_col])
    pred_label = (pred_proba > 0.5).astype(int)
    precision = precision_score(meta_test_label, pred_label)
    recall = recall_score(meta_test_label, pred_label)
    f1 = f1_score(meta_test_label, pred_label)
    auc = roc_auc_score(meta_test_label, pred_proba)
    print(
        f"{idx}: {col} added. precision: {precision}, recall: {recall}, f1: {f1}, auc: {auc}, change: {auc-AUC}"
    )
    AUC = auc

408: 15m_conv_2 added. precision: 0.6487416437278805, recall: 0.6944122908555194, f1: 0.6708005082592122, auc: 0.6840473129462823, change: 0.6840473129462823
409: 15m_acp_pwr_27 added. precision: 0.6479618246522161, recall: 0.7025325335858852, f1: 0.6741446338713206, auc: 0.6867247116311321, change: 0.002677398684849841
410: 15m_ac_25 added. precision: 0.6502012203037777, recall: 0.702725455119436, f1: 0.6754437720200266, auc: 0.6882341662680295, change: 0.0015094546368973338
411: 15m_bandpass_lag3 added. precision: 0.6494538232373386, recall: 0.6996737872250868, f1: 0.6736291105576427, auc: 0.6867401644039215, change: -0.0014940018641079744
412: 15m_swamicharts_stochastic_16 added. precision: 0.6471822348460397, recall: 0.7033217580413202, f1: 0.6740851557378427, auc: 0.6858494431987364, change: -0.0008907212051850744
413: 15m_acp_pwr_15 added. precision: 0.6479411095583217, recall: 0.6993054824792171, f1: 0.6726441512871555, auc: 0.6864709553596542, change: 0.0006215121609177698
414:

KeyboardInterrupt: 

# side models

In [None]:
import json

with open("data/mrmr_side_features.json", "r") as f:
    mrmr_side_features = json.load(f)

side_features_col = [k for k, v in mrmr_side_features.items() if v > 10]
len(side_features_col)

In [None]:
invalid_len = 300
train_split_point = int(len(df_feature) * 0.8)
all_feature_col = list(mrmr_side_features.keys())

side_train_feature = df_feature.iloc[:train_split_point]
side_train_label = df_label.iloc[:train_split_point]
side_train_feature = side_train_feature[
    (side_train_label["side"].astype(int) != 0).tolist()
]
side_train_label = side_train_label[
    (side_train_label["side"].astype(int) != 0).tolist()
]["side"].astype(int)
side_train_label = (side_train_label == 1).astype(int)
assert side_train_feature.shape[0] == side_train_label.shape[0]
print(side_train_feature.shape)

side_test_feature = df_feature.iloc[train_split_point:]
side_test_label = df_label.iloc[train_split_point:]
side_test_feature = side_test_feature[
    (side_test_label["side"].astype(int) != 0).tolist()
]
side_test_label = side_test_label[(side_test_label["side"].astype(int) != 0).tolist()][
    "side"
].astype(int)
side_test_label = (side_test_label == 1).astype(int)
assert side_test_feature.shape[0] == side_test_label.shape[0]
print(side_test_feature.shape)
side_test_label.value_counts()

(427148, 1263)
(106939, 1263)
(220169, 1263)
(57137, 1263)
