In [486]:
import os
import random
import re

import matplotlib.patches as mpathes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import ticker
from numba import jit
from tqdm import tqdm

import sklearn
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.tab_model import TabNetRegressor
from xgboost import XGBRegressor

In [6]:
from PqiDataSdk import PqiDataSdk

ds = PqiDataSdk(user="zyding", size=128, pool_type="mt")

In [20]:
tickers_zz500 = ds.get_index_weight(ticker='000905').StockTicker.values
tickers_zz1000 = ds.get_index_weight(ticker='000852').StockTicker.values
tickers_hs300 = ds.get_index_weight(ticker='399300').StockTicker.values

tickers_zz1800 = np.concatenate([tickers_hs300, tickers_zz500, tickers_zz1000])

In [102]:
tickers = ds.get_ticker_list(date='all')
start_date = '20170101'
end_date = '20191231'
lst_trade_date = ds.get_trade_dates(start_date=start_date, end_date=end_date)


path_FGv3 = '/home/zyding/factor_garden_v3/'
lst_FGv3 = [i[:-3] for i in os.listdir(path_FGv3+'eod_feature/')]
path_NSF = '/home/zyding/neuron_support_fac/'
lst_NSF = [i[:-3] for i in os.listdir(path_NSF+'eod_feature/')]

In [103]:
factor_data_FGv3 = ds.get_eod_feature(fields=lst_FGv3,
                                      where=path_FGv3,
                                      tickers=list(tickers_zz1800),
                                      dates=lst_trade_date)
factor_data_NSF = ds.get_eod_feature(fields=lst_NSF,
                                     where=path_NSF,
                                     tickers=list(tickers_zz1800),
                                     dates=lst_trade_date)

In [109]:
eod_data = ds.get_eod_history(fields=['OpenPrice', 'ClosePrice'], tickers=tickers_zz1800, start_date='20170101',
                              end_date='20191231', day_type='trade', price_mode='former')

In [174]:
df_ret_eod = eod_data['ClosePrice'].apply(lambda x: x/x.shift()-1, axis=1)

In [500]:
#y_train = df_ret_eod.stack(dropna=False)
y_train = df_ret_eod.shift(-1, axis=1).stack(dropna=False)
y_train.name = 'ret'

In [501]:
X_idx = factor_data_FGv3[0].to_dataframe().stack(dropna=False).index

In [502]:
def transform_to_2d(pqidata):
    shape = (pqidata.shape[1]*pqidata.shape[2], pqidata.shape[0])
    return pqidata.values.transpose((1, 2, 0)).reshape(shape)

In [503]:
X_data = np.concatenate([transform_to_2d(factor_data_FGv3),
                         transform_to_2d(factor_data_NSF)], axis=1)

In [504]:
col = [f'fac_FGv3_{i+1}' for i in range(factor_data_FGv3.shape[0])]+[
    f'fac_NSF_{i+1}' for i in range(factor_data_NSF.shape[0])]

In [505]:
%time X_train = pd.DataFrame(index=X_idx, columns=col, data=X_data)

CPU times: user 543 µs, sys: 49 µs, total: 592 µs
Wall time: 595 µs


In [506]:
X_train

Unnamed: 0,Unnamed: 1,fac_FGv3_1,fac_FGv3_2,fac_FGv3_3,fac_FGv3_4,fac_FGv3_5,fac_FGv3_6,fac_FGv3_7,fac_FGv3_8,fac_FGv3_9,fac_FGv3_10,...,fac_NSF_1041,fac_NSF_1042,fac_NSF_1043,fac_NSF_1044,fac_NSF_1045,fac_NSF_1046,fac_NSF_1047,fac_NSF_1048,fac_NSF_1049,fac_NSF_1050
600519,20170103,1.332,-0.000,0.000,-0.320,0.206,0.003,0.719,0.001,-0.002,0.082,...,-56079.912,0.731,0.000,-0.073,-171025.529,-3679384.943,-206417.000,3258770.004,0.045,2358919.481
600519,20170104,-3.225,-0.001,-0.000,-0.136,-0.062,0.002,0.831,0.001,-0.002,0.102,...,-9370937.056,0.923,0.962,5.524,482399.174,-14131792.189,789557.810,19937659.792,-0.708,13913257.710
600519,20170105,-0.068,-0.001,-0.000,0.029,-0.013,0.003,1.246,0.002,-0.002,0.093,...,-1446071.934,0.885,0.923,7.215,-301959.637,-4885990.260,-241860.840,5199351.114,-1.432,5572031.075
600519,20170106,1.969,-0.001,-0.000,-0.179,-0.084,0.004,1.352,0.002,-0.002,0.128,...,-3060222.447,0.346,0.385,1.142,-2755.505,-7124068.636,-551635.620,7397897.191,0.630,4625793.276
600519,20170109,1.136,-0.001,-0.000,-0.568,-0.090,0.005,0.998,0.000,-0.002,0.145,...,-1630085.876,0.808,0.385,-0.118,-204788.894,-3368545.452,-258023.500,2636398.288,-0.229,2045580.347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603530,20191225,-0.959,-0.001,-0.005,-0.023,-0.401,0.003,0.082,-0.000,-0.002,-0.022,...,95021.754,0.231,0.000,0.748,8555.185,-393739.500,14664.000,346885.200,-0.408,217248.852
603530,20191226,-1.075,-0.000,-0.004,-0.020,-0.452,0.008,-0.537,-0.001,-0.001,-0.024,...,260746.610,0.808,0.846,1.225,47512.037,-406219.700,-3188.710,448667.500,-0.063,294257.628
603530,20191227,-0.916,0.000,-0.004,0.204,-0.444,0.006,-0.816,-0.001,-0.002,-0.047,...,2272.402,0.885,0.923,5.103,38977.333,-808952.500,50256.000,552377.600,1.228,544449.792
603530,20191230,-0.294,0.000,-0.001,0.064,-0.440,0.005,-2.121,-0.001,-0.000,-0.031,...,107902.364,0.923,0.308,-0.313,29452.222,-267698.000,-6046.000,405074.100,0.331,198753.472


In [507]:
df_train = pd.concat([X_train, y_train], axis=1)

In [508]:
df_train = df_train[df_train.ret.notnull()]

In [509]:
df_train = df_train.loc[:, df_train.notnull().sum()/len(df_train) > 0.9]

In [510]:
df_train = df_train.dropna(thresh=0.9*df_train.shape[1]).fillna(0)

In [511]:
df_train = df_train.replace(np.inf, 0).replace(-np.inf, 0)

In [512]:
X_sub = df_train.drop('ret', axis=1)
y_sub = df_train['ret'].values

In [513]:
X_sub = StandardScaler().fit(X_sub).transform(X_sub)

In [514]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_sub, y_sub, test_size=0.15, random_state=42)

In [530]:
tabnet_params = dict(
    n_d=128,  # 可以理解为用来决定输出的隐藏层神经元个数。n_d越大，拟合能力越强，也容易过拟合
    n_a=128,   # 可以理解为用来决定下一决策步特征选择的隐藏层神经元个数
    n_steps=4,  # 决策步的个数。可理解为决策树中分裂结点的次数
    gamma=1.2,  # 决定历史所用特征在当前决策步的特征选择阶段的权重，gamma=1时，表示每个特征在所有决策步中至多仅出现1次
    lambda_sparse=1e-3,  # 稀疏正则项权重，用来对特征选择阶段的特征稀疏性添加约束,越大则特征选择越稀疏
    optimizer_fn=torch.optim.Adam,  # 优化器
    optimizer_params=dict(lr=1e-3, weight_decay=1e-5),
    mask_type="entmax",
    seed=42
)

In [531]:
reg_tabnet = TabNetRegressor(**tabnet_params)

Device used : cuda


In [532]:
reg_tabnet.fit(
    X_train_sub, y_train_sub.reshape(-1, 1),
    eval_set=[(X_val, y_val.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=300,  # 最大迭代次数
    patience=10,    # 在验证集上早停次数，
    batch_size=512,  # BN作用在的输入特征batch
    virtual_batch_size=512,  # 除了作用于模型输入特征的第一层BN外，都是用的是ghost BN。
    drop_last=False
)

epoch 0  | loss: 0.71494 | val_0_rmse: 0.04027 |  0:01:31s
epoch 1  | loss: 0.00307 | val_0_rmse: 0.02961 |  0:03:03s
epoch 2  | loss: 0.00168 | val_0_rmse: 0.02928 |  0:04:33s
epoch 3  | loss: 0.00113 | val_0_rmse: 0.02912 |  0:06:05s
epoch 4  | loss: 0.00093 | val_0_rmse: 0.02916 |  0:07:36s
epoch 5  | loss: 0.00089 | val_0_rmse: 0.0291  |  0:09:07s
epoch 6  | loss: 0.00088 | val_0_rmse: 0.02909 |  0:10:38s
epoch 7  | loss: 0.00087 | val_0_rmse: 0.02909 |  0:12:09s
epoch 8  | loss: 0.00087 | val_0_rmse: 0.02909 |  0:13:41s
epoch 9  | loss: 0.00086 | val_0_rmse: 0.02909 |  0:15:11s
epoch 10 | loss: 0.00086 | val_0_rmse: 0.02908 |  0:16:42s
epoch 11 | loss: 0.00086 | val_0_rmse: 0.02908 |  0:18:13s
epoch 12 | loss: 0.00086 | val_0_rmse: 0.02908 |  0:19:44s
epoch 13 | loss: 0.00086 | val_0_rmse: 0.02908 |  0:21:16s
epoch 14 | loss: 0.00086 | val_0_rmse: 0.02909 |  0:22:47s
epoch 15 | loss: 0.00086 | val_0_rmse: 0.02908 |  0:24:19s
epoch 16 | loss: 0.00086 | val_0_rmse: 0.02909 |  0:25:5

In [516]:
xgb_params = dict(max_depth=8,
                  learning_rate=0.1,
                  eta=0.1,
                  n_estimators=1000,
                  silent=None,
                  objective='reg:squarederror',
                  booster='gbtree',
                  n_jobs=-1,
                  nthread=None,
                  gamma=0,
                  min_child_weight=1,
                  max_delta_step=0,
                  subsample=0.8,
                  colsample_bytree=1,
                  colsample_bylevel=1,
                  colsample_bynode=1,
                  scale_pos_weight=1,
                  base_score=0.5,
                  random_state=42)

In [521]:
reg_xgb = XGBRegressor(**xgb_params)

In [522]:
reg_xgb.fit(X_train_sub, y_train_sub.reshape(-1, 1),
            eval_set=[(X_val, y_val.reshape(-1, 1))],
            early_stopping_rounds=40,
            verbose=10)

[0]	validation_0-rmse:0.45114
[10]	validation_0-rmse:0.15960
[20]	validation_0-rmse:0.06183
[30]	validation_0-rmse:0.03449
[40]	validation_0-rmse:0.02946
[50]	validation_0-rmse:0.02876
[60]	validation_0-rmse:0.02866
[70]	validation_0-rmse:0.02862
[80]	validation_0-rmse:0.02861
[90]	validation_0-rmse:0.02860
[100]	validation_0-rmse:0.02857
[110]	validation_0-rmse:0.02855
[120]	validation_0-rmse:0.02855
[130]	validation_0-rmse:0.02855
[140]	validation_0-rmse:0.02854
[150]	validation_0-rmse:0.02854
[160]	validation_0-rmse:0.02854
[170]	validation_0-rmse:0.02852
[180]	validation_0-rmse:0.02851
[190]	validation_0-rmse:0.02851
[200]	validation_0-rmse:0.02851
[210]	validation_0-rmse:0.02851
[220]	validation_0-rmse:0.02851
[230]	validation_0-rmse:0.02851
[240]	validation_0-rmse:0.02850
[250]	validation_0-rmse:0.02850
[260]	validation_0-rmse:0.02849
[270]	validation_0-rmse:0.02849
[280]	validation_0-rmse:0.02849
[290]	validation_0-rmse:0.02849
[300]	validation_0-rmse:0.02849
[310]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, nthread=256, num_parallel_tree=1,
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             silent=None, subsample=0.8, tree_method='exact',
             validate_parameters=1, verbosity=None)