In [1]:
# Set up qlib: gather the imports, then point the library at the local CN data bundle.
import qlib
from qlib.constant import REG_CN
from qlib.utils import init_instance_by_config

data_uri = '~/.qlib/qlib_data/cn_data/'
qlib.init(provider_uri=data_uri, region=REG_CN)

# Each field group is an (expressions, column names) pair.
feature_fields = (['EMA($close, 10)', 'EMA($close, 30)'], ['EMA10', 'EMA30'])
label_fields = (['Ref($close, -1)/$close - 1'], ['RET_1'])

# Describe the loader declaratively so it can be instantiated from the config.
qdl_config = {
    "class": "QlibDataLoader",
    "module_path": "qlib.data.dataset.loader",
    "kwargs": {
        "config": {"feature": feature_fields, "label": label_fields},
        "freq": 'day',
    },
}
qdl = init_instance_by_config(qdl_config)

# CSI 300 stock-pool code; per the original author's note, the instruments
# folder holds a matching sh000300.txt file.
market = 'csi300'
qdl.load(instruments=market, start_time='20200101', end_time='20200110')

[125544:MainThread](2024-10-30 19:51:38,004) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[125544:MainThread](2024-10-30 19:51:38,203) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[125544:MainThread](2024-10-30 19:51:38,204) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/hhh/.qlib/qlib_data/cn_data')}


Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,EMA10,EMA30,RET_1
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2020-01-02,SH600000,9.171472,9.060854,0.010425
2020-01-02,SH600004,6.221383,6.198955,-0.007991
2020-01-02,SH600009,25.708969,25.645361,-0.000776
2020-01-02,SH600010,4.653900,4.625497,0.007519
2020-01-02,SH600011,2.117210,2.154742,0.000000
...,...,...,...,...
2020-01-10,SZ300347,17.478148,17.084019,-0.005612
2020-01-10,SZ300408,2.253388,2.172849,0.002635
2020-01-10,SZ300413,5.067583,4.742254,0.017136
2020-01-10,SZ300433,2.601543,2.478502,0.060436


In [2]:
# 实现一个自定义的特征集，MACD、RSI

from qlib.data.dataset.handler import DataHandlerLP

class MyFeatureSet(DataHandlerLP):
    """Data handler exposing two custom features (MACD, RSI) and one label column.

    The constructor assembles a ``QlibDataLoader`` config from
    :meth:`get_feature_config` and :meth:`get_label_config` and passes it,
    together with the processors, to ``DataHandlerLP.__init__``.

    Args:
        instruments: Instrument pool forwarded to the parent handler.
        start_time: Start of the data range forwarded to the parent handler.
        end_time: End of the data range forwarded to the parent handler.
        freq: Frequency string placed in the data-loader config.
        infer_processors: Processor list forwarded to the parent handler;
            ``None`` (the default) means an empty list.
        learn_processors: Processor list forwarded to the parent handler;
            ``None`` (the default) means an empty list.
        fit_start_time: Accepted for signature compatibility; it is not
            forwarded anywhere by this class.
        fit_end_time: Accepted for signature compatibility; it is not
            forwarded anywhere by this class.
        process_type: Forwarded to the parent handler.
        filter_pipe: Placed in the data-loader config.
        **kwargs: Only the optional ``label`` key is read (it overrides
            :meth:`get_label_config`); any other key is ignored.
    """

    def __init__(self,
                 instruments="csi300",
                 start_time=None,
                 end_time=None,
                 freq="day",
                 infer_processors=None,
                 learn_processors=None,
                 fit_start_time=None,
                 fit_end_time=None,
                 process_type=DataHandlerLP.PTYPE_A,
                 filter_pipe=None,
                 **kwargs,
                 ):
        # Use None sentinels instead of literal [] defaults: a default list is
        # created once and would be shared by every instance of the handler.
        if infer_processors is None:
            infer_processors = []
        if learn_processors is None:
            learn_processors = []

        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": {
                    "feature": self.get_feature_config(),
                    # A caller-supplied label config wins over the built-in one.
                    "label": kwargs.get("label", self.get_label_config()),
                },
                "filter_pipe": filter_pipe,
                "freq": freq,
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            process_type=process_type,
        )

    def get_feature_config(self):
        """Return ``(expressions, names)`` for the MACD and RSI feature columns."""
        # (EMA12 - EMA26) / close, minus the 9-period EMA of that ratio (itself divided by close).
        MACD = '(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'
        # 100 - 100 / (1 + gain_term / loss_term), both terms computed over a 14-period window.
        # NOTE(review): confirm how Count treats the boolean comparison (True entries
        # only vs. all non-NaN rows); the two readings give different denominators.
        RSI = '100 - 100/(1+(Sum(Greater($close-Ref($close, 1),0), 14)/Count(($close-Ref($close, 1))>0, 14))/ (Sum(Abs(Greater(Ref($close, 1)-$close,0)), 14)/Count(($close-Ref($close, 1))<0, 14)))'

        return [MACD, RSI], ['MACD', 'RSI']

    def get_label_config(self):
        """Return the default ``(expressions, names)`` pair for the single LABEL column."""
        return (["Ref($close, -1)/$close - 1"], ["LABEL"])

# Constructing the handler already performs the data load.
my_feature = MyFeatureSet(
    instruments='csi300',
    start_time='2020-01-01',
    end_time='2020-06-30',
)

# Fetch all columns; use fetch(col_set='feature') or fetch(col_set='label')
# to retrieve a single column group instead.
my_feature.fetch()

[125544:MainThread](2024-10-30 19:51:40,174) INFO - qlib.timer - [log.py:127] - Time cost: 1.143s | Loading data Done
[125544:MainThread](2024-10-30 19:51:40,175) INFO - qlib.timer - [log.py:127] - Time cost: 0.000s | fit & process data Done
[125544:MainThread](2024-10-30 19:51:40,175) INFO - qlib.timer - [log.py:127] - Time cost: 1.145s | Init data Done


Unnamed: 0_level_0,Unnamed: 1_level_0,MACD,RSI,LABEL
datetime,instrument,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,SH600000,0.006354,76.146812,0.010425
2020-01-02,SH600004,0.002323,54.615391,-0.007991
2020-01-02,SH600009,0.002592,44.651718,-0.000776
2020-01-02,SH600010,0.003610,64.705887,0.007519
2020-01-02,SH600011,-0.006209,46.551727,0.000000
...,...,...,...,...
2020-06-30,SZ300413,0.054595,81.474556,-0.039496
2020-06-30,SZ300433,0.062837,88.799995,0.053560
2020-06-30,SZ300498,0.048884,46.736053,0.023864
2020-06-30,SZ300601,0.031392,87.047409,-0.009254


In [3]:
# Imports for the Alpha158 / time-series dataset stage, gathered ahead of initialisation.
import qlib
from qlib.constant import REG_CN
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.model.pytorch_alstm_ts import ALSTM
from qlib.data.dataset import TSDatasetH
from qlib.data.dataset.handler import DataHandlerLP
from qlib.utils import init_instance_by_config  # config-driven instantiation helper

# Point qlib at the local CN data bundle.
data_uri = '~/.qlib/qlib_data/cn_data/'
qlib.init(provider_uri=data_uri, region=REG_CN)

# Data configuration: (start, end) date pairs for each dataset segment.
train_period = ("2017-01-01", "2018-12-31")
valid_period = ("2019-01-01", "2019-12-31")
test_period = ("2020-01-01", "2020-08-01")

# The processor fit window is restricted to the training segment. A fit window
# that extends into the validation year would let validation-period statistics
# shape the feature normalisation (look-ahead leakage).
dh = Alpha158(
    instruments='csi300',
    start_time=train_period[0],
    end_time=test_period[1],
    fit_start_time=train_period[0],
    fit_end_time=train_period[1],
    infer_processors=[
        # clip_outlier is a real boolean rather than the string "true".
        {"class": "RobustZScoreNorm", "kwargs": {"fields_group": "feature", "clip_outlier": True}},
        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
    ],
    learn_processors=[
        "DropnaLabel",
        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
    ],
)

# Time-series dataset: each sample spans step_len time steps.
ds = TSDatasetH(
    handler=dh,
    step_len=20,
    segments={
        "train": train_period,
        "valid": valid_period,
        "test": test_period,
    },
)

# Model configuration (currently disabled)
# model = ALSTM(d_feat=158,
#               metric='loss',
#               rnn_type='GRU',
#               batch_size=800,
#               early_stop=10)  # all other parameters keep their defaults


# Model training via fit() (currently disabled)
# model.fit(dataset=ds,
#           save_path=None)  # path for saving the model; per the original note it defaults to the current directory

[125544:MainThread](2024-10-30 19:51:40,195) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[125544:MainThread](2024-10-30 19:51:40,198) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[125544:MainThread](2024-10-30 19:51:40,198) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/hhh/.qlib/qlib_data/cn_data')}


ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)
ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).
-------------DropnaLabel: label


  from .autonotebook import tqdm as notebook_tqdm
[125544:MainThread](2024-10-30 19:51:54,838) INFO - qlib.timer - [log.py:127] - Time cost: 13.310s | Loading data Done
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
[125544:MainThread](2024-10-30 19:51:59,066) INFO - qlib.timer - [log.py:127] - Time cost: 4.134s | RobustZScoreNorm Done
[125544:MainThread](2024-10-30 19:51:59,203) INFO - qlib.timer - [log.py:127] - Time cost: 0.137s | Fillna Done
[125544:MainThread](2024-10-30 19:51:59,290) INFO - qlib.timer - [log.py:127] - Time cost: 0.055s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = t
[125544:MainThread](2024-10-30 19:51:59,372) INFO - qlib.timer - [log.py:127] - Time cost: 0.081s | CSRankNorm Done
[125544:MainThr

In [4]:
# Prepare the "train" segment with default arguments; the recorded output shows a TSDataSampler.
ds.prepare("train")

<qlib.data.dataset.TSDataSampler at 0x74de6fd1ff90>

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from qlib.data.dataset.handler import DataHandlerLP

# Training segment with both the feature and label column groups.
# NOTE(review): DK_L is presumably the learn-processor view of the data — confirm against the qlib docs.
dl_train = ds.prepare(
    "train",
    col_set=["feature", "label"],
    data_key=DataHandlerLP.DK_L,
)

In [6]:
# Configure the sampler's NaN handling with the "ffill+bfill" strategy, so NaNs
# introduced by the data loader do not reach the model.
dl_train.config(fillna_type="ffill+bfill")

In [7]:
# Shuffled mini-batch loader over the training sampler; a trailing partial batch is dropped.
loader_options = {
    "batch_size": 8192,
    "shuffle": True,
    "num_workers": 0,
    "drop_last": True,
}
train_loader = DataLoader(dl_train, **loader_options)

In [8]:
# Keep the batch at index 1 (the second one yielded) as a sample and stop iterating.
for batch_idx, inputs in enumerate(train_loader):
    if batch_idx != 1:
        continue
    one_batch_data = inputs
    break


In [9]:
# Shape of a single batch of data.
one_batch_data.shape

torch.Size([8192, 20, 159])

In [10]:

# Everything but the last column is a feature -> (8192, 20, 158) for the sampled batch.
feature = one_batch_data[..., :-1]
# The label is the last column at the final time step -> (8192,).
label = one_batch_data[:, -1, -1]
for tensor in (feature, label):
    print(tensor.shape)


torch.Size([8192, 20, 158])
torch.Size([8192])


In [11]:
# Size of the last axis of `feature` (158 in the recorded run).
feature.shape[2]

158

In [12]:
from transformerModule import AddNormLayer, Attention, FeedForwardNetwork, PositionalEncoding

# Width of the last axis of `feature` (158 in the recorded run).
num_features = feature.size(2)
# NOTE(review): axis 0 of `feature` is the batch axis (8192 here), not the
# 20-step time axis — confirm this is the length PositionalEncoding expects.
max_len = feature.size(0)

# Build the layers in a fixed order: positional encoding, add & norm, attention.
instance = PositionalEncoding(num_features, max_len)
addNorm = AddNormLayer(num_features, dropout=0.1)
attn = Attention(num_features, dropout=0.1)

In [15]:

# Positional encoding applied to the float-cast features.
posi_out=instance(feature.float())
# Add & norm layer receives the encoded tensor together with the float-cast raw features.
# NOTE(review): addNorm and attn were built with dropout=0.1; if they run in
# training mode the outputs will differ between executions — confirm intent.
addNorm_out = addNorm(posi_out, feature.float())
# Attention over the add & norm output.
attn_out=attn(addNorm_out)

In [None]:
# Feed-forward block sized by num_features, applied to the attention output.
feedForward = FeedForwardNetwork(num_features)
feed_out = feedForward(attn_out)
# Disabled training-loop sketch.
# NOTE(review): it references `target`, which is not defined in the visible cells.
# loss = nn.MSELoss()
# optimizer = optim.Adam(feedForward.parameters(), lr=0.001)

# for epoch in range(100):
#     optimizer.zero_grad()
#     pred = feedForward.forward(attn_out)
#     loss_val = loss(pred, target)
#     loss_val.backward()
#     optimizer.step()
#     if epoch % 10 == 0:
#         print(f"Epoch {epoch+1}: Loss {loss_val.item():.4f}")      

In [None]:
# Inspect the shape of the feed-forward output.
feed_out.shape

In [None]:
# Display the label tensor taken from the sampled batch.
label