In [17]:
import numpy as np
import torch.nn as nn
import torch
import math
seed = 32
np.random.seed(seed) 
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x78cd1d0e10d0>

In [1]:
# 初始化配置
import qlib
from qlib.constant import REG_CN
data_uri = '~/.qlib/qlib_data/cn_data/'
qlib.init(provider_uri=data_uri, region=REG_CN)

# 使用"配置"进行实例化
from qlib.utils import init_instance_by_config
qdl_config = {
    "class": "QlibDataLoader",
    "module_path": "qlib.data.dataset.loader",
    "kwargs": {
        "config": {
            "feature": (['EMA($close, 10)', 'EMA($close, 30)'], ['EMA10', 'EMA30'] ),
            "label": (['Ref($close, -1)/$close - 1',],['RET_1',]),
        },
        "freq": 'day',
    },
}
qdl = init_instance_by_config(qdl_config)
market = 'csi300' # 沪深300股票池代码，在instruments文件夹下有对应的sh000300.txt
qdl.load(instruments=market, start_time='20200101', end_time='20200110')

[235289:MainThread](2024-11-13 21:35:54,387) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[235289:MainThread](2024-11-13 21:35:54,583) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[235289:MainThread](2024-11-13 21:35:54,584) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/hhh/.qlib/qlib_data/cn_data')}


Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,EMA10,EMA30,RET_1
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2020-01-02,SH600000,9.171472,9.060854,0.010425
2020-01-02,SH600004,6.221383,6.198955,-0.007991
2020-01-02,SH600009,25.708969,25.645361,-0.000776
2020-01-02,SH600010,4.653900,4.625497,0.007519
2020-01-02,SH600011,2.117210,2.154742,0.000000
...,...,...,...,...
2020-01-10,SZ300347,17.478148,17.084019,-0.005612
2020-01-10,SZ300408,2.253388,2.172849,0.002635
2020-01-10,SZ300413,5.067583,4.742254,0.017136
2020-01-10,SZ300433,2.601543,2.478502,0.060436


In [2]:
# 实现一个自定义的特征集，MACD、RSI

from qlib.data.dataset.handler import DataHandlerLP

class MyFeatureSet(DataHandlerLP):
    def __init__(self,
                 instruments="csi300", 
                 start_time=None,
                 end_time=None,
                 freq="day",
                 infer_processors=[],
                 learn_processors=[],
                 fit_start_time=None,
                 fit_end_time=None,
                 process_type=DataHandlerLP.PTYPE_A,
                 filter_pipe=None,
                 **kwargs,
                ):
        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": {
                    "feature": self.get_feature_config(),
                    "label": kwargs.get("label", self.get_label_config()), # label可以自定义，也可以使用初始化时候的设置
                },
                "filter_pipe": filter_pipe,
                "freq": freq,
                },
            }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            process_type=process_type,
        )
        
    def get_feature_config(self):
        
        MACD = '(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'
        RSI = '100 - 100/(1+(Sum(Greater($close-Ref($close, 1),0), 14)/Count(($close-Ref($close, 1))>0, 14))/ (Sum(Abs(Greater(Ref($close, 1)-$close,0)), 14)/Count(($close-Ref($close, 1))<0, 14)))'
        
        return [MACD, RSI ], ['MACD', 'RSI']

    def get_label_config(self):
        return (["Ref($close, -1)/$close - 1"], ["LABEL"])

# 初始化的过程中已经完成的数据的load
my_feature = MyFeatureSet(instruments='csi300', start_time='2020-01-01', end_time='2020-06-30')

# my_feature.get_feature_config()
my_feature.fetch() # my_feature.fetch(col_set='feature') / my_feature.fetch(col_set='label')

[235289:MainThread](2024-11-13 21:35:56,399) INFO - qlib.timer - [log.py:127] - Time cost: 1.053s | Loading data Done
[235289:MainThread](2024-11-13 21:35:56,400) INFO - qlib.timer - [log.py:127] - Time cost: 0.000s | fit & process data Done
[235289:MainThread](2024-11-13 21:35:56,400) INFO - qlib.timer - [log.py:127] - Time cost: 1.054s | Init data Done


Unnamed: 0_level_0,Unnamed: 1_level_0,MACD,RSI,LABEL
datetime,instrument,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,SH600000,0.006354,76.146812,0.010425
2020-01-02,SH600004,0.002323,54.615391,-0.007991
2020-01-02,SH600009,0.002592,44.651718,-0.000776
2020-01-02,SH600010,0.003610,64.705887,0.007519
2020-01-02,SH600011,-0.006209,46.551727,0.000000
...,...,...,...,...
2020-06-30,SZ300413,0.054595,81.474556,-0.039496
2020-06-30,SZ300433,0.062837,88.799995,0.053560
2020-06-30,SZ300498,0.048884,46.736053,0.023864
2020-06-30,SZ300601,0.031392,87.047409,-0.009254


In [1]:

import qlib
from qlib.constant import REG_CN
data_uri = '~/.qlib/qlib_data/cn_data/'
qlib.init(provider_uri=data_uri, region=REG_CN)
from qlib.data.dataset.handler import DataHandlerLP

# 使用"配置"进行实例化
from qlib.utils import init_instance_by_config
from qlib.contrib.data.handler import Alpha158
from qlib.data.dataset import TSDatasetH
from qlib.contrib.model.pytorch_alstm_ts import ALSTM

# 配置数据
train_period = ("2017-01-01", "2018-12-31")
valid_period = ("2019-01-01", "2019-12-31")
test_period = ("2020-01-01", "2020-08-01")

dh = Alpha158(instruments='csi300', 
              start_time=train_period[0], 
              end_time=test_period[1],
             fit_start_time = "2018-01-01",
             fit_end_time = "2019-12-31",
              infer_processors= [
                        {"class": "RobustZScoreNorm", "kwargs": {"fields_group": "feature", "clip_outlier": "true"}},
                        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
                    ],
            learn_processors= [
                        "DropnaLabel",
                        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},  # CSRankNorm
                    ],
             )
ds = TSDatasetH(handler=dh,
                step_len=20, # 时间步数
                segments={"train": train_period, 
                          "valid": valid_period, 
                          "test": test_period})


[909548:MainThread](2024-11-21 20:43:32,462) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[909548:MainThread](2024-11-21 20:43:32,658) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[909548:MainThread](2024-11-21 20:43:32,659) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/hhh/.qlib/qlib_data/cn_data')}


ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)
ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).
-------------DropnaLabel: label


  from .autonotebook import tqdm as notebook_tqdm
[909548:MainThread](2024-11-21 20:43:47,110) INFO - qlib.timer - [log.py:127] - Time cost: 13.108s | Loading data Done
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
[909548:MainThread](2024-11-21 20:43:50,704) INFO - qlib.timer - [log.py:127] - Time cost: 3.514s | RobustZScoreNorm Done
[909548:MainThread](2024-11-21 20:43:50,834) INFO - qlib.timer - [log.py:127] - Time cost: 0.130s | Fillna Done
[909548:MainThread](2024-11-21 20:43:50,908) INFO - qlib.timer - [log.py:127] - Time cost: 0.048s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = t
[909548:MainThread](2024-11-21 20:43:50,983) INFO - qlib.timer - [log.py:127] - Time cost: 0.074s | CSRankNorm Done
[909548:MainThr

In [2]:
ds.prepare("train")

<qlib.data.dataset.TSDataSampler at 0x74e6f471fb90>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
#%%
from qlib.data.dataset.handler import DataHandlerLP
dl_train = ds.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = ds.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)


In [4]:
dl_train.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill")

In [5]:
import numpy as np
import torch.nn as nn
import torch
import math
seed = 32
np.random.seed(seed) 
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)

train_loader = DataLoader(
dl_train, batch_size=8192, shuffle=False, num_workers=0, drop_last=True
)
valid_loader = DataLoader(
dl_valid, batch_size=8192, shuffle=False, num_workers=0, drop_last=True
)

In [6]:
from transformerModule import train_transformer, test_transformer
n_epochs = 10
for epoch in range(n_epochs):
    print("Epoch %d" % epoch)
    train_transformer(train_loader)
    train_loss, train_score =test_transformer(train_loader)
    val_loss, val_score = test_transformer(valid_loader)
    print("train %.6f, valid %.6f" % (train_score, val_score))

Epoch0.000000
train 1.329304, valid 1.459009
Epoch1.000000
train 1.257950, valid 1.211868
Epoch2.000000
train 1.907494, valid 1.844071
Epoch3.000000
train 1.158470, valid 1.329375
Epoch4.000000
train 1.206187, valid 1.379025
Epoch5.000000
train 1.872241, valid 1.749104
Epoch6.000000
train 1.766189, valid 1.338321
Epoch7.000000
train 1.622868, valid 1.905324
Epoch8.000000
train 1.560341, valid 1.199064
Epoch9.000000
train 1.204868, valid 1.966143


In [6]:
for batch_idx, inputs in enumerate(train_loader):  
    if batch_idx == 1:  
        one_batch_data = inputs
        break   


In [7]:
one_batch_data

tensor([[[ 0.0523, -1.0427,  0.1084,  ..., -0.2180,  0.0299,  0.9825],
         [ 0.0520, -1.1046,  0.1167,  ..., -0.3739,  0.0296, -0.9343],
         [-0.1037, -1.0492, -0.2168,  ..., -0.7956, -0.9760, -0.7053],
         ...,
         [ 0.2251, -0.4222,  0.2639,  ..., -3.0000,  0.2505, -0.0363],
         [ 0.2804, -0.2415,  0.2918,  ..., -3.0000,  0.1524,  0.5968],
         [ 0.1116, -1.3579,  0.3794,  ..., -3.0000,  0.1392, -0.2713]],

        [[ 0.0520, -1.1046,  0.1167,  ..., -0.3739,  0.0296, -0.9343],
         [-0.1037, -1.0492, -0.2168,  ..., -0.7956, -0.9760, -0.7053],
         [-0.4136, -0.9947, -0.8094,  ..., -0.8063, -0.1453, -0.6932],
         ...,
         [ 0.2804, -0.2415,  0.2918,  ..., -3.0000,  0.1524,  0.5968],
         [ 0.1116, -1.3579,  0.3794,  ..., -3.0000,  0.1392, -0.2713],
         [ 0.1667, -0.8083,  0.2678,  ..., -2.0803,  0.3007,  0.3556]],

        [[-0.1037, -1.0492, -0.2168,  ..., -0.7956, -0.9760, -0.7053],
         [-0.4136, -0.9947, -0.8094,  ..., -0

In [8]:
# 一个batch的数据
one_batch_data.shape

torch.Size([300, 20, 159])

In [9]:

# feature: (8192,20,158)
feature = one_batch_data[:, :, 0:-1]
# 8192
label = one_batch_data[:, -1, -1]
print(feature.shape)
print(label.shape)


torch.Size([300, 20, 158])
torch.Size([300])


In [10]:
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
import math
seed = 32
np.random.seed(seed) 
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
from transformerModule import PositionalEncoding, MultiHeadAttention,TransformerModule
# input = torch.randn(20,5,158)
# mul_att1 = MultiHeadAttention(2,512,0)
# mul_att_out1 = mul_att1.forward(input)
# print(mul_att_out1)
# mul_att2 = PositionalEncoding(512, 100)
# mul_att_out2 = mul_att2.forward(input.float())
# 模型输出测试
model = TransformerModule(158, 512, 0.1, 6)
pred = model.forward(feature.float()).squeeze(-1)
train_optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-3)
mask = ~torch.isnan(label)
loss = torch.mean((pred[mask]-label[mask])**2)
train_optimizer.zero_grad()
loss.backward()
train_optimizer.step()
print(pred.shape)
print(pred)


torch.Size([100, 512])
torch.Size([300, 20, 512])
torch.Size([300])
tensor([-1.5486e-01, -2.5506e-01, -2.0934e-01,  1.2445e-01, -3.4901e-01,
        -1.0464e-01,  7.1646e-01,  5.8745e-02, -9.0592e-02,  4.0081e-01,
         6.5468e-02,  8.1091e-02,  7.2064e-01,  2.9356e-01,  3.7602e-01,
         5.4290e-01, -6.7158e-02,  1.4845e-01,  8.6888e-02, -2.5410e-01,
         1.5472e-01, -7.2571e-01,  1.3594e-01,  1.3590e-01, -1.2822e-01,
        -3.8044e-03,  5.7650e-02, -3.9975e-01,  5.1703e-02,  1.1508e-01,
        -5.7981e-02,  2.7665e-01,  2.8618e-01,  2.6640e-01, -1.0574e-01,
         1.5599e-01,  2.3357e-01, -1.3063e-01,  3.8901e-01, -3.6684e-01,
         2.3378e-01,  8.2557e-01, -2.3710e-01,  9.7924e-02, -3.4977e-01,
         1.1858e-01,  1.4912e-01,  7.8634e-02, -5.1963e-01, -2.0113e-01,
        -1.4976e-01, -5.9080e-02,  2.0908e-01, -3.5377e-01, -7.0248e-01,
         3.9274e-01, -2.0073e-02,  1.2155e-01, -2.3120e-01,  4.1904e-02,
         4.5909e-02,  3.1729e-01, -1.7037e-02, -2.3743e-

In [12]:
torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device(type='cuda', index=0)

In [19]:
test = torch.rand(3,4,5)

In [20]:
test

tensor([[[0.7605, 0.8524, 0.6648, 0.9480, 0.4735],
         [0.8731, 0.3785, 0.3258, 0.2027, 0.7167],
         [0.8117, 0.1121, 0.6466, 0.9719, 0.9213],
         [0.3831, 0.3930, 0.5797, 0.6329, 0.2640]],

        [[0.9705, 0.4962, 0.7696, 0.2525, 0.0741],
         [0.3559, 0.4318, 0.9146, 0.9795, 0.4408],
         [0.9902, 0.5128, 0.5951, 0.8934, 0.2516],
         [0.4185, 0.4892, 0.6723, 0.3938, 0.6720]],

        [[0.8764, 0.2443, 0.3268, 0.9680, 0.2575],
         [0.3969, 0.0577, 0.7201, 0.4786, 0.4490],
         [0.8341, 0.8497, 0.9446, 0.2704, 0.6773],
         [0.5130, 0.7120, 0.9063, 0.8501, 0.9215]]])

In [21]:
test[:,-1,:]

tensor([[0.3831, 0.3930, 0.5797, 0.6329, 0.2640],
        [0.4185, 0.4892, 0.6723, 0.3938, 0.6720],
        [0.5130, 0.7120, 0.9063, 0.8501, 0.9215]])

In [23]:
test[:,-1,:].shape

torch.Size([3, 5])

In [7]:
import torch
 
# 检查 PyTorch 版本
print("PyTorch version:", torch.__version__)
 
# 检查是否有可用的 CUDA
if torch.cuda.is_available():
    print("CUDA is available")
    # 打印出 CUDA 的版本信息
    print("CUDA version:", torch.version.cuda)
    # 打印出 GPU 的型号
    print("GPU model:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available")
 
# 检查当前设备是 CPU 还是 GPU
device = "GPU" if torch.cuda.is_available() else "CPU"
print(f"Current device: {device}")

PyTorch version: 2.4.1+cu118
CUDA is available
CUDA version: 11.8
GPU model: NVIDIA GeForce RTX 4070 Ti SUPER
Current device: GPU
