In [None]:
%matplotlib inline
import os
# NOTE(review): hard-coded absolute working directory. Relative paths used later in
# this notebook (e.g. 'data/...') resolve against it, so the notebook only runs on a
# machine where this exact path exists.
os.chdir('d:/future/Index_Future_Prediction')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import optuna
import math
from datetime import datetime
import backtrader as bt


import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler, Adam, AdamW
from torch.utils.data import TensorDataset, DataLoader

# Star imports from project modules. Names used below that are not defined in this
# notebook (TimeSeriesPatcher, PatchProjection, MultiLayerEncoder, HybridDecoder,
# RandomLoader, HybridLoss) presumably come from these two modules -- TODO confirm.
from utils import *
from modules import *

import warnings
# Silence FutureWarning and UserWarning for the whole kernel session.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
class Patch_TST(nn.Module):
    """Patch Time Series Transformer.

    The input sequence is cut into (possibly overlapping) patches, every patch is
    projected to ``d_model`` and the resulting patch sequence is passed through a
    multi-layer encoder.  Two heads share that encoder:

    * ``reconstruction`` -- a linear layer used by :meth:`self_supervised` to rebuild
      masked patches (pre-training);
    * ``output`` -- flatten + linear + dropout + ``HybridDecoder`` used by
      :meth:`forward` (prediction).

    ``TimeSeriesPatcher``, ``PatchProjection``, ``MultiLayerEncoder`` and
    ``HybridDecoder`` are not defined in this file; their exact tensor contracts
    must be checked against their own definitions.

    Args:
        input_size: Number of input features; the reconstruction head emits
            ``input_size * patch_size`` values per patch.
        seq_len: Length of the input sequence, used to derive ``num_patch``.
        patch_size: Length of one patch.
        stride: Step between the starts of two consecutive patches.
        num_layer: Number of encoder layers.
        num_head: Number of attention heads passed to the encoder.
        d_model: Width of the patch embedding.
        masking_ratio: Probability that a patch is selected as a reconstruction
            target during self-supervised pre-training.
        mask_expand_size: Number of neighbouring patches hidden on each side of
            every target patch.
        dropout_1: Dropout passed to the patch projection.
        dropout_2: Dropout passed to the encoder.
        dropout_3: Dropout applied in the output head.

    Raises:
        ValueError: If ``stride`` is not positive or ``seq_len`` is shorter than
            ``patch_size`` (no patch could be formed).
    """
    def __init__(self, input_size, seq_len, patch_size, stride, num_layer, num_head, d_model, masking_ratio, mask_expand_size, dropout_1, dropout_2, dropout_3):
        super().__init__()
        # Fail early on a geometry that cannot produce a single patch instead of
        # building a degenerate (or crashing) model further down.
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")
        if seq_len < patch_size:
            raise ValueError(f"seq_len ({seq_len}) must be >= patch_size ({patch_size})")

        # Not read anywhere inside this class; kept so external readers of the
        # attribute keep working.
        self.device = 'cuda:0'
        self.input_size = input_size
        self.patch_size = patch_size
        self.stride = stride
        self.masking_ratio = masking_ratio
        self.mask_expand_size = mask_expand_size

        # Number of patches a sliding window of patch_size / stride yields.
        self.num_patch = int(np.floor((seq_len - patch_size) / stride) + 1)

        # Split the sequence into sub-sequences (patches) first.
        self.patch = TimeSeriesPatcher(patch_size, stride)

        self.projection = PatchProjection(input_size, patch_size, d_model = d_model, dropout = dropout_1)

        self.encoder = MultiLayerEncoder(dim_feature = d_model, dim_sequence = self.num_patch, num_enc_layer = num_layer, num_head = num_head, num_ffn_hidden = d_model*2, dropout = dropout_2)

        # Pre-training head: one flattened patch per encoded patch.
        self.reconstruction = nn.Linear(d_model, input_size * patch_size)

        # Prediction head: all encoded patches are flattened into one state vector.
        self.output = nn.Sequential(
            nn.Flatten(start_dim = -2),
            nn.Linear(self.num_patch * d_model, self.num_patch * d_model),
            nn.Dropout(dropout_3),
            HybridDecoder(dim_state = self.num_patch * d_model, init_prob = [0.0,0.0,0.0])
        )

    def self_supervised(self, x):
        """Masked-patch reconstruction step for self-supervised pre-training.

        Without overlapping patches, patterns that straddle a patch boundary cannot
        be learned.  With overlapping patches the model could "peek" at a masked
        patch through its neighbours, so two masks are used:

        * the *target mask* marks the patches that are actually reconstructed;
        * the *input mask* is the target mask widened by ``mask_expand_size``
          patches on both sides and marks what is zeroed in the input.  For
          example, with 50% overlap hiding one extra patch on each side fully
          removes the leaked information.

        Args:
            x: Input batch; its first dimension is the batch size.  The patcher
                output is assumed to be indexable as ``(batch, num_patch, ...)``
                -- TODO confirm against ``TimeSeriesPatcher``.

        Returns:
            Tuple ``(x_reconstructed, x_target)`` holding, for the target patches
            only, the reconstruction-head output and the original patch values.
        """
        device = x.device
        batch_size = x.shape[0]

        noise = torch.rand(size=(batch_size, self.num_patch), device=device)
        target_mask = noise < self.masking_ratio

        # Guarantee at least one target patch per sample: rows where the random
        # draw selected nothing get one uniformly chosen patch.  Done on-device
        # and vectorised, so no per-row Python loop or host synchronisation.
        rows_without_target = ~target_mask.any(dim=1)
        fallback_idx = torch.randint(0, self.num_patch, (batch_size,), device=device)
        fallback_mask = F.one_hot(fallback_idx, num_classes=self.num_patch).bool()
        target_mask = target_mask | (fallback_mask & rows_without_target.unsqueeze(1))

        # Widen the target mask by mask_expand_size patches on each side: a
        # convolution with an all-ones kernel is > 0 wherever any patch inside the
        # window is a target.
        target_mask_float = target_mask.float().unsqueeze(1)
        kernel_size = 2 * self.mask_expand_size + 1
        kernel = torch.ones(1, 1, kernel_size, device=device)
        expanded_mask_float = F.conv1d(target_mask_float, kernel, padding=self.mask_expand_size)
        input_mask = (expanded_mask_float > 0).squeeze(1)

        x_patched = self.patch(x)
        # Zero every patch covered by the (expanded) input mask.
        x_masked = torch.where(input_mask.unsqueeze(-1), 0.0, x_patched)
        x_projected = self.projection(x_masked)
        x_encodered = self.encoder(x_projected)

        # Only the target patches contribute to the reconstruction objective.
        x_reconstructed = self.reconstruction(x_encodered[target_mask])
        x_target = x_patched[target_mask]

        return x_reconstructed, x_target

    def forward(self, x):
        """Run the prediction path: patch -> projection -> encoder -> output head."""
        x_patched = self.patch(x)
        x_projected = self.projection(x_patched)
        x_encodered = self.encoder(x_projected)
        output = self.output(x_encodered)
        return output

In [None]:
def objective(trial):
    """Optuna objective: pre-train and fine-tune one ``Patch_TST`` configuration.

    Stage 1 trains the model for 30 epochs on masked-patch reconstruction
    (``Patch_TST.self_supervised`` with an MSE loss).  Stage 2 trains the same
    model for 10 epochs on the prediction task with ``HybridLoss``.  Both stages
    share one optimizer and one ``StepLR`` scheduler, so the learning rate keeps
    decaying by ``gamma`` per epoch across all 40 epochs.

    The optimizer uses two parameter groups: the ``model.output`` head gets
    ``learning_rate_output`` / ``weight_decay_output``, every other parameter gets
    ``learning_rate_encoder`` / ``weight_decay_encoder``.  (Previously the grouped
    optimizer was overwritten by a single-group one, so the two ``*_output``
    hyperparameters were sampled but had no effect.)

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean test loss over the last 3 fine-tuning epochs (to be minimised).
    """
    device = 'cuda:0'

    # Hyperparameters to tune.  The names are persisted in the Optuna storage.
    seq_len = trial.suggest_int("seq_len", 60, 250)
    patch_size = trial.suggest_int("patch_size", 5, 30)
    num_layer = trial.suggest_categorical('num_layer', [1,2,3,4,5])
    num_head = trial.suggest_categorical('num_head', [4,8,16])
    d_model = trial.suggest_categorical('d_model', [32, 64, 128, 256])

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    dropout_1 = trial.suggest_float("dropout_1", 0.0, 0.5)
    dropout_2 = trial.suggest_float("dropout_2", 0.0, 0.5)
    dropout_3 = trial.suggest_float("dropout_3", 0.0, 0.5)

    learning_rate_encoder = trial.suggest_float("learning_rate_encoder", 1e-6, 1e-2, log=True)
    weight_decay_encoder = trial.suggest_float("weight_decay_encoder", 1e-7, 1e-3, log=True)
    learning_rate_output = trial.suggest_float("learning_rate_output", 1e-6, 1e-2, log=True)
    weight_decay_output = trial.suggest_float("weight_decay_output", 1e-7, 1e-3, log=True)

    alpha = trial.suggest_float("alpha", 1e-2, 1e-1, log=True)
    delta = trial.suggest_float("delta", 1.0, 1.3)
    gamma = trial.suggest_float("gamma", 0.7, 1, log=True)

    # Values derived from the sampled hyperparameters.
    masking_ratio = 0.2
    mask_expand_size = trial.suggest_categorical('mask_expand_size', [1, 2, 3])
    # Stride is tied to the mask expansion: the larger the expansion, the smaller
    # the stride (i.e. the larger the overlap between neighbouring patches).
    stride = math.ceil(patch_size / (mask_expand_size + 1))

    model = Patch_TST(input_size = 10,
                    seq_len = seq_len,
                    patch_size = patch_size,
                    stride = stride,
                    num_layer = num_layer,
                    num_head = num_head,
                    d_model = d_model,
                    masking_ratio = masking_ratio,
                    mask_expand_size = mask_expand_size,
                    dropout_1 = dropout_1,
                    dropout_2 = dropout_2,
                    dropout_3 = dropout_3,
                    ).to(device)

    # Two parameter groups: the prediction head and everything else (patching,
    # projection, encoder, reconstruction head).
    output_params = list(model.output.parameters())
    output_param_ids = {id(p) for p in output_params}
    other_params = [p for p in model.parameters() if id(p) not in output_param_ids]
    optimizer = torch.optim.AdamW([
        {'params': other_params, 'lr': learning_rate_encoder, 'weight_decay': weight_decay_encoder},
        {'params': output_params, 'lr': learning_rate_output, 'weight_decay': weight_decay_output},
    ])
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

    assets_list = ['IH.CFX', 'IF.CFX', 'IC.CFX', 'AU.SHF', 'JM.DCE','RB.SHF','HC.SHF', 'I.DCE', 'M.DCE', 'CF.ZCE',]
    feature_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
    label_columns = ['label_return','down_prob','middle_prob','up_prob']

    def build_loaders():
        """Read every asset CSV and return ``(train_loader, test_loader)``."""
        features, labels = [], []
        for asset_code in assets_list:
            frame = pd.read_csv(f'data/{asset_code}.csv')
            # Rows with trade_date >= 20230901 are held out and never used here.
            frame = frame[frame['trade_date'] < 20230901]
            features.append(torch.tensor(frame[feature_columns].values, dtype = torch.float32, device = device))
            labels.append(torch.tensor(frame[label_columns].values, dtype = torch.float32, device = device))

        # torch.stack needs every asset to contribute the same number of rows.
        feature = torch.stack(features, dim = 1)   # (time, asset, feature)
        label = torch.stack(labels, dim = 1)       # (time, asset, label)
        # Sliding windows of length seq_len -> (window, asset, seq_len, feature).
        feature = feature.unfold(dimension = 0, size = seq_len, step = 1).transpose(2,3)
        # Each window is paired with the label row of its last time step.
        label = label[seq_len-1:]
        # NOTE(review): slice_size presumably gives the train/test fractions and
        # balance a per-split balancing flag -- confirm against RandomLoader.
        loader_factory = RandomLoader(feature, label)
        return loader_factory(batch_size=batch_size, slice_size=[0.8,0.19], balance=[True, True])

    def run_epoch(compute_loss, train_loader, test_loader, encoder_in_eval_mode):
        """Run one training pass and one evaluation pass; return both mean losses."""
        model.train()
        if encoder_in_eval_mode:
            # Switches projection/encoder to eval mode during training; their
            # parameters are still updated by the optimizer.
            model.projection.eval()
            model.encoder.eval()
        train_losses = []
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            loss = compute_loss(batch_x, batch_y)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        model.eval()
        test_losses = []
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                test_losses.append(compute_loss(batch_x, batch_y).item())
        return np.mean(train_losses), np.mean(test_losses)

    def fit(compute_loss, train_loader, test_loader, epochs, tail, encoder_in_eval_mode):
        """Train for ``epochs`` epochs, plot both curves, return the mean of the last ``tail`` test losses."""
        train_losses = []
        test_losses = []
        for _ in tqdm.tqdm(range(epochs)):
            train_loss, test_loss = run_epoch(compute_loss, train_loader, test_loader, encoder_in_eval_mode)
            train_losses.append(train_loss)
            test_losses.append(test_loss)
            scheduler.step()
        plt.plot(range(epochs), train_losses, label='train')
        plt.plot(range(epochs), test_losses, label='test')
        plt.legend()
        plt.show()
        return np.mean(test_losses[-tail:])

    # ---- Stage 1: self-supervised pre-training (masked-patch reconstruction) ----
    train_loader, test_loader = build_loaders()
    mse_loss = nn.MSELoss()

    def reconstruction_loss(batch_x, batch_y):
        x_reconstructed, x_target = model.self_supervised(batch_x)
        return mse_loss(x_reconstructed, x_target)

    fit(reconstruction_loss, train_loader, test_loader, epochs=30, tail=10, encoder_in_eval_mode=False)

    # ---- Stage 2: supervised training of the prediction path ----
    # The loaders are rebuilt (as the original code did), so RandomLoader is
    # invoked a second time for this stage.
    train_loader, test_loader = build_loaders()
    hybrid_loss = HybridLoss(alpha = alpha, delta = delta, show_loss = False)

    def prediction_loss(batch_x, batch_y):
        return hybrid_loss(model(batch_x), batch_y)

    final_loss = fit(prediction_loss, train_loader, test_loader, epochs=10, tail=3, encoder_in_eval_mode=True)
    return final_loss


In [None]:
# Create the study, or resume it when a study named "patchtst_all" already exists
# in the SQLite storage, then run 10 more trials of `objective`.
study = optuna.create_study(
    direction="minimize",
    study_name="patchtst_all",
    storage="sqlite:///data/db.sqlite3_all",  # persisted to a SQLite file
    load_if_exists=True # load the study of the same name if it already exists
)

study.optimize(objective, n_trials=10)
# The objective returns a loss and the study minimises it, so best_value is the
# lowest loss found -- not an accuracy.
print("最佳损失: ", study.best_value)
print("最佳超参数: ", study.best_params)
df = study.trials_dataframe()
# Sorted by objective value in descending order: the best (lowest-loss) trials are
# at the bottom of the table.
df.sort_values(by='value', ascending=False)

In [None]:
# Re-display the trials table sorted by objective value, highest value first.
df.sort_values(by='value', ascending=False)

In [None]:
# Plot the objective value of the study's trials over the course of the optimisation.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [None]:
# Plot the estimated importance of each hyperparameter for the objective value.
fig = optuna.visualization.plot_param_importances(study)
fig.show()