现在，我们可以搭建同时建模截面关系和时序关系的复合transformer架构了

我们需要重新定义encoder层，现在encoder层需要两次注意力关注，一次在时序上，关注本资产的前后序列；另一次在截面上，关注同时期的其他资产。

为什么不在整个回望窗口内做一次全局注意力呢？因为复杂度问题：假设资产数是 $M$、时间步数是 $N$，全局注意力要在 $MN$ 个位置之间两两计算，开销是 $O((MN)^2)$；而分别做一次时序注意力和一次截面注意力的开销只有 $O(MN^2 + NM^2) = O(MN(M+N))$。

而时序和截面相当于进行了两次稀疏注意力，且都具有比较强的可解释性：分析一个时间点的信息时，看一看本资产的前后序列和同一时间点的其他资产，肯定比看不同时间的不同资产更重要吧？

那如果确实有滞后信息需要传递呢？假设真的存在某种滞后关系，例如资产A的价格波动会在10天之后传导到B，这种机制也会被多层注意力捕获，因为我们的encoder层是多层堆叠的。第一步（时序注意力）A的波动会传导到10天后的A，第二步（截面注意力）则会从10天后的A传导到10天后的B，从而完成这种滞后效应的建模。

In [1]:
import os
os.chdir('d:/future/Index_Future_Prediction')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import optuna
import math

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler, Adam, AdamW
from torch.utils.data import TensorDataset, DataLoader

from utils import *
from modules import *

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
from modules.attention import MultiHeadAttention
from modules.addnorm import AddNorm
from modules.ffn import PositionWiseFFN

class PanelEncoderBlock(nn.Module):
    """
    One encoder layer for panel data.

    The layer applies, in order: attention along the time axis, attention
    along the asset (cross-section) axis, and a position-wise feed-forward
    network. Each of the three sub-layers is followed by the block's
    ``AddNorm`` module.

    Args:
        d_model: Feature size handed to the attention, AddNorm and FFN modules.
        num_head: Number of heads handed to ``MultiHeadAttention``.
        num_ffn_hidden: Hidden width handed to ``PositionWiseFFN``.
        dropout: Dropout rate handed to ``AddNorm``.
    """
    def __init__(self, d_model, num_head, num_ffn_hidden, dropout):
        super().__init__()
        # Attention along the time axis (one asset, many time steps).
        self.time_series_attention = MultiHeadAttention(d_model, num_head)
        # Attention along the asset axis (one time step, many assets).
        self.cross_section_attention = MultiHeadAttention(d_model, num_head)
        # Residual add + normalisation. This single instance is reused after
        # every sub-layer, so its parameters are shared between them.
        # NOTE(review): normalized_shape=(d_model, d_model) looks unusual for
        # inputs whose last dimension is d_model -- confirm against AddNorm.
        self.addnorm = AddNorm(normalized_shape=(d_model, d_model), dropout=dropout)
        # Feed-forward network that mixes information inside each position.
        self.ffn = PositionWiseFFN(d_model, num_ffn_hidden, d_model)

    def forward(self, x, mask=None):
        """
        Run the three sub-layers on ``x``.

        Args:
            x: Tensor documented by the author as
                (batch_size, num_assets, seq_len or num_patch, d_model).
            mask: Optional mask forwarded unchanged to both attention modules.
                NOTE(review): the attended axis differs between the two calls
                (time vs. assets), so one mask layout may not suit both --
                confirm against MultiHeadAttention.

        Returns:
            Tensor with the same axis order as ``x``.
        """
        # Per the author's note, MultiHeadAttention treats the second-to-last
        # axis as the attention range and flattens the leading axes, so with
        # the input layout above this call attends over time.
        time_series_attention_out = self.time_series_attention(x, x, x, mask)
        x = self.addnorm(x, time_series_attention_out)

        # Swap the asset and time axes so the asset axis becomes the
        # second-to-last one, i.e. the attention range is the cross-section.
        x = x.permute(0, 2, 1, 3)
        cross_section_attention_out = self.cross_section_attention(x, x, x, mask)
        x = self.addnorm(x, cross_section_attention_out)
        # Restore the original axis order.
        x = x.permute(0, 2, 1, 3)

        # Finally let the FFN consolidate the information at each position.
        # (The original code applied addnorm with ``ffn_out`` before it was
        # computed, which raised UnboundLocalError on every call.)
        ffn_out = self.ffn(x)
        x = self.addnorm(x, ffn_out)
        return x


In [None]:
class MultiLayerPanelEncoder(nn.Module):
    """
    Stack of ``num_layer`` PanelEncoderBlock modules applied one after another.

    Every block is built with the same ``d_model``, ``num_head``,
    ``num_ffn_hidden`` and ``dropout``.
    """
    def __init__(self, num_layer, d_model, num_head, num_ffn_hidden, dropout):
        super().__init__()
        block_kwargs = dict(
            d_model=d_model,
            num_head=num_head,
            num_ffn_hidden=num_ffn_hidden,
            dropout=dropout,
        )
        blocks = []
        for _ in range(num_layer):
            blocks.append(PanelEncoderBlock(**block_kwargs))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, mask=None):
        """Feed ``x`` through every block in order, passing ``mask`` to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

相比于预测单个资产，现在我们要预测一组资产

In [None]:
class PortfolioExpand(nn.Module):
    """
    Augment a panel of assets with random long/short portfolios.

    Each synthetic portfolio is a linear combination of the original assets.
    The weights are drawn uniformly from [-1, 1) and rescaled so that their
    absolute values sum to 1 along the asset axis, which mimics a fully
    invested long/short portfolio. Fresh weights are drawn on every call.

    Args:
        expand_dim: Number of synthetic portfolios appended per sample.
    """
    def __init__(self, expand_dim):
        super().__init__()
        self.expand_dim = expand_dim

    def _portfolio_weights(self, batch_size, num_assets, device, dtype=None):
        """
        Draw a (batch_size, expand_dim, num_assets) weight tensor whose
        absolute values sum to 1 along the last axis.

        ``dtype`` defaults to torch's default dtype when left as ``None``.
        """
        weights = 2 * torch.rand(batch_size, self.expand_dim, num_assets, device=device, dtype=dtype) - 1
        l1_norms = torch.sum(torch.abs(weights), dim=-1, keepdim=True)
        # The small epsilon guards against a division by zero.
        normalized_weights = weights / (l1_norms + 1e-16)
        return normalized_weights

    def forward(self, x, y):
        """
        Append ``expand_dim`` random portfolios to features and labels.

        Args:
            x: (batch_size, num_assets, seq_len, feature_dim)
            y: (batch_size, num_assets, label_dim)

        Returns:
            Tuple ``(expanded_x, expanded_y)`` with shapes
            (batch_size, num_assets + expand_dim, seq_len, feature_dim) and
            (batch_size, num_assets + expand_dim, label_dim). The original
            assets come first, followed by the synthetic portfolios.

        Raises:
            ValueError: If ``x`` is not 4-D, ``y`` is not 3-D, or their first
                two dimensions differ.
        """
        # Validate the input ranks and the shared leading dimensions.
        if x.dim() != 4 or y.dim() != 3:
            raise ValueError(f"输入维度错误! x应为4维, y应为3维, "
                             f"但实际为 x: {x.dim()}维, y: {y.dim()}维")
        if x.shape[0] != y.shape[0] or x.shape[1] != y.shape[1]:
            raise ValueError("x 和 y 的 batch_size 和 num_assets 维度必须匹配")

        batch_size, num_assets, seq_len, feature_dim = x.shape
        device = x.device
        # Create the weights in the dtype of ``x``: einsum does not promote
        # mixed dtypes, so float32 weights would fail on float64 inputs.
        weight_dtype = x.dtype if x.is_floating_point() else None
        weights = self._portfolio_weights(batch_size, num_assets, device, dtype=weight_dtype)
        # Only cast for floating labels; casting to an integer dtype would
        # silently truncate the weights to zero.
        y_weights = weights.to(y.dtype) if y.is_floating_point() else weights

        # einsum contracts over the asset axis (equivalent to a batched matmul).
        expanded_x = torch.einsum('ben,bnsf->besf', weights, x)
        expanded_y = torch.einsum('ben,bnl->bel', y_weights, y)

        # Concatenate base assets and synthetic portfolios along the asset axis.
        output_x = torch.cat([x, expanded_x], dim=1)
        output_y = torch.cat([y, expanded_y], dim=1)

        return output_x, output_y

为什么不选用经典transformer的加性位置编码呢？原因有两个，加性位置编码的优点是节省维度，但缺点是模型需要首先学会从汇总的信息中分离位置信息和原始信息。

但是我们的时序预测特征维度并不高，不需要节省维度；反而我们的数据量是不足的，没必要浪费额外的成本让模型去学习如何分离位置信息，因此采用 concat（拼接）形式的位置编码更好。

In [None]:
import torch
import torch.nn as nn
import math

class TemporalEmbedding(nn.Module):
    """
    Time2Vec-style temporal encoding that is concatenated to the input.

    Input shape:  (*, seq_len, d_model)
    Output shape: (*, seq_len, d_model + dim_embedding)
    """
    def __init__(self, dim_embedding):
        super().__init__()
        self.dim_embedding = dim_embedding

        # Learnable frequency (w) and phase (b), one pair per embedding channel.
        # The first channel is used linearly, the remaining ones go through sin.
        self.w = nn.Parameter(torch.empty(1, dim_embedding), requires_grad=True)
        self.b = nn.Parameter(torch.empty(1, dim_embedding), requires_grad=True)

        # Small uniform initialisation for both parameters.
        for param in (self.w, self.b):
            nn.init.uniform_(param, -0.1, 0.1)

    def forward(self, x):
        """
        Append the temporal encoding to ``x`` along the last axis.

        Args:
            x (torch.Tensor): Tensor of shape (*, seq_len, d_model).

        Returns:
            torch.Tensor: Tensor of shape (*, seq_len, d_model + dim_embedding).
        """
        seq_len = x.shape[-2]
        lead_dims = x.shape[:-2]

        # Relative step index 0 .. seq_len-1 as a (seq_len, 1) column.
        steps = torch.arange(seq_len, dtype=torch.float, device=x.device)[:, None]

        # f(tau) = w * tau + b  ->  (seq_len, dim_embedding)
        phase = torch.matmul(steps, self.w) + self.b

        # Channel 0 stays linear; the remaining channels are periodic.
        encoding = torch.cat([phase[:, :1], torch.sin(phase[:, 1:])], dim=-1)

        # Broadcast the encoding over all leading (batch-like) axes.
        encoding = encoding.expand(lead_dims + (seq_len, self.dim_embedding))

        return torch.cat([x, encoding], dim=-1)


Pre_train:
x 依次通过 patch projection mask -> 分支1(masked x) -> temporal_embedding assets_embedding portfolio_expand(expand = 0) encoder reconstruction ->
                                -> 分支2(target x) -> 
train:
x 依次通过 patch projection  positional_encoder assets_embedding portfolio_expand(expand != 0) encoder output
y 依次通过 portfolio_expand(expand != 0)

eval:

上述模块中 projection encoder reconstruction output 是可学习的，其他的不是。

In [None]:
class Panel_Transformer(nn.Module):
    """
    Panel time-series transformer with a masked-patch pre-training head.

    ``self_supervised`` masks patches, encodes the panel and reconstructs the
    masked patches; ``forward`` encodes the panel and feeds it to the output
    head.

    Args:
        input_size: Number of raw features per time step.
        seq_len: Length of the look-back window.
        patch_size: Number of time steps per patch.
        stride: Step between consecutive patches.
        num_layer: Number of stacked encoder blocks.
        num_head: Number of attention heads per block.
        d_model: Model width after the patch projection.
        masking_ratio: Probability that a patch becomes a reconstruction target.
        mask_expand_size: Number of neighbouring patches on each side of a
            target that are also hidden from the input.
        dropout_1: Accepted for interface compatibility; not used in this class.
        dropout_2: Dropout rate handed to the encoder.
        dropout_3: Dropout rate used in the output head.
    """
    def __init__(self, input_size, seq_len, patch_size, stride, num_layer, num_head, d_model, masking_ratio, mask_expand_size, dropout_1, dropout_2, dropout_3):
        super().__init__()
        # Model hyper-parameters.
        self.device = 'cuda:0'
        self.input_size = input_size
        self.patch_size = patch_size
        self.stride = stride
        self.masking_ratio = masking_ratio
        self.mask_expand_size = mask_expand_size
        self.num_patch = int(np.floor((seq_len - patch_size) / stride) + 1)

        # Front-end layers.
        self.patch = TimeSeriesPatcher(patch_size, stride)
        self.projection = nn.Linear(input_size * patch_size, d_model)
        # NOTE(review): positional_encoding, assets_embedding and
        # portfolio_expand are constructed here but are not called by
        # self_supervised() or forward() below.
        self.positional_encoding = PositionalEncoding(d_model = d_model)

        self.assets_embedding = nn.Embedding(num_embeddings = 53, embedding_dim = 12, _freeze = True)
        # map_location='cpu' lets a checkpoint that was saved from a CUDA
        # tensor load on CPU-only hosts; load_state_dict copies the values into
        # the existing parameter, so the result is the same on GPU machines.
        self.assets_embedding.load_state_dict(torch.load('params/assets_embedding.params', map_location='cpu'))
        self.portfolio_expand = PortfolioExpand(expand_dim = 10)
        self.encoder = MultiLayerPanelEncoder(num_layer = num_layer, d_model = d_model, num_head = num_head, num_ffn_hidden = d_model * 2, dropout = dropout_2)

        # Design note from the author: the asset embedding is meant to be
        # applied after the projection. First, the encoder then receives the
        # raw embedding instead of one scrambled by a not-yet-trained
        # projection. Second, each patch is embedded once, which avoids the
        # redundancy of embedding every daily step inside the same patch.

        # Pre-training head: maps an encoded patch back to its raw values.
        self.reconstruction = nn.Linear(d_model, input_size * patch_size)

        # Prediction head.
        self.output = nn.Sequential(
            nn.Flatten(start_dim = -2),
            nn.Linear(self.num_patch * d_model, self.num_patch * d_model),
            nn.Dropout(dropout_3),
            HybridDecoder(dim_state = self.num_patch * d_model, init_prob = [0.0,0.5,0.0])
        )


    def self_supervised(self, x):
        """
        Masked-patch reconstruction used for self-supervised pre-training.

        Masking is sampled independently for every (sample, asset, patch)
        position. Target patches and their neighbours within
        ``mask_expand_size`` are zeroed in the input; only the target patches
        are reconstructed.

        Args:
            x: Input whose first two axes are (batch, assets); the author
                documents the full layout as (B, A, S, F).

        Returns:
            Tuple ``(x_reconstructed, x_target)``: the reconstructed and the
            original patch values at the target positions, flattened over
            those positions.
        """
        device = x.device
        batch_size = x.shape[0]
        num_assets = x.shape[1]

        # Sample reconstruction targets with shape (B, A, P).
        noise = torch.rand(size=(batch_size, num_assets, self.num_patch), device=device)
        target_mask = noise < self.masking_ratio

        # Fallback: make sure every sample has at least one target patch
        # (across all of its assets); otherwise pick one at random.
        if not target_mask.view(batch_size, -1).any(dim=1).all():
            for i in range(batch_size):
                if not target_mask[i].any():
                    fallback_asset_idx = torch.randint(0, num_assets, (1,)).item()
                    fallback_patch_idx = torch.randint(0, self.num_patch, (1,)).item()
                    target_mask[i, fallback_asset_idx, fallback_patch_idx] = True

        # F.conv1d needs a 3-D input (N, C_in, L_in): (B, A, P) -> (B*A, 1, P).
        target_mask_float = target_mask.float().view(batch_size * num_assets, 1, self.num_patch)

        # Convolving with a box kernel of ones dilates each target by
        # mask_expand_size patches on both sides along the patch axis.
        kernel_size = 2 * self.mask_expand_size + 1
        kernel = torch.ones(1, 1, kernel_size, device=device)
        padding = self.mask_expand_size
        expanded_mask_float = F.conv1d(target_mask_float, kernel, padding=padding)

        # Back to a boolean mask of shape (B, A, P).
        input_mask = (expanded_mask_float > 0).squeeze(1).view(batch_size, num_assets, self.num_patch)

        # Per the author's note: (B, A, S, F) -> (B, A, P, patch_size * F).
        x_patched = self.patch(x)

        # (B, A, P, 1) broadcasts over the flattened patch values.
        reshape_mask = input_mask.unsqueeze(-1)
        x_masked = torch.where(reshape_mask, 0.0, x_patched)

        x_projected = self.projection(x_masked)
        x_encodered = self.encoder(x_projected)  # -> (B, A, P, d_model)

        # Boolean indexing with the (B, A, P) target mask selects every target
        # position and flattens them into one leading axis.
        x_pre_reconstruction = x_encodered[target_mask]
        x_reconstructed = self.reconstruction(x_pre_reconstruction)
        x_target = x_patched[target_mask]

        return x_reconstructed, x_target

    def forward(self, x):
        """Patch, project and encode ``x``, then apply the prediction head."""
        x_patched = self.patch(x)
        x_projected = self.projection(x_patched)
        x_encodered = self.encoder(x_projected)
        output = self.output(x_encodered)
        return output

在数据处理层，有很大不同：因为我们现在一次输入的是一组资产，不能再以某一个资产的涨跌来进行训练均衡了。我们直接调用原生的 dataset 和 dataloader。

In [None]:
# Fixed hyper-parameters
seq_len = 120
patch_size = 8
num_layer = 2
num_head = 16
d_model = 128

masking_ratio = 0.2
mask_expand_size = 1
stride = 4

# Assets that make up the panel; one CSV per asset is read from data/ below.
assets_list = ['IH.CFX', 'IF.CFX', 'IC.CFX', 'AU.SHF', 'JM.DCE','RB.SHF','HC.SHF', 'I.DCE', 'M.DCE', 'CF.ZCE',]

# Tunable hyper-parameters
batch_size = 32
dropout_1 = 0.18965831923308327
dropout_2 = 0.1430970459619855
dropout_3 = 0

learning_rate = 0.001615257095302926
weight_decay = 3.5940297438123993e-06
gamma = 0.8462706280335419


# Load the data
feature_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
label_columns = ['label_return','down_prob','middle_prob','up_prob']

feature = []
label = []
for asset_code in assets_list:
    data = pd.read_csv(f'data/{asset_code}.csv')
    data = data[data['trade_date'] < 20230901].copy() # Keep only rows with trade_date before 20230901 (author's note: data after 2023 is excluded from training)
    feature.append(torch.tensor(data[feature_columns].values, dtype = torch.float32, device = 'cuda:0'))
    label.append(torch.tensor(data[label_columns].values, dtype = torch.float32, device = 'cuda:0'))

# Stack per-asset tensors along a new axis 1 -> (time, assets, columns).
# torch.stack requires every asset to have the same number of rows.
# NOTE(review): rows are not aligned on trade_date here -- confirm the CSVs share a calendar.
feature = torch.stack(feature, dim = 1)
label = torch.stack(label, dim = 1)
# unfold appends the window axis last -> (windows, assets, columns, seq_len);
# the transpose moves it in front of the columns -> (windows, assets, seq_len, columns).
feature = feature.unfold(dimension = 0, size = seq_len, step = 1).transpose(2,3)
# Pair each window with the label of its last row.
label = label[seq_len-1:]

# Note: ``data`` is rebound here from the last DataFrame to the loader factory.
data = RandomLoader(feature, label)
train_loader, test_loader = data(batch_size=batch_size, slice_size=[0.7,0.29], balance=[True, True])

torch.Size([1984, 10, 120, 10])


In [4]:
# Peek at the first training batch and print the shape of its features.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, y = first_batch
    print (x.shape)

torch.Size([32, 120, 10])


In [None]:

# Reconstruction loss for the masked-patch pre-training task.
loss_fn = nn.MSELoss()
# NOTE(review): Patch_TST is not defined in this notebook section, while
# Panel_Transformer above takes exactly these keyword arguments -- confirm
# which model is intended here.
model = Patch_TST(input_size = 10,
                    seq_len = seq_len,
                    patch_size = patch_size,
                    stride = stride,
                    num_layer = num_layer, 
                    num_head = num_head,
                    d_model = d_model,
                    masking_ratio = masking_ratio,
                    mask_expand_size = mask_expand_size,
                    dropout_1 = dropout_1,
                    dropout_2 = dropout_2,
                    dropout_3 = dropout_3,
                    ).to('cuda:0')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
# Multiply the learning rate by ``gamma`` on every scheduler.step() call.
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)


def epoch():
    """
    Run one training pass followed by one evaluation pass of the
    masked-reconstruction task.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``,
    ``train_loader`` and ``test_loader``. Labels in the batches are ignored.

    Returns:
        Tuple ``(mean_train_loss, mean_test_loss)`` averaged over batches.
    """
    def reconstruction_loss(inputs):
        # Loss between reconstructed and original values of the masked patches.
        predicted, target = model.self_supervised(inputs)
        return loss_fn(predicted, target)

    model.train()
    train_losses = []
    for inputs, _ in train_loader:
        optimizer.zero_grad()
        loss = reconstruction_loss(inputs)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_losses = [reconstruction_loss(inputs).item() for inputs, _ in test_loader]

    return np.mean(train_losses), np.mean(test_losses)

def train(epochs = 30):
    """
    Train for ``epochs`` epochs, plot both loss curves and return the mean
    test loss over the last ten epochs (or fewer, if fewer were run).

    Calls the module-level ``epoch()`` once per epoch and steps the
    module-level ``scheduler`` after each one.
    """
    history = {'train': [], 'test': []}
    for _ in tqdm.tqdm(range(epochs)):
        epoch_train_loss, epoch_test_loss = epoch()
        history['train'].append(epoch_train_loss)
        history['test'].append(epoch_test_loss)
        scheduler.step()

    # Train curve first, test curve second, on the same axes.
    epoch_axis = range(epochs)
    for curve in (history['train'], history['test']):
        plt.plot(epoch_axis, curve)
    plt.show()

    return np.mean(history['test'][-10:])

# Run 30 epochs of pre-training; train() returns the mean test loss of the last ten epochs.
final_loss = train(30)
print(final_loss)
# Save the pre-trained weights.
torch.save(model.state_dict(), 'params/self_supervised_1.params')