二、训练模式

由于金融数据中噪声的占比过大，模型在学习到有效特征之前，很容易被噪声影响。其中对结果影响最大的，就是训练数据的选择范围。

由于时序数据必须强制保证训练集在前、测试集在后，我们通过固定测试集、验证集大小，随机选择训练集终点（即划分点）的方式，给训练过程注入随机性，防止出现模型过拟合单一数据集、无法迁移到其他数据上的问题。

In [23]:
import numpy as np
import pandas as pd
import torch

# Load the market-state table, using the CSV's 'Unnamed: 0' column as the
# index, and drop every row that contains a missing value.
data = pd.read_csv('data/market_state.csv', index_col='Unnamed: 0')
data.dropna(inplace=True)

# Feature list: the base columns first, then one group of derived columns
# per index name (only 'ic' here).
feature_column = ['near_maturity','on', 'm1', 'y1', 'y10',]
for index in ['ic']:
    feature_column.extend([
        f'change_{index}',
        f'amount_{index}',
        f'near_discount_{index}',
        f'far_discount_{index}',
        f'%_diff_{index}_1',
        f'%_diff_{index}_5',
        f'%_diff_{index}_20',
        f'%_diff_{index}_60',
        f'MACD_{index}',
        f'RS_{index}',
        f'RSI_{index}',
        f'ATR_{index}',
    ])

# Z-score every feature column in place.
# NOTE(review): mean/std are taken over the whole table, i.e. they also see
# the rows that are later used as test/validation data.
for i in feature_column:
    column = data[i]
    data[i] = (column - column.mean()) / column.std()

# Independent copy holding only the (now standardised) feature columns.
feature = data[feature_column].copy()

# Number of consecutive rows that make up one input sample.
hist_len = 30
# Number of feature columns per row.
INPUT_SIZE = len(feature_column)

# One window per start row. sliding_window_view appends the window axis last,
# giving (num_windows, num_features, hist_len); transpose(1, 2) swaps the last
# two axes so each sample is (hist_len, num_features).
windows = np.lib.stride_tricks.sliding_window_view(
    feature,
    window_shape = hist_len,
    axis = 0,
    writeable=True,
)
x = torch.tensor(windows, dtype = torch.float32, device='cuda:0').transpose(1,2)

# Labels start at row hist_len-1 (the last row of the first window), so y has
# one row per window.
label_values = data[['label_ic_ch_next_week', 'label_volatility_ic_week']][hist_len-1:].values
y = torch.tensor(label_values, dtype = torch.float32, device='cuda:0')

train_size = 500          # minimum number of training samples
test_size = 250           # fixed number of test samples
validation_size = 250     # fixed number of validation samples

# With the test and validation sizes fixed, pick the train/test boundary at
# random. Test and validation follow the boundary, so each random draw gives a
# different train/test/validation layout while keeping them in time order.
#
# The bound is derived from len(x) (the number of windows), not len(feature):
# x and y are hist_len-1 rows shorter than the raw table, so bounding by
# len(feature) could let the validation slice run past the end of x and be
# silently truncated. randint's upper bound is exclusive, hence the +1, which
# allows the validation slice to end exactly at the last window.
split = np.random.randint(train_size, len(x) - test_size - validation_size + 1)

test_end = split + test_size
validation_end = test_end + validation_size

# Slices of x / y stay on the device x / y already live on, so no extra
# device transfer is needed.
x_train = x[:split]
y_train = y[:split]

x_test = x[split:test_end]
y_test = y[split:test_end]

x_validation = x[test_end:validation_end]
y_validation = y[test_end:validation_end]

此外，在训练过程中还容易存在一个问题：当训练集数据本身具有趋势时，由于模型较为复杂，很容易导致模型记住这种趋势，从而给出有偏的预期。

例如，当随机选择的训练集处在一个下跌趋势中，训练出来的模型对绝大部分样本都会给出偏负面的预期。

因为模型记住了训练的下跌趋势，这样对于模型而言，无脑预测下跌就可以减少损失。

为了防止这种情况，我们需要在每次前向传播和反向传播的时候，保证模型学习到等量的两个方向的数据，即在每个batch内进行标签的均衡。

对此我们定义了BalanceDataLoader类，在将训练数据封装进类中之后，可以通过get_batch_data方法得到均衡好的小批量训练数据；

而在测试集上，也可以通过指定balance=False来模拟模型面对真实的有趋势的市场下的表现。

此外，实例化BalanceDataLoader时，也可以指定 discrete=True 来生成用于分类训练的离散变量

In [24]:
import numpy as np
import torch

class BalanceDataLoader():
    """Random mini-batch sampler that can balance label direction per batch.

    Samples are drawn with replacement from ``x`` / ``y`` (indexed along their
    first dimension). With ``balance=True`` a batch alternates between samples
    whose label is ``> 0`` and samples whose label is ``<= 0``, starting with a
    positive one.

    Args:
        x: Tensor of inputs, one sample per row.
        y: Tensor of labels, same first dimension as ``x``. Balanced sampling
            requires exactly one label value per sample.
        discrete: If true, ``get_batch_data`` returns indicator columns
            instead of the raw label values.
        threshold: Half-width of the neutral band used when ``discrete`` is
            true. ``0`` yields two columns, anything else three.

    NOTE(review): the discrete columns are ordered (positive, [neutral,]
    negative), whereas PredictionRecorder in this file names its columns
    (neg, abstain, pos). Confirm which order the model is expected to use.
    NOTE(review): with ``threshold == 0`` a label of exactly 0 is encoded as
    positive, while balanced sampling treats it as non-positive.
    """
    def __init__(self, x, y, discrete = False, threshold = 0):
        self.x = x
        self.y = y
        self.length = len(y)
        self.discrete = discrete
        self.threshold = threshold
        # Cache of (positive_indices, non_positive_indices); built on the
        # first balanced draw so unbalanced use never inspects the labels.
        self._direction_pools = None

    def _get_direction_pools(self):
        """Return cached index arrays for labels ``> 0`` and labels ``<= 0``."""
        if self._direction_pools is None:
            if self.y.numel() != self.length:
                raise ValueError(
                    "balanced sampling needs exactly one label value per sample"
                )
            # One vectorised comparison replaces a per-sample .item() call.
            is_positive = (self.y.reshape(-1) > 0).cpu().numpy()
            self._direction_pools = (
                np.flatnonzero(is_positive),
                np.flatnonzero(~is_positive),
            )
        return self._direction_pools

    def random_index(self, direction = None, max_attempts=1000):
        """Return a random sample index whose label has the given direction.

        Args:
            direction: Truthy selects a label ``> 0``, falsy a label ``<= 0``.
                ``None`` places no constraint on the label.
            max_attempts: Unused; kept so existing callers keep working. The
                index is drawn directly from the matching pool, so no retries
                are needed.

        Raises:
            ValueError: If no sample with the requested direction exists.
        """
        if direction is None:
            return int(np.random.randint(0, self.length))

        positive_pool, other_pool = self._get_direction_pools()
        pool = positive_pool if direction else other_pool
        if len(pool) == 0:
            raise ValueError(f"no sample with direction={direction!r} exists in y")
        return int(np.random.choice(pool))

    def get_batch_data(self, batch_size, balance = True):
        """Draw a random batch (with replacement).

        Args:
            batch_size: Number of samples in the batch.
            balance: If true, even positions hold samples with label ``> 0``
                and odd positions samples with label ``<= 0``; otherwise the
                indices are drawn uniformly.

        Returns:
            ``(batched_x, batched_y)``. ``batched_y`` holds the raw labels, or
            float indicator columns concatenated along dim 1 when
            ``discrete`` is set.

        Raises:
            ValueError: If ``balance`` is true and a required direction has no
                samples.
        """
        if balance:
            positive_pool, other_pool = self._get_direction_pools()
            n_positive = (batch_size + 1) // 2   # positions 0, 2, 4, ...
            n_other = batch_size // 2            # positions 1, 3, 5, ...
            if (n_positive and len(positive_pool) == 0) or (n_other and len(other_pool) == 0):
                raise ValueError("cannot balance the batch: one label direction has no samples")
            indexes = np.empty(batch_size, dtype=np.int64)
            indexes[0::2] = np.random.choice(positive_pool, size=n_positive)
            indexes[1::2] = np.random.choice(other_pool, size=n_other)
        else:
            indexes = np.random.randint(0, self.length, batch_size)

        # Gather the whole batch with one indexing op per tensor.
        index_x = torch.as_tensor(indexes, dtype=torch.long, device=self.x.device)
        index_y = index_x.to(self.y.device)
        batched_x = self.x[index_x]
        batched_y = self.y[index_y]

        if self.discrete:
            if self.threshold == 0:
                positive = (batched_y>=0).float()
                negative = (batched_y<0).float()
                batched_y = torch.cat((positive, negative), dim = 1)
            else:
                positive = (batched_y>=self.threshold).float()
                neutral = ((batched_y<self.threshold) & (batched_y>=-self.threshold)).float()
                negative = (batched_y<-self.threshold).float()
                batched_y = torch.cat((positive, neutral ,negative), dim = 1)
        return batched_x, batched_y

接下来，我们定义PredictionRecorder类，用于在训练过程中记录每次预测的概率和logits，并向下一个类（Animator）输送数据。

In [25]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import  confusion_matrix

class PredictionRecorder:
    """Accumulate three-class predictions and summarise them.

    Each recorded row stores the class probabilities, the class logits, the
    real (continuous) target and the arg-max predicted class. Column order is
    (neg, abstain, pos), i.e. class indices 0, 1, 2.

    Args:
        is_logits: If true, tensors passed to ``add`` are treated as logits
            and soft-maxed to get probabilities; otherwise they are treated
            as probabilities and their log is stored as the logits.
    """

    _COLUMNS = [
        'pred_neg', 'pred_abstain', 'pred_pos',
        'logit_neg', 'logit_abstain', 'logit_pos',
        'real', 'predicted_class'
    ]

    def __init__(self, is_logits = True):
        self.records = pd.DataFrame(columns=self._COLUMNS)
        self.is_logits = is_logits

    def add(self, predict: torch.Tensor, real: torch.Tensor):
        """Append one batch of predictions.

        Args:
            predict: Tensor of shape (batch_size, 3).
            real: Tensor of shape (batch_size, 1) with the real target values.

        Raises:
            ValueError: If the shapes do not match the above.
        """
        if predict.shape[0] != real.shape[0]:
            raise ValueError("预测张量和真实值张量的batch_size必须相同。")
        if predict.dim() != 2 or predict.shape[1] != 3:
            raise ValueError("预测张量的形状必须是 (batch_size, 3)。")
        if real.dim() != 2 or real.shape[1] != 1:
            raise ValueError("真实值张量的形状必须是 (batch_size, 1)。")

        if self.is_logits :
            prob = torch.softmax(predict, dim = 1).cpu().detach().numpy()
            logits = predict.cpu().detach().numpy()
        else:
            prob = predict.cpu().detach().numpy()
            # Small epsilon keeps log() finite for zero probabilities.
            logits = torch.log(predict + 1e-9).cpu().detach().numpy()

        predicted_class = torch.argmax(predict, dim=1).cpu().detach().numpy()

        new_records_df = pd.DataFrame({
            'pred_neg': prob[:, 0],
            'pred_abstain': prob[:, 1],
            'pred_pos': prob[:, 2],
            'logit_neg': logits[:, 0],
            'logit_abstain': logits[:, 1],
            'logit_pos': logits[:, 2],
            # reshape(-1) keeps a 1-D array even when batch_size == 1.
            'real': real.reshape(-1).cpu().detach().numpy(),
            'predicted_class': predicted_class,
        })
        if self.records.empty:
            # Concatenating onto the empty, untyped frame would turn every
            # column into object dtype; start from the typed batch instead.
            self.records = new_records_df
        else:
            self.records = pd.concat([self.records, new_records_df], ignore_index=True)

    def clear(self):
        """Drop all recorded rows while keeping the ``is_logits`` setting."""
        self.records = pd.DataFrame(columns=self._COLUMNS)

    def summary(self, threshold: float = 0.0) -> pd.DataFrame:
        """Build a per-class classification summary.

        Real values are bucketed as class 0 when below ``-abs(threshold)``,
        class 2 when above ``abs(threshold)`` and class 1 otherwise, then
        compared with the recorded predicted classes.

        Returns:
            A DataFrame with one row per class plus a total row. Returns an
            empty DataFrame (after printing a notice) when nothing is recorded.

        NOTE(review): the column labelled 'Accuracy (召回率)' holds recall;
        the key is left unchanged because callers may index by it.
        """
        if self.records.empty:
            print("记录为空，无法生成摘要。")
            return pd.DataFrame()

        # Bucket the real values; NaN fails both comparisons and lands in class 1.
        band = abs(threshold)
        real = self.records['real'].to_numpy(dtype=float)
        y_true = np.where(real < -band, 0, np.where(real > band, 2, 1)).astype(int)
        y_pred = self.records['predicted_class'].astype(int)

        # Rows are true classes, columns are predicted classes.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])

        results = []
        for i in range(3):
            tp = cm[i, i]
            predicted_count = cm[:, i].sum()
            true_count = cm[i, :].sum()
            precision = tp / predicted_count if predicted_count > 0 else 0
            recall = tp / true_count if true_count > 0 else 0
            # A "severe" error is predicting the opposite direction.
            severe_error = 0
            if i == 0:
                severe_error = cm[2, 0] / predicted_count if predicted_count > 0 else 0
            elif i == 2:
                severe_error = cm[0, 2] / predicted_count if predicted_count > 0 else 0
            results.append({
                '预测为该分类的个数': predicted_count,
                'Precision (精确率)': precision,
                '真实为该分类的个数': true_count,
                'Accuracy (召回率)': recall,
                'Severe (严重错误率)': severe_error
            })

        total_samples = cm.sum()
        total_correct = np.trace(cm)
        total_severe_errors = cm[2, 0] + cm[0, 2]
        overall_accuracy = total_correct / total_samples if total_samples > 0 else 0
        overall_severe_rate = total_severe_errors / total_samples if total_samples > 0 else 0
        results.append({
            '预测为该分类的个数': total_samples,
            'Precision (精确率)': overall_accuracy,
            '真实为该分类的个数': total_samples,
            'Accuracy (召回率)': overall_accuracy,
            'Severe (严重错误率)': overall_severe_rate
        })
        summary_df = pd.DataFrame(results, index=['分类 0 (负)', '分类 1 (放弃)', '分类 2 (正)', '总计'])
        return summary_df

    def distribution(self) -> tuple[float, float, float]:
        """Return the share of predictions falling in classes 0, 1 and 2.

        Returns (0.0, 0.0, 0.0) when nothing is recorded.
        """
        if self.records.empty:
            return (0.0, 0.0, 0.0)
        props = self.records['predicted_class'].value_counts(normalize=True).reindex([0, 1, 2]).fillna(0)
        return (float(props[0]), float(props[1]), float(props[2]))

    def average_score(self) -> tuple[float, float, float]:
        """Return the mean of logit_neg, logit_abstain and logit_pos over all rows.

        Returns (0.0, 0.0, 0.0) when nothing is recorded.
        """
        if self.records.empty:
            return (0.0, 0.0, 0.0)

        avg_logits = self.records[['logit_neg', 'logit_abstain', 'logit_pos']].mean()
        return (
            float(avg_logits['logit_neg']),
            float(avg_logits['logit_abstain']),
            float(avg_logits['logit_pos']),
        )

PredictionRecorder.distribution() 和 PredictionRecorder.average_score()方法用于向Animator传递prob 和 logits

In [26]:
import matplotlib.pyplot as plt
from IPython import display
import numpy as np

class Animator:
    """Live 2x3 grid of line plots for monitoring training.

    Top row: train loss / class probabilities / class logits; bottom row: the
    same for the test set. Every ``add`` call redraws the whole figure in the
    notebook output.
    """

    def __init__(self, figsize=(12, 6)):
        self.num_subplots = 6
        self.titles = ['train loss', 'train classes prob', 'train classes logits', 'test loss', 'test classes prob', 'test classes logits']
        self.reset()
        self.fig, self.axes = plt.subplots(2, 3, figsize=figsize)
        self.axes = self.axes.flatten()
        for ax, title in zip(self.axes, self.titles):
            ax.set_title(title)
            ax.grid(True)
        self.fig.tight_layout()

    def add(self, x, y, subplot_idx=0):
        """Append a data point to one subplot and redraw.

        Args:
            x: Current epoch. A scalar is reused for every series in ``y``.
            y: Value to record; pass a tuple for prob/logits to plot one line
                per element.
            subplot_idx (int): Index of the target subplot.

        Raises:
            ValueError: If ``subplot_idx`` is outside ``[0, num_subplots)``.
        """
        if subplot_idx < 0 or subplot_idx >= self.num_subplots:
            raise ValueError(f"subplot_idx must be between 0 and {self.num_subplots - 1}.")

        target_plot = self.data[subplot_idx]

        # Normalise y to a sequence of series values.
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        # Normalise x to one value per series.
        if not hasattr(x, "__len__"):
            x = [x] * n

        # The per-series lists are created on the first call for a subplot.
        if not target_plot['X']:
            target_plot['X'] = [[] for _ in range(n)]
            target_plot['Y'] = [[] for _ in range(n)]

        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                target_plot['X'][i].append(a)
                target_plot['Y'][i].append(b)

        self.draw()

    def draw(self):
        """Redraw every subplot from the stored data."""
        display.clear_output(wait=True)
        fmts = ('-', 'm--', 'g-.', 'r:')
        for ax, title, plot_data in zip(self.axes, self.titles, self.data):
            ax.cla()
            # cla() also wipes the title and grid, so restore them each time.
            ax.set_title(title)
            ax.grid(True)
            for j, (xs, ys) in enumerate(zip(plot_data['X'], plot_data['Y'])):
                ax.plot(xs, ys, fmts[j % len(fmts)])
            # Only build a legend when some line carries a label; calling
            # legend() with none makes matplotlib warn on every redraw.
            handles, _ = ax.get_legend_handles_labels()
            if handles:
                ax.legend()
        self.fig.tight_layout()
        display.display(self.fig)

    def reset(self):
        """Discard all stored data points."""
        self.data = [{'X': [], 'Y': []} for _ in range(self.num_subplots)]
        print("Animator data has been reset.")

最后，将所有的训练过程封装进训练循环中

需要向循环内传递：

1. BalanceDataLoader封装的训练集、测试集；

2. 实例化的损失函数；

3. 实例化的优化器；

4. 实例化的学习率调度器；

5. PredictionRecorder实例化的recorder

6. Animator实例化的graph

In [27]:
import tqdm
import torch

def round_train(model, data_set, loss_fn, optimizer, is_train, round, recorder):
    """Run ``round`` random batches of 100 samples and return the mean loss.

    Args:
        model: Callable model exposing ``train()`` / ``eval()`` / ``parameters()``.
        data_set: Object exposing ``get_batch_data(batch_size, balance)``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer; only used when ``is_train`` is true.
        is_train: True runs optimisation steps with a tqdm progress bar;
            False runs forward passes only, under ``torch.no_grad()``.
        round: Number of batches to run.
        recorder: Object exposing ``clear()``; cleared once per call.

    Returns:
        The sum of the per-batch losses divided by ``round``.

    Note: batches are drawn with ``balance = True`` in both modes.
    """
    def next_batch():
        return data_set.get_batch_data(batch_size = 100, balance = True)

    if not is_train:
        # Evaluation: forward passes only, no gradient tracking.
        model.eval()
        recorder.clear()
        running_loss = 0
        with torch.no_grad():
            for _ in range(round):
                inputs, targets = next_batch()
                running_loss += loss_fn(model(inputs), targets).item()
        return running_loss/round

    # Training: one optimiser step per batch.
    model.train()
    recorder.clear()
    running_loss = 0
    for _ in tqdm.tqdm(range(round)):
        inputs, targets = next_batch()
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), targets)
        running_loss += batch_loss.item()
        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
    return running_loss/round

def epoch_train_test(model, train_set, test_set, loss_fn, optimizer, scheduler, epochs, graph, recorder, continue_train = 0):
    """Alternate one training round and one test round per epoch and plot the losses.

    Args:
        model, loss_fn, optimizer, recorder: Forwarded to ``round_train``.
        train_set / test_set: Data sets for the training and the test round.
        scheduler: Stepped once per epoch, after both rounds.
        epochs: Number of epochs to run in this call.
        graph: Plotter exposing ``reset()``, ``add(x, y, subplot_idx)`` and
            ``draw()``; train loss goes to subplot 0, test loss to subplot 3.
        continue_train: Epoch offset added to the plotted epoch number. When
            it equals 0 the plotter is reset first.
    """
    if continue_train == 0:
        graph.reset()

    # Arguments that are identical for the training and the test round.
    shared_kwargs = {
        'model': model,
        'loss_fn': loss_fn,
        'optimizer': optimizer,
        'round': 100,
        'recorder': recorder,
    }

    for epoch in range(epochs):
        current_epoch = epoch + 1 + continue_train
        train_loss = round_train(data_set = train_set, is_train = True, **shared_kwargs)
        test_loss = round_train(data_set = test_set, is_train = False, **shared_kwargs)

        scheduler.step()
        for subplot_idx, loss_value in ((0, train_loss), (3, test_loss)):
            graph.add(current_epoch, loss_value, subplot_idx = subplot_idx)
        graph.draw()