In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from datetime import datetime, timedelta
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cpu")

In [3]:
X = np.load('npy_v2/X_fe.npy')
Y = np.load('npy_v2/Y_fe.npy')
dates = np.load('npy_v2/Y_dates.npy')

print('Shape of X: ', X.shape)
print('Shape of Y: ', Y.shape)

Shape of X:  (930008, 15, 30)
Shape of Y:  (930008,)


In [4]:
'''
对X进行窗口标准化
'''
class myDataset(Dataset):
    '''
    自定义数据集，将原始数据从 numpy arrays 转换成 float 格式的 tensors
    '''
    
    def __init__(self, X, y, scaler = None, is_train = True):
        super(myDataset, self).__init__()
        self.y = y.reshape(-1, 1)
        
        self.origin_shape = X.shape
        
        # (B, n, T) → (B*T, n)
        X_2d = X.reshape(-1, self.origin_shape[1]) 
        
        #训练模式，同时完成 拟合（计算均值和标准差） 和 转换（应用标准化）
        if is_train: 
            self.scaler = StandardScaler()
            X_trans = self.scaler.fit_transform(X_2d)
            #X_trans = np.clip(X_trans, -5, 5)  # 限制标准化后的值在±5个标准差内
        
        #验证/测试模式，仅进行 转换（应用标准化），不重新计算均值和标准差
        #预先计算好的均值和标准差存储在标准化器（StandardScaler）的内部属性中
        
        else: 
            self.scaler = scaler
            X_trans = self.scaler.transform(X_2d)
            
        self.X = X_trans.reshape(self.origin_shape)
        self.X = torch.as_tensor(self.X, dtype=torch.float32)
        self.y = torch.as_tensor(self.y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def get_scaler(self):
        return self.scaler

In [5]:
target_dates = np.array([datetime.strptime(str(date), '%Y-%m-%d').date() for date in dates])
unique_dates = sorted(np.unique(target_dates))

In [6]:
#用于得到不同的轮次,确保每个轮次为1626天

start_dates = []
starts, valid_starts, test_starts, ends,  = [], [], [], []

i, start, end = 0, 0, 0

k = int(1500*0.8) #按1：1划分训练集和测试集

while i + 1500 + 126 <= len(unique_dates):
    start_dates.append(i)
    
    start = sum(target_dates < unique_dates[i])
    starts.append(start)
    
    valid_start = sum(target_dates < unique_dates[i+k]) #训练集终点
    valid_starts.append(valid_start)
    
    test_start = sum(target_dates < unique_dates[i+1500]) #验证集重点（1500天）
    test_starts.append(test_start)
    
    end = sum(target_dates < unique_dates[i+1500+126]) #测试集终点（再126天）
    ends.append(end)
    
    i += 126

In [7]:
def get_time_range(round_idx):
    # 获取指定轮次的时间区间
    train_start_date = dates[starts[round_idx]]
    valid_start_date = dates[valid_starts[round_idx]]
    test_start_date = dates[test_starts[round_idx]]
    test_end_date = dates[ends[round_idx]]
    
    return {
        "train": (train_start_date, valid_start_date),
        "valid": (valid_start_date, test_start_date),
        "test": (test_start_date, test_end_date)
    }

In [8]:
from Models_v2.AlphaNet_v2 import AlphaNet_v2

In [9]:
def save_model(model, path_name):
    torch.save(model.state_dict(), path_name)

def load_model(model, path_name):
    weights = torch.load(path_name)
    model.load_state_dict(weights)

In [10]:
# 用小量数据测试模型是否能正常工作
net = AlphaNet_v2(d=10, stride=10, n=15)
net(torch.tensor(X[:5]).float())

tensor([[-0.5658],
        [ 0.2655],
        [-1.4989],
        [ 0.9970],
        [ 0.8023]], grad_fn=<AddmmBackward0>)

In [11]:
def compute_RankIC(X, Y, model, target_dates):
    
    results = []
    unique_dates = np.unique(target_dates)
    
    # 针对每个目标日期，对比当天真的股票收益率排名和预测的排名
    for date in unique_dates:
        
        # 获取当日所有股票的信息
        idx = np.where(target_dates==date)[0]
        
        # 当日小于20支股票，跳过该日
        if len(idx) < 20:
            continue
        
        # 预测个股收益率值
        model.eval()
        y_pred = model(torch.tensor(X[idx]).float()).squeeze().detach().numpy()
        y_true = Y[idx]
        
        # 计算排名
        rank_pred = stats.rankdata(y_pred)
        rank_ture = stats.rankdata(y_true)
        
        # 计算排名之间的相关度
        correlation, _ = stats.spearmanr(rank_pred, rank_ture)
        results.append(correlation)
        
    return np.array(results)

In [14]:
import pickle

results = []

model_name = 'alphanet'

cnt = 0

# 使用每个训练区间的最佳模型，来预测对应区间测试集的收益率，计算IC值
for cnt, (start, valid_start, test_start, end) in enumerate(zip(starts, valid_starts, test_starts, ends)):
    
    # 导入模型
    model_path = 'saved_models_v2/' + model_name + '_' + str(cnt) + '.pt'
    net = AlphaNet_v2(d=10, stride=10, n=15)
    net.load_state_dict(torch.load(model_path))
    net.eval()
    
    # 预测 + 验证
    test_res = compute_RankIC(X[test_start:end], Y[test_start:end], net, target_dates[test_start:end])
    
    mean_ic = np.mean(test_res)
    std_ic = np.std(test_res)
    ic_ratio = mean_ic / std_ic
    positive_ratio = np.sum(test_res > 0) / len(test_res)
    
    print(
        f"Model: {model_path} | "
        f"Mean IC: {100 * mean_ic:.2f}% | "
        f"Std IC: {100 * std_ic:.2f}% | "
        f"IC Ratio: {ic_ratio:.4f} | "
        f"Positive Rate: {100 * positive_ratio:.2f}%"
    )
    
    results.append(test_res)

with open('test_results_v2.pickle', 'wb') as f:
    pickle.dump(results, f)
    

Model: saved_models_v2/alphanet_0.pt | Mean IC: 0.96% | Std IC: 12.65% | IC Ratio: 0.0760 | Positive Rate: 50.00%
Model: saved_models_v2/alphanet_1.pt | Mean IC: -1.86% | Std IC: 12.18% | IC Ratio: -0.1527 | Positive Rate: 41.27%
Model: saved_models_v2/alphanet_2.pt | Mean IC: -0.69% | Std IC: 8.16% | IC Ratio: -0.0851 | Positive Rate: 50.00%
Model: saved_models_v2/alphanet_3.pt | Mean IC: 0.19% | Std IC: 7.92% | IC Ratio: 0.0238 | Positive Rate: 53.17%
Model: saved_models_v2/alphanet_4.pt | Mean IC: 0.14% | Std IC: 8.69% | IC Ratio: 0.0158 | Positive Rate: 51.59%
Model: saved_models_v2/alphanet_5.pt | Mean IC: 0.49% | Std IC: 8.27% | IC Ratio: 0.0592 | Positive Rate: 50.00%
Model: saved_models_v2/alphanet_6.pt | Mean IC: -0.74% | Std IC: 10.52% | IC Ratio: -0.0699 | Positive Rate: 44.44%
Model: saved_models_v2/alphanet_7.pt | Mean IC: -0.13% | Std IC: 9.11% | IC Ratio: -0.0148 | Positive Rate: 54.76%
Model: saved_models_v2/alphanet_8.pt | Mean IC: 1.53% | Std IC: 11.41% | IC Ratio: 0.

In [13]:
plt.hist(y_preds, bins=50)
plt.title("Distribution of Model Predictions")
plt.xlabel("Predicted Return")
plt.ylabel("Frequency")

NameError: name 'y_preds' is not defined

In [None]:
time_range = get_time_range(10)
print(f"""
训练集: {time_range['train'][0]} 至 {time_range['train'][1]}
验证集: {time_range['valid'][0]} 至 {time_range['valid'][1]}
测试集: {time_range['test'][0]} 至 {time_range['test'][1]}
""")

In [None]:
net.eval()
part = range(-200,0)
y_preds = net(torch.tensor(X[part]).float()).squeeze().detach().numpy()

In [None]:
y_real = (torch.tensor(Y[part]).float()).squeeze().detach().numpy()

In [None]:
y_real