In [84]:
import numpy as np
import pandas as pd
import tushare as ts
from scipy.stats import norm, t
pro = ts.pro_api('700c1d6015ad030ff20bf310c088243da030e6b79a2a1098d58d2614')

In [85]:
assets_list = ['IH.CFX', 'IF.CFX', 'IC.CFX', 'AU.SHF', 'JM.DCE','RB.SHF','HC.SHF', 'I.DCE', 'M.DCE', 'CF.ZCE',]

In [86]:
data = pro.fut_daily(ts_code = 'IH.CFX', start_date = '20160101', end_date = '20200101')
data.sort_values(by = 'trade_date', inplace = True)
data['diff'] = data['close'] - data['close'].shift(10)
data

Unnamed: 0,ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,change1,change2,vol,amount,oi,oi_chg,diff
974,IH.CFX,20160104,2388.8,2403.6,2374.0,2388.4,2240.2,2254.0,2291.2,-149.6,-112.4,4786.0,334657.416,10650.0,,
973,IH.CFX,20160105,2254.0,2291.2,2250.2,2299.0,2211.8,2261.0,2254.2,-30.2,-37.0,7003.0,475842.732,11654.0,,
972,IH.CFX,20160106,2261.0,2254.2,2260.0,2303.4,2256.2,2288.0,2290.4,33.8,36.2,6342.0,433303.854,11130.0,,
971,IH.CFX,20160107,2288.0,2290.4,2268.0,2268.2,2140.0,2145.8,2220.8,-144.6,-69.6,1951.0,129990.540,10507.0,,
970,IH.CFX,20160108,2145.8,2220.8,2200.0,2244.8,2135.8,2194.2,2207.0,-26.6,-13.8,7394.0,487736.742,11371.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,IH.CFX,20191225,3003.0,3004.4,3001.6,3002.0,2982.6,2996.8,2989.6,-7.6,-14.8,18595.0,1670048.238,27489.0,,46.4
3,IH.CFX,20191226,2996.8,2989.6,2997.8,3022.6,2994.4,3017.8,3015.0,28.2,25.4,21835.0,1971234.180,28052.0,,78.8
2,IH.CFX,20191227,3017.8,3015.0,3020.2,3060.0,3020.2,3027.6,3031.8,12.6,16.8,29000.0,2646357.612,29979.0,,21.4
1,IH.CFX,20191230,3027.6,3031.8,3026.6,3078.0,3017.8,3067.8,3066.8,36.0,35.0,29123.0,2665393.578,29672.0,,68.2


In [87]:
pred_len = 5
threshold_ratio = 0.26

In [88]:
for asset_code in assets_list:

    data_1 = pro.fut_daily(ts_code = asset_code, start_date = '20160101', end_date = '20200101', fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol, amount,oi')
    data_2 = pro.fut_daily(ts_code = asset_code, start_date = '20180101', fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol, amount,oi')
    data = pd.concat([data_1, data_2], ignore_index = True)

    data['oi_chg'] = 1
    data.dropna(inplace=True)
    data.sort_values(by = 'trade_date', inplace = True)
    

    # 日内相对价格
    data['inday_chg_open'] = np.log(data['open'] / data['pre_close']) * 100 
    data['inday_chg_high'] = np.log(data['high'] / data['pre_close']) * 100 
    data['inday_chg_low'] = np.log(data['low'] / data['pre_close']) * 100 
    data['inday_chg_close'] = np.log(data['close'] / data['pre_close']) * 100
    data['inday_chg_amplitude'] = np.log(data['high'] / data['low']) * 100

    # 辅助变量 加强长距离信息传递
    data['ma_10'] = np.log(data['close'] / data['close'].rolling(window = 10).mean()) / 10**0.5 * 100
    data['ma_26'] = np.log(data['close'] / data['close'].rolling(window = 26).mean()) / 26**0.5 * 100
    data['ma_45'] = np.log(data['close'] / data['close'].rolling(window = 45).mean()) / 45**0.5 * 100
    data['ma_90'] = np.log(data['close'] / data['close'].rolling(window = 90).mean()) / 90**0.5 * 100
    data['ma_vol'] = np.log(data['vol'] / data['vol'].rolling(window = 90).mean())


    # 预测目标
    data['label_return'] = data['inday_chg_close'].rolling(window = pred_len).sum().shift(-pred_len) # 标准化为对数百分比（不含百分号），可以直接相加
    data['return_std_hist'] = data['label_return'].rolling(window = 250).std() # 过去一年的预测目标的标准差

    # Garman-Klass 方差计算估测目标的真实方差
    data['label_garman_klass_variance'] = (0.5 * (np.log(data['high']) - np.log(data['low'])) ** 2 - 0.386 * (np.log(data['close']) - np.log(data['open'])) ** 2)
    # 计算10日移动方差
    data['label_garman_klass_variance_pred'] = data['label_garman_klass_variance'].rolling(window = pred_len).sum().shift(-pred_len)
    # 计算10日移动标准差
    data['label_garman_klass_std_pred'] = data['label_garman_klass_variance_pred']**0.5
    # 再根据真实标准差均值来矫正预期
    data['label_garman_klass_std_hist'] = data['label_garman_klass_std_pred'].rolling(window = 250).mean()


    # 估计的标准差
    data['label_std'] = data['label_garman_klass_std_pred'] / data['label_garman_klass_std_hist'] * data['return_std_hist']
    print(data['label_std'].mean(), data['return_std_hist'].mean())
    # 振幅估计方法（已废弃）
    # data['average_log_amplitude'] = data['log_amplitude'].rolling(window = pred_len).mean().shift(-pred_len)
    # data['label_pred_high'] = data['high'].rolling(window = pred_len).max().shift(-pred_len)
    # data['label_pred_low'] = data['low'].rolling(window = pred_len).min().shift(-pred_len)
    # data['label_amplitude'] = data['label_pred_high'] - data['label_pred_low']
    # data['label_amplitude_ma'] = data['label_amplitude'].rolling(window = 250).mean()
    # data['label_std'] = data['label_amplitude'] / data['label_amplitude_ma'] * data['ma_return_std'] # 用预测目标的振幅作为波动率的辅助变量

    # 阈值分位数
    data['upper_bond'] = data['label_return'].rolling(window = 250).quantile(1 - threshold_ratio) # 过去一年的收益下分位数
    data['lower_bond'] = data['label_return'].rolling(window = 250).quantile(threshold_ratio) # 过去一年的收益上分位数
    data['threshold'] = (abs(data['upper_bond']) + abs(data['lower_bond']))/2 # 过去一年的收益的分割阈值

    def down_probability(row):
        return norm.cdf(row['lower_bond'], loc = row['label_return'], scale=row['label_std'])

    def middle_probability(row):
        return norm.cdf(row['upper_bond'], loc = row['label_return'], scale=row['label_std']) - norm.cdf(row['lower_bond'], loc = row['label_return'], scale=row['label_std'])

    def up_probability(row):
        return 1 - norm.cdf(row['upper_bond'], loc = row['label_return'], scale=row['label_std'])
        
    data['down_prob'] = data.apply(down_probability, axis = 1)
    data['middle_prob'] = data.apply(middle_probability, axis = 1)
    data['up_prob'] = data.apply(up_probability, axis = 1)


    data.dropna(inplace=True)
    print(asset_code, len(data))
    data.to_csv(f'{asset_code}.csv')

3.1564118080286367 3.123860978206624
IH.CFX 2603
3.187861118779522 3.1554935503184676
IF.CFX 2603
3.6425484397401227 3.6169649644012196
IC.CFX 2603
1.8529414024997954 1.8035651440405787
AU.SHF 2603
5.3094295632382185 5.43040290194594
JM.DCE 2603
3.6325758788389617 3.817356768111425
RB.SHF 2603
3.472534044353281 3.6342863339811444
HC.SHF 2603
5.731389297372083 5.910619405356468
I.DCE 2603
3.2132504130206394 3.278716125213049
M.DCE 2603
2.98679967956071 3.0119671402559454
CF.ZCE 2603


In [96]:
data = pd.read_csv('IC.CFX.csv')
data

Unnamed: 0.1,Unnamed: 0,ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,...,label_garman_klass_variance_pred,label_garman_klass_std_pred,label_garman_klass_std_hist,label_std,upper_bond,lower_bond,threshold,down_prob,middle_prob,up_prob
0,725,IC.CFX,20170110,6373.8,6368.8,6375.0,6388.6,6343.2,6344.6,6358.4,...,0.001440,0.037948,0.031431,4.822771,2.104474,-1.690117,1.897296,0.720413,0.194380,0.085207
1,724,IC.CFX,20170111,6344.6,6358.4,6339.2,6382.2,6295.0,6297.6,6310.6,...,0.001421,0.037692,0.031289,4.689206,2.104474,-1.690117,1.897296,0.659545,0.229306,0.111149
2,723,IC.CFX,20170112,6297.6,6310.6,6302.4,6321.2,6233.0,6238.4,6256.0,...,0.001424,0.037737,0.031147,4.666667,2.104474,-1.690117,1.897296,0.688161,0.215683,0.096156
3,722,IC.CFX,20170113,6238.4,6256.0,6245.2,6267.4,6170.8,6178.4,6196.8,...,0.001467,0.038296,0.030982,4.629389,2.104474,-1.608854,1.856664,0.495740,0.289917,0.214343
4,721,IC.CFX,20170116,6178.4,6196.8,6171.8,6190.0,5874.8,6011.4,5984.4,...,0.000417,0.020419,0.030734,2.482042,2.104474,-1.578517,1.841496,0.093562,0.471856,0.434581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,984,IC.CFX,20250917,7165.2,7162.0,7151.0,7263.4,7128.0,7252.4,7244.2,...,0.001569,0.039612,0.028831,5.974695,2.434802,-0.941787,1.688295,0.525514,0.209859,0.264627
2599,983,IC.CFX,20250918,7252.4,7244.2,7238.2,7351.6,7099.2,7171.6,7164.6,...,0.001049,0.032387,0.028853,4.877910,2.434802,-0.881487,1.658145,0.433916,0.262264,0.303819
2600,982,IC.CFX,20250919,7171.6,6960.2,6995.6,7036.8,6924.0,6984.0,6999.0,...,0.001019,0.031915,0.028866,4.803364,2.434802,-0.881487,1.658145,0.319987,0.268119,0.411894
2601,981,IC.CFX,20250922,6984.0,6999.0,7007.0,7019.8,6925.2,7013.2,6995.0,...,0.001183,0.034390,0.028876,5.176671,2.440780,-0.881487,1.661134,0.222351,0.228900,0.548748


In [97]:
data.describe()

Unnamed: 0.1,Unnamed: 0,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol,...,label_garman_klass_variance_pred,label_garman_klass_std_pred,label_garman_klass_std_hist,label_std,upper_bond,lower_bond,threshold,down_prob,middle_prob,up_prob
count,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,...,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0
mean,1484.157126,20205020.0,5704.172493,5701.709105,5704.320937,5755.070688,5650.919478,5704.001844,5703.543673,47239.064925,...,0.000812,0.026246,0.026177,3.642548,2.029141,-1.964298,1.996719,0.325771,0.344916,0.329313
std,843.121902,24623.96,749.56988,748.955159,749.170595,747.147594,748.944463,749.809911,749.341074,28888.735258,...,0.000868,0.011108,0.004686,1.863603,0.801982,0.871984,0.751449,0.261419,0.160085,0.255533
min,0.0,20170110.0,4033.2,4014.6,3949.0,4111.4,3937.8,4033.2,4047.8,2836.0,...,8.5e-05,0.009194,0.017364,0.988341,0.840944,-4.193842,1.006922,6e-06,0.001749,2.9e-05
25%,650.5,20181110.0,5057.8,5056.4,5061.8,5119.8,5002.2,5057.8,5056.4,17274.0,...,0.000345,0.018565,0.022457,2.304924,1.382863,-2.805839,1.386399,0.101946,0.231478,0.110129
50%,1555.0,20200520.0,5802.4,5805.2,5806.0,5863.8,5738.8,5802.0,5805.2,48478.0,...,0.000562,0.023697,0.025911,3.255488,1.81424,-1.662108,1.749966,0.260912,0.341489,0.274122
75%,2205.5,20230120.0,6269.8,6262.9,6262.2,6308.5,6214.5,6269.3,6264.3,65852.5,...,0.000938,0.030626,0.029649,4.48475,2.551911,-1.291224,2.51987,0.506474,0.458481,0.505765
max,2856.0,20250920.0,7596.4,7604.2,7608.8,7680.2,7534.2,7596.4,7604.2,168648.0,...,0.010654,0.10322,0.035273,15.602935,4.685099,-0.745814,3.808927,0.995891,0.8336,0.997783


In [98]:
data.describe().loc[['mean', 'std'],['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]]

Unnamed: 0,inday_chg_open,inday_chg_high,inday_chg_low,inday_chg_close,inday_chg_amplitude,ma_10,ma_26,ma_45,ma_90,ma_vol
mean,0.003495,0.909819,-0.953336,-0.00347,1.863155,0.001757,0.003655,0.001294,-0.008078,0.004499
std,0.648051,1.061278,1.146044,1.474431,1.060404,0.682045,0.693745,0.697633,0.667315,0.292362


In [99]:
data.describe().loc[['mean', 'std'],['label_return','down_prob','middle_prob','up_prob',]]

Unnamed: 0,label_return,down_prob,middle_prob,up_prob
mean,-0.005795,0.325771,0.344916,0.329313
std,3.827031,0.261419,0.160085,0.255533


In [93]:
import torch
feature = []
label = []
feature_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
label_columns = ['label_return','down_prob','middle_prob','up_prob',]

for asset_code in assets_list:
    data = pd.read_csv(f'{asset_code}.csv')
    feature.append(torch.tensor(data[feature_columns].values, dtype = torch.float32, device = 'cuda:0'))
    label.append(torch.tensor(data[label_columns].values, dtype = torch.float32, device = 'cuda:0'))

feature = torch.stack(feature, dim = 1)
label = torch.stack(label, dim = 1)

feature.shape, label.shape

(torch.Size([2603, 10, 10]), torch.Size([2603, 10, 4]))

In [94]:
seq_len = 30
feature = feature.unfold(dimension = 0, size = seq_len, step = 1).permute(0,1,3,2)
label = label[seq_len-1:]

feature.shape, label.shape

(torch.Size([2574, 10, 30, 10]), torch.Size([2574, 10, 4]))