In [24]:
import numpy as np
import pandas as pd
import tushare as ts
from scipy.stats import norm, t
pro = ts.pro_api('700c1d6015ad030ff20bf310c088243da030e6b79a2a1098d58d2614')

In [25]:
assets_list = ['IH.CFX', 'IF.CFX', 'IC.CFX', 'AU.SHF', 'JM.DCE','RB.SHF','HC.SHF', 'I.DCE', 'M.DCE', 'CF.ZCE',]

In [26]:
data = pro.fut_daily(ts_code = 'IH.CFX', start_date = '20160101', end_date = '20200101')
data.sort_values(by = 'trade_date', inplace = True)
data['diff'] = data['close'] - data['close'].shift(10)
data

Unnamed: 0,ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,change1,change2,vol,amount,oi,oi_chg,diff
974,IH.CFX,20160104,2388.8,2403.6,2374.0,2388.4,2240.2,2254.0,2291.2,-149.6,-112.4,4786.0,334657.416,10650.0,,
973,IH.CFX,20160105,2254.0,2291.2,2250.2,2299.0,2211.8,2261.0,2254.2,-30.2,-37.0,7003.0,475842.732,11654.0,,
972,IH.CFX,20160106,2261.0,2254.2,2260.0,2303.4,2256.2,2288.0,2290.4,33.8,36.2,6342.0,433303.854,11130.0,,
971,IH.CFX,20160107,2288.0,2290.4,2268.0,2268.2,2140.0,2145.8,2220.8,-144.6,-69.6,1951.0,129990.540,10507.0,,
970,IH.CFX,20160108,2145.8,2220.8,2200.0,2244.8,2135.8,2194.2,2207.0,-26.6,-13.8,7394.0,487736.742,11371.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,IH.CFX,20191225,3003.0,3004.4,3001.6,3002.0,2982.6,2996.8,2989.6,-7.6,-14.8,18595.0,1670048.238,27489.0,,46.4
3,IH.CFX,20191226,2996.8,2989.6,2997.8,3022.6,2994.4,3017.8,3015.0,28.2,25.4,21835.0,1971234.180,28052.0,,78.8
2,IH.CFX,20191227,3017.8,3015.0,3020.2,3060.0,3020.2,3027.6,3031.8,12.6,16.8,29000.0,2646357.612,29979.0,,21.4
1,IH.CFX,20191230,3027.6,3031.8,3026.6,3078.0,3017.8,3067.8,3066.8,36.0,35.0,29123.0,2665393.578,29672.0,,68.2


In [27]:
pred_len = 5
threshold_ratio = 0.25

In [28]:
for asset_code in assets_list:

    data_1 = pro.fut_daily(ts_code = asset_code, start_date = '20160101', end_date = '20200101', fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol, amount,oi')
    data_2 = pro.fut_daily(ts_code = asset_code, start_date = '20180101', fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol, amount,oi')
    data = pd.concat([data_1, data_2], ignore_index = True)

    data['oi_chg'] = 1
    data.dropna(inplace=True)
    data.sort_values(by = 'trade_date', inplace = True)
    

    # 日内相对价格
    data['inday_chg_open'] = np.log(data['open'] / data['pre_close']) * 100 
    data['inday_chg_high'] = np.log(data['high'] / data['pre_close']) * 100 
    data['inday_chg_low'] = np.log(data['low'] / data['pre_close']) * 100 
    data['inday_chg_close'] = np.log(data['close'] / data['pre_close']) * 100
    data['inday_chg_amplitude'] = np.log(data['high'] / data['low']) * 100

    # 辅助变量 加强长距离信息传递
    data['ma_10'] = np.log(data['close'] / data['close'].rolling(window = 10).mean()) / 10**0.5 * 100
    data['ma_26'] = np.log(data['close'] / data['close'].rolling(window = 26).mean()) / 26**0.5 * 100
    data['ma_45'] = np.log(data['close'] / data['close'].rolling(window = 45).mean()) / 45**0.5 * 100
    data['ma_90'] = np.log(data['close'] / data['close'].rolling(window = 90).mean()) / 90**0.5 * 100
    data['ma_vol'] = np.log(data['vol'] / data['vol'].rolling(window = 90).mean())


    # 预测目标
    data['label_return'] = data['inday_chg_close'].rolling(window = pred_len).sum().shift(-pred_len) # 标准化为对数百分比（不含百分号），可以直接相加
    data['return_std_hist'] = data['label_return'].rolling(window = 250).std() # 过去一年的预测目标的标准差

    # Garman-Klass 方差计算估测目标的真实方差
    data['label_garman_klass_variance'] = (0.5 * (np.log(data['high']) - np.log(data['low'])) ** 2 - 0.386 * (np.log(data['close']) - np.log(data['open'])) ** 2)
    # 计算10日移动方差
    data['label_garman_klass_variance_pred'] = data['label_garman_klass_variance'].rolling(window = pred_len).sum().shift(-pred_len)
    # 计算10日移动标准差
    data['label_garman_klass_std_pred'] = data['label_garman_klass_variance_pred']**0.5
    # 再根据真实标准差均值来矫正预期
    data['label_garman_klass_std_hist'] = data['label_garman_klass_std_pred'].rolling(window = 250).mean()


    # 估计的标准差
    data['label_std'] = data['label_garman_klass_std_pred'] / data['label_garman_klass_std_hist'] * data['return_std_hist']
    print(data['label_std'].mean(), data['return_std_hist'].mean())
    # 振幅估计方法（已废弃）
    # data['average_log_amplitude'] = data['log_amplitude'].rolling(window = pred_len).mean().shift(-pred_len)
    # data['label_pred_high'] = data['high'].rolling(window = pred_len).max().shift(-pred_len)
    # data['label_pred_low'] = data['low'].rolling(window = pred_len).min().shift(-pred_len)
    # data['label_amplitude'] = data['label_pred_high'] - data['label_pred_low']
    # data['label_amplitude_ma'] = data['label_amplitude'].rolling(window = 250).mean()
    # data['label_std'] = data['label_amplitude'] / data['label_amplitude_ma'] * data['ma_return_std'] # 用预测目标的振幅作为波动率的辅助变量

    # 阈值分位数
    data['upper_bond'] = data['label_return'].rolling(window = 250).quantile(1 - threshold_ratio) # 过去一年的收益下分位数
    data['lower_bond'] = data['label_return'].rolling(window = 250).quantile(threshold_ratio) # 过去一年的收益上分位数
    data['threshold'] = (abs(data['upper_bond']) + abs(data['lower_bond']))/2 # 过去一年的收益的分割阈值

    def down_probability(row):
        return norm.cdf(-row['threshold'], loc = row['label_return'], scale=row['label_std'])

    def middle_probability(row):
        return norm.cdf(row['threshold'], loc = row['label_return'], scale=row['label_std']) - norm.cdf(-row['threshold'], loc = row['label_return'], scale=row['label_std'])

    def up_probability(row):
        return 1 - norm.cdf(row['threshold'], loc = row['label_return'], scale=row['label_std'])
        
    data['down_prob'] = data.apply(down_probability, axis = 1)
    data['middle_prob'] = data.apply(middle_probability, axis = 1)
    data['up_prob'] = data.apply(up_probability, axis = 1)


    data.dropna(inplace=True)
    print(asset_code, len(data))
    data.to_csv(f'{asset_code}.csv')

3.1564118080286367 3.123860978206624
IH.CFX 2603
3.187861118779522 3.1554935503184676
IF.CFX 2603
3.6425484397401227 3.6169649644012196
IC.CFX 2603
1.8529414024997954 1.8035651440405787
AU.SHF 2603
5.3094295632382185 5.43040290194594
JM.DCE 2603
3.6325758788389617 3.817356768111425
RB.SHF 2603
3.472534044353281 3.6342863339811444
HC.SHF 2603
5.731389297372083 5.910619405356468
I.DCE 2603
3.2132504130206394 3.278716125213049
M.DCE 2603
2.98679967956071 3.0119671402559454
CF.ZCE 2603


In [29]:
data = pd.read_csv('IH.CFX.csv')
data

Unnamed: 0.1,Unnamed: 0,ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,...,label_garman_klass_variance_pred,label_garman_klass_std_pred,label_garman_klass_std_hist,label_std,upper_bond,lower_bond,threshold,down_prob,middle_prob,up_prob
0,725,IH.CFX,20170110,2312.2,2313.0,2312.2,2317.4,2305.2,2307.4,2311.2,...,0.000286,0.016909,0.022491,1.916339,1.417117,-1.099929,1.258523,0.143049,0.454399,0.402552
1,724,IH.CFX,20170111,2307.4,2311.2,2310.2,2315.2,2296.4,2298.6,2301.6,...,0.000515,0.022703,0.022361,2.554750,1.443948,-1.084535,1.264241,0.100778,0.286120,0.613103
2,723,IH.CFX,20170112,2298.6,2301.6,2299.0,2305.8,2286.2,2290.8,2294.4,...,0.000517,0.022747,0.022252,2.539499,1.456196,-1.074567,1.265382,0.121043,0.310193,0.568765
3,722,IH.CFX,20170113,2290.8,2294.4,2291.2,2315.0,2290.2,2305.0,2304.6,...,0.000484,0.021997,0.022134,2.408489,1.469342,-1.055678,1.262510,0.092846,0.298794,0.608359
4,721,IH.CFX,20170116,2305.0,2304.6,2302.0,2332.8,2285.0,2325.2,2316.0,...,0.000347,0.018627,0.022013,2.050490,1.469342,-1.055678,1.262510,0.143063,0.422372,0.434565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,984,IH.CFX,20250917,2950.6,2950.4,2951.2,2962.6,2935.0,2956.2,2953.6,...,0.000562,0.023716,0.023169,3.355750,1.445473,-0.970611,1.208042,0.423004,0.277472,0.299524
2599,983,IH.CFX,20250918,2956.2,2953.6,2965.6,2976.0,2888.4,2910.8,2907.8,...,0.000279,0.016692,0.023178,2.357174,1.448386,-0.962091,1.205239,0.129121,0.327899,0.542980
2600,982,IH.CFX,20250919,2910.8,2903.4,2917.4,2932.0,2904.4,2913.4,2917.4,...,0.000264,0.016258,0.023185,2.294588,1.448386,-0.962091,1.205239,0.161938,0.363570,0.474492
2601,981,IH.CFX,20250922,2913.4,2917.4,2914.2,2930.0,2902.0,2923.0,2917.4,...,0.000607,0.024646,0.023207,3.475875,1.457072,-0.962091,1.209582,0.193526,0.239377,0.567097


In [30]:
data.describe()

Unnamed: 0.1,Unnamed: 0,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol,...,label_garman_klass_variance_pred,label_garman_klass_std_pred,label_garman_klass_std_hist,label_std,upper_bond,lower_bond,threshold,down_prob,middle_prob,up_prob
count,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,...,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0,2603.0
mean,1484.157126,20205020.0,2777.415521,2777.036189,2778.468383,2800.349212,2756.473454,2777.729927,2777.554752,28874.480599,...,0.000587,0.022324,0.022296,3.156412,1.871576,-1.756862,1.814219,0.308048,0.360005,0.331946
std,843.121902,24623.96,336.557854,336.043577,337.095642,339.962083,332.612033,336.492251,336.003339,14012.605209,...,0.000697,0.009398,0.003217,1.648481,0.655918,0.721822,0.593452,0.253777,0.161591,0.259459
min,0.0,20170110.0,2200.2,2203.0,2193.0,2226.0,2128.6,2200.2,2206.6,2974.0,...,6e-05,0.007753,0.015911,0.814053,0.837249,-3.745993,0.968724,1.6e-05,0.002864,2.1e-05
25%,650.5,20181110.0,2507.1,2507.9,2509.8,2530.1,2488.9,2507.6,2508.0,17469.0,...,0.000262,0.016191,0.019228,2.086444,1.327978,-1.936338,1.431545,0.082328,0.24121,0.106938
50%,1555.0,20200520.0,2726.6,2726.2,2729.4,2751.0,2706.0,2726.6,2726.2,29099.0,...,0.000423,0.020572,0.022779,2.75372,1.750824,-1.68587,1.607968,0.247303,0.357809,0.281547
75%,2205.5,20230120.0,2943.1,2944.2,2946.8,2964.3,2925.8,2943.2,2944.3,38302.0,...,0.000666,0.0258,0.024944,3.747515,2.280733,-1.364517,2.178523,0.488992,0.469631,0.513266
max,2856.0,20250920.0,4020.4,4020.4,4111.4,4115.0,3978.6,4020.4,4020.4,113039.0,...,0.011727,0.108293,0.027831,17.293294,3.540394,-0.575689,3.242867,0.985889,0.905178,0.996462


In [31]:
data.describe().loc[['mean', 'std'],['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]]

Unnamed: 0,inday_chg_open,inday_chg_high,inday_chg_low,inday_chg_close,inday_chg_amplitude,ma_10,ma_26,ma_45,ma_90,ma_vol
mean,0.036266,0.820081,-0.751854,0.01169,1.571935,0.01005,0.018507,0.022986,0.031057,0.008478
std,0.632646,0.987578,1.002379,1.279592,0.911914,0.588265,0.570456,0.559431,0.541272,0.259278


In [32]:
data.describe().loc[['mean', 'std'],['label_return','down_prob','middle_prob','up_prob',]]

Unnamed: 0,label_return,down_prob,middle_prob,up_prob
mean,0.061337,0.308048,0.360005,0.331946
std,3.260663,0.253777,0.161591,0.259459


In [33]:
import torch
feature = []
label = []
feature_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
label_columns = ['label_return','down_prob','middle_prob','up_prob',]

for asset_code in assets_list:
    data = pd.read_csv(f'{asset_code}.csv')
    feature.append(torch.tensor(data[feature_columns].values, dtype = torch.float32, device = 'cuda:0'))
    label.append(torch.tensor(data[label_columns].values, dtype = torch.float32, device = 'cuda:0'))

feature = torch.stack(feature, dim = 1)
label = torch.stack(label, dim = 1)

feature.shape, label.shape

(torch.Size([2603, 10, 10]), torch.Size([2603, 10, 4]))

In [34]:
seq_len = 30
feature = feature.unfold(dimension = 0, size = seq_len, step = 1).permute(0,1,3,2)
label = label[seq_len-1:]

feature.shape, label.shape

(torch.Size([2574, 10, 30, 10]), torch.Size([2574, 10, 4]))