In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, t

In [None]:
import os

# Work from the project root so that 'config.yaml' and the 'data/' folder resolve
# as relative paths. The location can be overridden with the
# INDEX_FUTURE_PROJECT_DIR environment variable; when it is unset the original
# hard-coded path is used.
os.chdir(os.environ.get('INDEX_FUTURE_PROJECT_DIR', 'd:/future/Index_Future_Prediction'))

import yaml
# The API token is kept in config.yaml under the 'token' key instead of in the notebook.
with open('config.yaml', 'r') as file:
    token = yaml.safe_load(file)['token']

import tushare as ts
# Tushare Pro client used by every download cell below.
pro = ts.pro_api(token)

In [None]:
# Fetch the futures contract list from Tushare and display it.
# NOTE(review): fut_type = 2 presumably selects main/continuous contracts — confirm against the Tushare docs.
future_list = pro.fut_basic(fut_type = 2)
future_list

In [None]:
# Inspection window; both bounds are exclusive because the filter uses strict comparisons.
start_date = '20170901'
end_date = '20230901'

# Download the full IF.CFX history and keep only the rows strictly inside the window.
data = pro.fut_daily(ts_code = 'IF.CFX')
inside_window = (data['trade_date'] > start_date) & (data['trade_date'] < end_date)
data = data[inside_window].copy()
data

In [None]:
# Same inspection for TL1.CFX, using the start_date / end_date window defined earlier.
data = pro.fut_daily(ts_code = 'TL1.CFX')
inside_window = (data['trade_date'] > start_date) & (data['trade_date'] < end_date)
data = data[inside_window].copy()
data

In [None]:
import matplotlib.pyplot as plt

# Plot the close price against real trading dates, oldest first. Plotting the
# bare column would use the positional row index as the x-axis, which depends on
# whatever row order the API returned.
ordered = data.sort_values('trade_date')
trade_days = pd.to_datetime(ordered['trade_date'].astype(str), format = '%Y%m%d')

fig, ax = plt.subplots()
ax.plot(trade_days, ordered['close'])
ax.set(title = 'Close price', xlabel = 'trade_date', ylabel = 'close')
plt.show()

In [None]:
# assets_list = []
# assets_names = []
# for i in future_list.index:
#     assets_code = future_list.loc[i, 'ts_code']
#     assets_name = future_list.loc[i, 'name']

#     if "主力" not in assets_name:
#         continue

#     data = pro.fut_daily(ts_code = assets_code)
#     data_head = data.head(100)

#     if len(data) < 1000:
#         continue
#     if data_head['oi'].mean() < 50000:
#         continue
#     assets_list.append(assets_code)
#     assets_names.append(assets_name)

# len(assets_list)

In [None]:
# Universe of continuous futures codes, assembled sector by sector.
assets_list = (
    # Equity index futures
    ['IH.CFX', 'IF.CFX', 'IC.CFX']
    # Government bond futures
    + ['TS.CFX', 'TF.CFX', 'T.CFX', 'TL1.CFX']
    # Ferrous metals chain
    + ['I.DCE', 'JM.DCE', 'RB.SHF', 'HC.SHF', 'SS.SHF', 'SF.ZCE', 'SM.ZCE']
    # Non-ferrous metals
    + ['CU.SHF', 'AL.SHF', 'ZN.SHF', 'NI.SHF']
    # Precious metals
    + ['AU.SHF', 'AG.SHF']
    # Energy and chemicals
    + ['FU.SHF', 'LU.INE', 'BU.SHF', 'PG.DCE', 'TA.ZCE', 'EG.DCE', 'PF.ZCE']
    + ['L.DCE', 'PP.DCE', 'V.DCE', 'EB.DCE', 'MA.ZCE', 'UR.ZCE', 'RU.SHF']
    # Agricultural products
    + ['A.DCE', 'B.DCE', 'M.DCE', 'RM.ZCE', 'Y.DCE', 'OI.ZCE', 'P.DCE', 'PK.ZCE']
    + ['C.DCE', 'CS.DCE', 'CF.ZCE', 'SR.ZCE', 'CJ.ZCE', 'AP.ZCE', 'SP.SHF']
    + ['JD.DCE', 'LH.DCE']
    # Building materials
    + ['FG.ZCE', 'SA.ZCE']
)

In [None]:
# assets_list = ['IH.CFX', 'IF.CFX', 'IC.CFX', 'AU.SHF', 'JM.DCE','RB.SHF','HC.SHF', 'I.DCE', 'M.DCE', 'CF.ZCE',]

In [None]:
# Spot-check the raw daily bars of one asset (SF.ZCE) over the same 20100101-20180101
# window that the preprocessing loop requests first.
data = pro.fut_daily(ts_code = 'SF.ZCE', start_date = '20100101', end_date = '20180101')
data

In [None]:
# Label horizon in rows: the preprocessing loop sums the next `pred_len` daily log returns into label_return.
pred_len = 5
# Quantile level of the rolling return bands: lower band at this quantile, upper band at 1 - this quantile.
threshold_ratio = 0.26

In [None]:
# Columns requested from fut_daily for every asset.
daily_fields = 'ts_code,trade_date,pre_close,open,high,low,close,vol'

# Make sure the output folder exists before the first to_csv call.
os.makedirs('data', exist_ok = True)

for asset_code in assets_list:

    # Download the history in two request windows and stitch them together.
    data_1 = pro.fut_daily(ts_code = asset_code, start_date = '20100101', end_date = '20180101', fields = daily_fields)
    data_2 = pro.fut_daily(ts_code = asset_code, start_date = '20180101', fields = daily_fields)
    data = pd.concat([data_1, data_2], ignore_index = True)

    # Both windows use 20180101 as a boundary; drop any repeated trade_date so a
    # boundary row can never be counted twice in the rolling windows below.
    data.drop_duplicates(subset = 'trade_date', inplace = True)
    data.sort_values(by = 'trade_date', inplace = True)

    # Missing OHLC prices fall back to the previous close.
    for price_col in ('open', 'high', 'low', 'close'):
        data[price_col] = data[price_col].fillna(data['pre_close'])

    # A zero volume would make log(vol / rolling mean) below undefined, so use 1 instead.
    data['vol'] = data['vol'].replace(0, 1)

    data.dropna(inplace=True)

    # Intraday prices relative to the previous close, as log returns in percent.
    data['inday_chg_open'] = np.log(data['open'] / data['pre_close']) * 100
    data['inday_chg_high'] = np.log(data['high'] / data['pre_close']) * 100
    data['inday_chg_low'] = np.log(data['low'] / data['pre_close']) * 100
    data['inday_chg_close'] = np.log(data['close'] / data['pre_close']) * 100
    data['inday_chg_amplitude'] = np.log(data['high'] / data['low']) * 100

    # Auxiliary features carrying longer-range information: log distance of the
    # close from its moving average, divided by sqrt(window), in percent.
    for window in (10, 26, 45, 90):
        data[f'ma_{window}'] = np.log(data['close'] / data['close'].rolling(window = window).mean()) / window**0.5 * 100
    data['ma_vol'] = np.log(data['vol'] / data['vol'].rolling(window = 90).mean())

    # Prediction target: sum of the next pred_len daily log returns (log percent, so they add up).
    data['label_return'] = data['inday_chg_close'].rolling(window = pred_len).sum().shift(-pred_len)
    # Standard deviation of the target over a trailing 250-row window.
    # NOTE(review): label_return is forward-looking, so this window already contains
    # prices after the current row; acceptable for label construction, not for features.
    data['return_std_hist'] = data['label_return'].rolling(window = 250).std()

    # Garman-Klass per-row variance estimate from the OHLC prices.
    data['label_garman_klass_variance'] = (0.5 * (np.log(data['high']) - np.log(data['low'])) ** 2 - 0.386 * (np.log(data['close']) - np.log(data['open'])) ** 2)
    # Sum of the per-row variances over the next pred_len rows (same horizon as label_return).
    data['label_garman_klass_variance_pred'] = data['label_garman_klass_variance'].rolling(window = pred_len).sum().shift(-pred_len)
    # Corresponding standard deviation over the horizon.
    data['label_garman_klass_std_pred'] = data['label_garman_klass_variance_pred']**0.5
    # Trailing 250-row mean of that standard deviation, used to normalise it.
    data['label_garman_klass_std_hist'] = data['label_garman_klass_std_pred'].rolling(window = 250).mean()

    # Estimated target std: relative Garman-Klass level rescaled by the historical std of the target.
    data['label_std'] = data['label_garman_klass_std_pred'] / data['label_garman_klass_std_hist'] * data['return_std_hist']

    # A zero scale is invalid for the normal CDF below; replace it with the column mean.
    data['label_std'] = data['label_std'].replace(0, data['label_std'].mean())

    # print(data['label_std'].mean(), data['return_std_hist'].mean())
    # Amplitude-based estimate (deprecated)
    # data['average_log_amplitude'] = data['log_amplitude'].rolling(window = pred_len).mean().shift(-pred_len)
    # data['label_pred_high'] = data['high'].rolling(window = pred_len).max().shift(-pred_len)
    # data['label_pred_low'] = data['low'].rolling(window = pred_len).min().shift(-pred_len)
    # data['label_amplitude'] = data['label_pred_high'] - data['label_pred_low']
    # data['label_amplitude_ma'] = data['label_amplitude'].rolling(window = 250).mean()
    # data['label_std'] = data['label_amplitude'] / data['label_amplitude_ma'] * data['ma_return_std']

    # Rolling quantile bands of the target over a trailing 250-row window.
    data['upper_bond'] = data['label_return'].rolling(window = 250).quantile(1 - threshold_ratio) # upper quantile
    data['lower_bond'] = data['label_return'].rolling(window = 250).quantile(threshold_ratio) # lower quantile
    data['threshold'] = (abs(data['upper_bond']) + abs(data['lower_bond']))/2 # mean absolute band level

    # Soft class labels: probability mass of N(label_return, label_std) below the
    # lower band, between the bands and above the upper band. norm.cdf is
    # vectorised, so it is evaluated once per band for the whole column instead
    # of once per row through DataFrame.apply.
    centre = data['label_return'].to_numpy()
    spread = data['label_std'].to_numpy()
    cdf_lower = norm.cdf(data['lower_bond'].to_numpy(), loc = centre, scale = spread)
    cdf_upper = norm.cdf(data['upper_bond'].to_numpy(), loc = centre, scale = spread)

    data['down_prob'] = cdf_lower
    data['middle_prob'] = cdf_upper - cdf_lower
    data['up_prob'] = 1 - cdf_upper

    # Drop the warm-up rows of the rolling windows and the tail rows without a full horizon.
    data.dropna(inplace=True)
    print(asset_code, len(data))
    data.to_csv(f'data/{asset_code}.csv')

In [None]:
# Reload the processed SF.ZCE file written by the preprocessing loop and display it.
data = pd.read_csv('data/SF.ZCE.csv')
data

In [None]:
# Reload the processed IH.CFX file and show only the rows after 2021-01-01.
# trade_date is compared with an integer here, unlike the string comparison used on API frames.
data = pd.read_csv('data/IH.CFX.csv')
after_2021 = data['trade_date'] > 20210101
data[after_2021]

In [None]:
# Reload the processed LU.INE file and show only the rows after 2021-01-01.
data = pd.read_csv('data/LU.INE.csv')
after_2021 = data['trade_date'] > 20210101
data[after_2021]

In [None]:
# Raw IH.CFX bars (including settlement and turnover columns) for comparison with the processed file.
# The field list is written without spaces, matching the LU.INE request, so that
# 'amount' is not sent to the API as ' amount'.
data = pro.fut_daily(ts_code = 'IH.CFX', start_date = '20180101', fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol,amount')
data[data['trade_date'] > '20210101']

In [None]:
# Raw LU.INE bars (including settlement and turnover columns) for comparison with the processed file.
raw_fields = 'ts_code,trade_date,pre_close,pre_settle,open,high,low,close,settle,vol,amount'
data = pro.fut_daily(ts_code = 'LU.INE', start_date = '20180101', fields = raw_fields)
after_2021 = data['trade_date'] > '20210101'
data[after_2021]

In [None]:
# Summary statistics of whatever frame `data` currently holds; in top-to-bottom
# order that is the raw LU.INE frame fetched in the preceding cell.
data.describe()

In [None]:
# Mean / std of the engineered feature columns.
stat_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
# In top-to-bottom order `data` holds a raw API frame at this point, which has none
# of these columns, so selecting them would raise KeyError. Fall back to a processed
# file in that case; a processed frame already held in `data` is left untouched.
# NOTE(review): LU.INE is used because it is the last processed file inspected above — change if another asset was intended.
if not set(stat_columns).issubset(data.columns):
    data = pd.read_csv('data/LU.INE.csv')
data.describe().loc[['mean', 'std'], stat_columns]

In [None]:
# Mean / std of the label columns.
stat_columns = ['label_return','down_prob','middle_prob','up_prob',]
# In top-to-bottom order `data` may still hold a raw API frame without these columns,
# which would raise KeyError. Fall back to a processed file in that case; a processed
# frame already held in `data` is left untouched.
# NOTE(review): LU.INE is used because it is the last processed file inspected above — change if another asset was intended.
if not set(stat_columns).issubset(data.columns):
    data = pd.read_csv('data/LU.INE.csv')
data.describe().loc[['mean', 'std'], stat_columns]

In [None]:
import torch

feature_columns = ['inday_chg_open','inday_chg_high','inday_chg_low','inday_chg_close','inday_chg_amplitude', 'ma_10','ma_26','ma_45','ma_90','ma_vol',]
label_columns = ['label_return','down_prob','middle_prob','up_prob',]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# The preprocessing loop writes its output to data/<code>.csv, so read from the same folder.
frames = [pd.read_csv(f'data/{asset_code}.csv') for asset_code in assets_list]

# torch.stack requires every per-asset tensor to have the same number of rows, but
# the files can differ in length. Keep only the trade dates present in every file;
# when all files already cover identical dates this changes nothing.
# NOTE(review): the shared range is bounded by the asset with the shortest history.
common_dates = set(frames[0]['trade_date'])
for frame in frames[1:]:
    common_dates &= set(frame['trade_date'])
if not common_dates:
    raise ValueError('no trade_date is shared by all assets in assets_list')

feature = []
label = []
for data in frames:
    data = data[data['trade_date'].isin(common_dates)].sort_values('trade_date')
    feature.append(torch.tensor(data[feature_columns].values, dtype = torch.float32, device = device))
    label.append(torch.tensor(data[label_columns].values, dtype = torch.float32, device = device))

# Stack along a new asset axis: (rows, assets, columns).
feature = torch.stack(feature, dim = 1)
label = torch.stack(label, dim = 1)

feature.shape, label.shape

In [None]:
# Length of each input window, in rows.
seq_len = 30

# Slide a window of seq_len rows (step 1) along dim 0; unfold appends the window
# axis last, so swap the last two axes to put the window axis before the column axis.
windows = feature.unfold(0, seq_len, 1)
feature = windows.transpose(2, 3)

# Drop the first seq_len - 1 label rows so label i lines up with the last row of window i.
label = label[seq_len - 1:]

feature.shape, label.shape