In [2]:
import numpy as np
import pandas as pd

In [49]:
import tushare as ts
ts.set_token('5eba336d44874ed58899ff7c22fd4f9042413484239d77fc884faa4e')
pro = ts.pro_api()

# 沪深300
df_300 = pro.index_daily(ts_code='000300.SH',start_date='20100101', end_date='20210303')
df_SH = pro.index_daily(ts_code='000001.SH',start_date='20100101', end_date='20210303')

In [51]:
source_data=pd.merge(df_SH,df_300,on="trade_date")

In [52]:
source_data.columns

Index(['ts_code_x', 'trade_date', 'close_x', 'open_x', 'high_x', 'low_x',
       'pre_close_x', 'change_x', 'pct_chg_x', 'vol_x', 'amount_x',
       'ts_code_y', 'close_y', 'open_y', 'high_y', 'low_y', 'pre_close_y',
       'change_y', 'pct_chg_y', 'vol_y', 'amount_y'],
      dtype='object')

In [60]:
dealt=source_data.drop(columns=['ts_code_x','ts_code_y','pre_close_x','pre_close_y',
                                'change_x','change_y','amount_x','amount_y','open_x','open_y',
                                'high_x','high_y','low_x','low_y']).iloc[::-1]

In [62]:
dealt.to_csv("SH_SZ_300.csv", index=False, sep=',')

In [48]:
"""
trade_date 交易日
close 收盘点位
pct_chg 涨跌幅（%）
vol 成交量（手）
"""

'\ntrade_date 交易日\nclose 收盘点位\npct_chg 涨跌幅（%）\nvol 成交量（手）\n'

In [76]:
pd.read_csv("SH_SZ_300.csv")

Unnamed: 0,trade_date,close_x,pct_chg_x,vol_x,close_y,pct_chg_y,vol_y
0,20100104,3243.7600,-1.0185,109447927.0,3535.2290,-1.1314,66101080.0
1,20100105,3282.1790,1.1844,126115066.0,3564.0380,0.8149,85809641.0
2,20100106,3254.2150,-0.8520,123651384.0,3541.7270,-0.6260,78473125.0
3,20100107,3192.7760,-1.8880,128652827.0,3471.4560,-1.9841,80350037.0
4,20100108,3195.9970,0.1009,98377147.0,3480.1300,0.2499,60790253.0
...,...,...,...,...,...,...,...
2707,20210225,3585.0458,0.5883,366232299.0,5469.5584,0.5883,230736028.0
2708,20210226,3509.0804,-2.1190,333280067.0,5336.7609,-2.4279,212264543.0
2709,20210301,3551.3998,1.2060,315487526.0,5418.7837,1.5369,177463523.0
2710,20210302,3508.5912,-1.2054,339830486.0,5349.6301,-1.2762,196511115.0


In [31]:
data_set=pd.read_csv("SH_SZ_300.csv")

In [6]:
unit_size=60
unit_number=data_set.shape[0]-unit_size+1
data_column=['close_x','close_y','vol_x']

In [29]:
import torch
from torch.utils.data import Dataset

In [32]:
trans_data=TransformerData(data_set,60,7)

In [30]:
class TransformerData(Dataset):
    def __init__(self, data_set: pd.DataFrame, unit_size: int, predict_size: int):
        """
        :param data_set: data set for transformer
        :param unit_size: number of days for a unit
        """
        self.data_set = data_set
        self.unit_size = unit_size
        self.predict_size = predict_size
        self.unit_number = int(data_set.shape[0] - unit_size + 1)
        self.data_columns = []

        data_size = data_set.shape[-1]
        self.max_set = np.zeros((self.unit_number, data_size))
        self.min_set = np.zeros((self.unit_number, data_size))

        self.feature = np.zeros(1)
        self.en_x = self.de_x = self.de_y = []
        self.anti_feature = pd.DataFrame()

    def create_dataset(self, column: list):
        """
        create dataset for encoder input(en_x),decoder input(de_x) and decoder output(y)
        :param column: specified data name
        """
        self.data_set = self.data_set.get(column).values
        self.data_columns = column

        # create unit data
        source_data = []
        for unit_i in range(self.unit_number):
            source_data.append(self.data_set[unit_i:unit_i + self.unit_size])
        source_data = np.array(source_data).astype(float)

        # normalized data set
        norm_data, self.max_set, self.min_set = max_min_normalised(source_data)
        # create encoder and decoder input data set
        self.en_x = self.de_x = norm_data[:, :-self.predict_size]
        self.de_y = norm_data[:, self.predict_size:]

    def __len__(self):
        return len(self.en_x)

    def __getitem__(self, item: int):
        return self.en_x[item], self.de_x[item], self.de_y[item]

    def generate_feature(self, model):
        """
        generate feature by model
        :param model: trained model
        """
        # feature sequence, all feature data
        feature_seq = []
        group_num = int(self.unit_number / self.unit_size)

        for unit_i in range(group_num):
            tmp_en = torch.as_tensor(self.en_x[unit_i * self.unit_size], dtype=torch.float32)
            tmp_de = torch.as_tensor(self.de_x[unit_i * self.unit_size], dtype=torch.float32)

            for tmp_i in range(self.unit_size):
                tmp = model(tmp_en[np.newaxis, :, :], tmp_de[np.newaxis, :, :]).detach().numpy()
                feature_seq.append(tmp)

        self.feature = np.array(feature_seq)

    def generate_data(self):
        """
        according to feature,anti_normalized all data
        """
        generate_data = []
        group_num = int(self.unit_number / self.unit_size)

        for unit_i in range(group_num):
            feature_tmp = self.feature[unit_i * self.unit_size:(unit_i+1) * self.unit_size]
            generate_data.append(anti_max_min_normalised(feature_tmp, self.max_set[unit_i], self.min_set[unit_i]))

        self.anti_feature = pd.DataFrame(
            np.array(generate_data).reshape(-1, self.feature.shape[-1]),
            columns=self.data_columns
        )

In [33]:
def anti_max_min_normalised(norm_data: np.array, max_set: float, min_set: float):
    """
    Maximum and minimum anti_normalization
    :param norm_data: a unit data
    :param max_set: maximum data in norm_data
    :param min_set: minimum data in norm_data
    :return:
        anti_data: anti_normalised data
    """
    anti_data = []
    for col_i in range(norm_data.shape[0]):
        data_i = norm_data[col_i]
        data_i = data_i * min_set + max_set

        anti_data.append(data_i)
    return np.array(anti_data)


In [None]:
def max_min_normalised(data: np.array):
    """
    Maximum and minimum normalization
    :param data: data that needs to be normalized
    :return:
        max_set: maximum data in each unit
        min_set: maximum data in each unit
        normal_data: normalized data
    """
    unit_num = data.shape[0]
    
    max_set = np.zeros(unit_num,data.shape[-1])
    min_set = np.zeros(unit_num,data.shape[-1])
    normal_data = []

    for col_i in range(col_num):
        data_i = data[col_i]

        min_set[col_i] = min(data_i)
        data_i = data_i - min(data_i)
        max_set[col_i] = max(data_i)
        data_i = data_i / max(data_i)

        normal_data.append(data_i)

    return np.array(normal_data), max_set, min_set

In [39]:
test=np.arange(0,60).reshape(5,4,3)

In [40]:
max_min_normalised(test)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [49]:
np.max(test[0],axis=0)

array([ 9, 10, 11])

In [47]:
test[0]

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])