# Training

In [14]:
import tensorflow as tf
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
from alphanet import AlphaNetV3
from alphanet.data import TimeSeriesData, TrainValData
from alphanet.metrics import UpDownAccuracy

## Data Processing

train_val_gap参数为验证集第一天与训练集最后一天中间间隔的天数，
如果是相邻，则train_val_gap = 0。设置该参数的目的如下：
如果希望预测未来十天的累计收益，则预测时用到的输入数据为最近的历史数据来预测
未来十天的累计收益，即用t(-history)到t(0)的数据来预测t(1)到t(11)的累计收益
而训练时因为要用到十天累计收益做标签，最近的一个十天累计收益是从t(-10)到t(0)，
用到的历史数据则必须是t(-history-11)到t(-11)的数据，为了确保validation的
有效性，则最好将validation的第一个数据位置与train的最后一个数据位置在时间上
相差11天，即间隔10天，因此使用train_val_gap=10。

时间为t(0)的每个样本数据的历史数据时间范围为t(-30)至t(0)，即用到了当天的收盘数据。
标签数据为 p(11) / p(1) - 1，即往后11天的收盘价除以明天的收盘价减去1。
不用 p(10) / p(0) - 1 的原因是，当天收盘时未做预测，不能以当天收盘价购入。

In [15]:
# Load the data; the stock code and short-name columns are read as categoricals.
csi = pd.read_csv("./data/CSI500.zip", dtype={"代码": "category",
                                              "简称": "category"})
csi = csi.drop(columns=["简称"])
# Keep only rows dated 2011-01-31 or later (dates are YYYYMMDD integers).
csi = csi.loc[csi["日期"] >= 20110131, :]

# Engineered ratio features
csi["close/free_turn"] = csi["收盘价(元)"] / csi["换手率(基准.自由流通股本)"]
csi["open/turn"] = csi["开盘价(元)"] / csi["换手率(%)"]
csi["volume/low"] = csi["成交量(股)"] / csi["最低价(元)"]
csi["vwap/high"] = csi["均价"] / csi["最高价(元)"]
csi["low/high"] = csi["最低价(元)"] / csi["最高价(元)"]
csi["vwap/close"] = csi["均价"] / csi["收盘价(元)"]

# Compute the return from t(1) to t(11)
trading_dates = csi["日期"].unique()
trading_dates.sort()
# Each dictionary maps a trading date to the trading date 1 (resp. 11) sessions
# earlier. Re-keying a close price through it attaches that price to the
# earlier row, i.e. the row for which it is the "future" close.
dates_shift_dictionary_1 = dict(zip(trading_dates[1:], trading_dates[:-1]))
dates_shift_dictionary_11 = dict(zip(trading_dates[11:], trading_dates[:-11]))
csi_slice = csi[["代码", "日期", "收盘价(元)"]].copy()
csi_slice_date_shift_1 = csi_slice.copy()
csi_slice_date_shift_11 = csi_slice.copy()
# Each shifted frame re-keys its *own* date column, so the result does not
# depend on the order of these two statements.
csi_slice_date_shift_1["日期"] = csi_slice_date_shift_1["日期"]\
    .map(lambda x: dates_shift_dictionary_1.get(x, None))
csi_slice_date_shift_11["日期"] = csi_slice_date_shift_11["日期"]\
    .map(lambda x: dates_shift_dictionary_11.get(x, None))
# Dates without an earlier counterpart map to a missing value and are dropped
# here, together with any row that has a missing close price.
csi_slice_date_shift_11 = (
    csi_slice_date_shift_11
    .rename(columns={"收盘价(元)": "11交易日后收盘价(元)"})
    .dropna()
)
csi_slice_date_shift_1 = (
    csi_slice_date_shift_1
    .rename(columns={"收盘价(元)": "1交易日后收盘价(元)"})
    .dropna()
)
# Inner joins keep only (code, date) rows that have both future closes.
csi_slice = csi_slice.merge(csi_slice_date_shift_1,
                            how="inner",
                            on=["代码", "日期"])
csi_slice = csi_slice.merge(csi_slice_date_shift_11,
                            how="inner",
                            on=["代码", "日期"])
future_close_price_1 = csi_slice["1交易日后收盘价(元)"]
future_close_price_11 = csi_slice["11交易日后收盘价(元)"]
# Label: p(11) / p(1) - 1
csi_slice["10日回报率"] = future_close_price_11 / future_close_price_1 - 1
csi_slice = csi_slice.drop(
    columns=["收盘价(元)", "11交易日后收盘价(元)", "1交易日后收盘价(元)"])
# Put code, date and label first, followed by the original and engineered
# feature columns.
csi = csi_slice.merge(csi,
                      how="inner",
                      on=["代码", "日期"])

In [16]:
# Preview the merged table: code, date and the 10-day return label come first,
# followed by the raw and engineered feature columns.
csi.head()

Unnamed: 0,代码,日期,10日回报率,收盘价(元),最高价(元),最低价(元),开盘价(元),涨跌幅(%),换手率(%),成交量(股),换手率(基准.自由流通股本),均价,close/free_turn,open/turn,volume/low,vwap/high,low/high,vwap/close
0,000723.SZ,20110131,0.029113,25.4634,25.5495,24.8087,24.8087,1.931,1.1371,1042874.0,1.3413,25.300419,18.98412,21.817518,42036.624249,0.990251,0.971005,0.993599
1,000723.SZ,20110201,0.087316,25.4461,25.6873,25.1705,25.4634,-0.0677,0.7551,692549.0,0.8907,25.520747,28.568654,33.721891,27514.31239,0.993516,0.979881,1.002934
2,000723.SZ,20110209,0.077596,24.8604,25.36,24.2229,24.2229,-2.302,0.7378,676673.0,0.8703,25.072293,28.565322,32.831255,27935.25961,0.988655,0.955162,1.008523
3,000723.SZ,20110210,0.099998,25.5323,25.6701,24.5158,24.533,2.7027,1.0751,985959.0,1.2681,25.238911,20.134295,22.819273,40217.288443,0.983203,0.955033,0.988509
4,000723.SZ,20110211,0.062422,25.8424,25.9113,25.36,25.5323,1.2146,1.1393,1044835.0,1.3438,25.786233,19.230838,22.410515,41200.118297,0.995173,0.978724,0.997827


## Build TimeSeriesData

In [None]:
# One TimeSeriesData per stock code. Every category of the code column is
# visited, and the feature matrix is taken from the fourth column onwards.
codes = csi["代码"].cat.categories
per_stock_tables = (csi.loc[csi["代码"] == code, :] for code in codes)
stock_data_list = [
    TimeSeriesData(dates=part["日期"].values,
                   data=part.iloc[:, 3:].values,
                   labels=part["10日回报率"].values)
    for part in per_stock_tables
]


## Define Training Functions

In [None]:
def try_mkdirs(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling it on a directory that already exists is a no-op. The creation is
    delegated to ``os.makedirs(..., exist_ok=True)`` so there is no window
    between an existence check and the creation in which another process could
    create the folder and make this call fail.

    Args:
        path: directory path to create.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Root folder under which every trained model is stored.
try_mkdirs("./models")

# Settings of the rolling train/validation windows.
# history_length=30 and train_val_gap=10 follow the "Data Processing" notes
# at the top of this notebook.
# NOTE(review): train_length / validate_length / sample_step are presumably
# counted in trading days -- confirm against the TrainValData documentation.
WINDOW_SETTINGS = dict(
    train_length=1200,
    validate_length=300,
    history_length=30,
    sample_step=2,
    train_val_gap=10,
)

data_producer = TrainValData(time_series_list=stock_data_list,
                             **WINDOW_SETTINGS)


def print_dates_info(dates_info):
    """Print the date ranges of one training run.

    Args:
        dates_info: mapping with a ``"training"`` and a ``"validation"`` entry,
            each of which holds a ``"start_date"`` and an ``"end_date"``.
    """
    training = dates_info["training"]
    validation = dates_info["validation"]

    template = ("Start_training for period {} to {}:\n"
                "\t the training set is {}:{} (inclusive)\n"
                "\t the validation set is {}:{} (inclusive)")

    # The overall period runs from the first training day to the last
    # validation day.
    message = template.format(training["start_date"],
                              validation["end_date"],
                              training["start_date"],
                              training["end_date"],
                              validation["start_date"],
                              validation["end_date"])
    print(message)


def do_training(beginning_date,
                training_id,
                epochs=100,
                batch_size=500,
                early_stopping_patience=30):
    """Train one AlphaNetV3 model for the window starting at ``beginning_date``.

    Everything produced by the run is written under
    ``./models/<training_id>/<beginning_date>/``: ``dates_info.json``, one
    weights file per epoch (``<epoch>-<val_loss>.hdf5``), ``best.hdf5`` and
    ``history.json``.

    Args:
        beginning_date: integer date (formatted with ``08d``, e.g. 20110131)
            passed to ``data_producer.get``.
        training_id: integer id of the repeated run (formatted with ``02d``);
            used as the first-level folder name.
        epochs: maximum number of epochs passed to ``fit``.
        batch_size: batch size applied to both the training and the
            validation dataset.
        early_stopping_patience: ``patience`` of the ``EarlyStopping``
            callback monitoring ``val_loss``.

    Returns:
        ``(history, net, dates_info)`` where ``history`` is the object returned
        by ``fit``, ``net`` the ``AlphaNetV3`` instance and ``dates_info`` the
        date ranges returned by ``data_producer.get``. ``None`` is returned
        (after printing the error) when ``data_producer.get`` raises
        ``ValueError``.
    """

    # get training data starting from beginning_date
    # NOTE(review): ValueError presumably signals that no full window is
    # available from this date -- confirm against TrainValData.get.
    try:
        train, val, dates_info = data_producer.get(beginning_date)
    except ValueError as e:
        print(beginning_date, e)
        return None

    # print dates information
    print_dates_info(dates_info)

    # build model
    net = AlphaNetV3(metrics=[tf.keras.metrics.RootMeanSquaredError(),
                              UpDownAccuracy()])
    m = net.model()

    # create a folder to save information for this model
    path_str = "./models/{training_id:02d}/{beginning_date:08d}/"
    folder_path = path_str.format(training_id=training_id,
                                  beginning_date=beginning_date)
    try_mkdirs(folder_path)

    # write dates information
    json_path = folder_path + "dates_info.json"
    with open(json_path, "w") as fp:
        json.dump(dates_info, fp)

    # save model weights per epoch in folders
    file_path = folder_path + "{epoch:04d}-{val_loss:.6f}.hdf5"
    ckp = tf.keras.callbacks.ModelCheckpoint(filepath=file_path,
                                             save_freq="epoch",
                                             save_weights_only=True)

    # early stopping
    es = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                          patience=early_stopping_patience,
                                          restore_best_weights=True)

    # fit model
    # Keras runs callbacks in list order. The checkpoint goes first so that
    # each epoch's own weights are written before EarlyStopping gets a chance
    # to put the best weights back into the model on the stopping epoch;
    # otherwise the last per-epoch file could hold the restored best weights
    # under the wrong epoch / val_loss name.
    h = m.fit(train.batch(batch_size).cache(),
              validation_data=val.batch(batch_size).cache(),
              epochs=epochs,
              callbacks=[ckp, es])

    # save weights of the best model
    # NOTE(review): this relies on restore_best_weights; depending on the
    # TensorFlow version the best weights may only be restored when training
    # was actually stopped early -- confirm for the version in use.
    net.save_weights(folder_path + "best.hdf5")

    # write history
    history_path = folder_path + "history.json"
    with open(history_path, "w") as fp:
        json.dump(h.history, fp)

    return h, net, dates_info

## Training

In [None]:
# First day (YYYYMMDD) of each rolling training window, half a year apart.
ROLLING_BEGINNING_LIST = [
    20110131, 20110731,
    20120131, 20120731,
    20130131, 20130731,
    20140131, 20140731,
    20150131, 20150731,
]

# Ids 0..9, one per repeated run of every window.
TRAINING_ID = list(range(10))

# The original Huatai study trained each model ten times and averaged the results.
for tid in TRAINING_ID:
    for beginning in ROLLING_BEGINNING_LIST:
        returned = do_training(beginning_date=beginning,
                               training_id=tid)
        # do_training returns None when no data could be produced for this date.
        if not returned:
            continue
        history = returned[0]
        # Plot the per-epoch metrics of this run, titled with the start date.
        pd.DataFrame(history.history).plot()
        plt.title(f"{beginning}")
        plt.show()