# Training

In [None]:
import tensorflow as tf
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
from alphanet import AlphaNetV3
from alphanet.data import TimeSeriesData, TrainValData
from alphanet.metrics import UpDownAccuracy

## Data Processing

In [None]:
# Load the data; the stock code and short name columns are read as categoricals.
csi = pd.read_csv("./data/CSI500.zip", dtype={"代码": "category",
                                              "简称": "category"})
csi.drop(columns=["简称"], inplace=True)
# Take an explicit copy of the filtered rows: the feature columns added below
# must be written to an independent frame, not to a slice of the frame that
# was read (this is what triggers pandas' SettingWithCopyWarning).
csi = csi.loc[csi["日期"] >= 20110131, :].copy()

# Additional ratio features
csi["close/free_turn"] = csi["收盘价(元)"] / csi["换手率(基准.自由流通股本)"]
csi["open/turn"] = csi["开盘价(元)"] / csi["换手率(%)"]
csi["volume/low"] = csi["成交量(股)"] / csi["最低价(元)"]
csi["vwap/high"] = csi["均价"] / csi["最高价(元)"]
csi["low/high"] = csi["最低价(元)"] / csi["最高价(元)"]
csi["vwap/close"] = csi["均价"] / csi["收盘价(元)"]

# Compute the ten-trading-day return
trading_dates = csi["日期"].unique()
trading_dates.sort()
# Maps every trading date to the trading date ten positions earlier in the
# sorted calendar; the first ten dates have no entry.
dates_shift_dictionary = dict(zip(trading_dates[10:], trading_dates[:-10]))
csi_slice = csi[["代码", "日期", "收盘价(元)"]].copy()
csi_slice_date_shift = csi[["代码", "日期", "收盘价(元)"]].copy()
# Re-date each close price to the date ten trading days before it, so that it
# lines up with the row it is the "future" price of. Dates without a mapping
# become NaN and are removed by the dropna below.
csi_slice_date_shift["日期"] = (
    csi_slice_date_shift["日期"].map(dates_shift_dictionary)
)
csi_slice_date_shift.rename(columns={"收盘价(元)": "10交易日后收盘价(元)"},
                            inplace=True)
csi_slice_date_shift.dropna(inplace=True)
# The NaNs introduced by the mapping change the column's dtype; cast it back
# to the dtype of the original date column so both merge keys match exactly.
csi_slice_date_shift["日期"] = (
    csi_slice_date_shift["日期"].astype(csi["日期"].dtype)
)
csi_slice = csi_slice.merge(csi_slice_date_shift,
                            how="inner",
                            left_on=["代码", "日期"],
                            right_on=["代码", "日期"])
close_price = csi_slice["收盘价(元)"]
future_close_price = csi_slice["10交易日后收盘价(元)"]
csi_slice["10日回报率"] = future_close_price / close_price - 1
csi_slice.drop(columns=["收盘价(元)", "10交易日后收盘价(元)"], inplace=True)
# Put the label frame on the left so the merged columns start with
# code, date, ten-day return, followed by the remaining columns of csi.
csi = csi_slice.merge(csi,
                      how="inner",
                      left_on=["代码", "日期"],
                      right_on=["代码", "日期"])

## Build TimeSeriesData

In [None]:
# Build one TimeSeriesData per stock-code category. Columns from position 3
# onward are passed as the data; the "10日回报率" column is the label.
codes = csi["代码"].cat.categories
stock_data_list = []
for code in codes:
    is_code = csi["代码"] == code
    table_part = csi[is_code]
    stock_series = TimeSeriesData(
        labels=table_part["10日回报率"].values,
        data=table_part.iloc[:, 3:].values,
        dates=table_part["日期"].values,
    )
    stock_data_list.append(stock_series)


## Define Training Functions

In [None]:
def try_mkdirs(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling this on a directory that already exists is a no-op.

    ``exist_ok=True`` is used instead of an ``os.path.exists`` check followed
    by ``os.makedirs`` so that the directory appearing between the check and
    the creation cannot raise.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Make sure the root output folder for trained models exists.
try_mkdirs("./models")

# Producer of training/validation datasets built from the per-stock series;
# do_training() asks it for the window starting at a given date.
data_producer = TrainValData(
    time_series_list=stock_data_list,
    history_length=30,
    sample_step=2,
    train_length=1200,
    validate_length=300,
)


def print_dates_info(dates_info):
    """Print the overall, training and validation date ranges of a run.

    Args:
        dates_info: mapping with "training" and "validation" entries, each
            of which provides "start_date" and "end_date".
    """
    training = dates_info["training"]
    validation = dates_info["validation"]
    template = ("Start_training for period {} to {}:\n"
                "\t the training set is {}:{} (inclusive)\n"
                "\t the validation set is {}:{} (inclusive)")
    # The overall period runs from the training start to the validation end.
    message = template.format(training["start_date"],
                              validation["end_date"],
                              training["start_date"],
                              training["end_date"],
                              validation["start_date"],
                              validation["end_date"])
    print(message)


def do_training(beginning_date,
                training_id,
                epochs=100,
                batch_size=500,
                early_stopping_patience=10):
    """Train one AlphaNetV3 model on the window starting at ``beginning_date``.

    Everything produced by the run is written under
    ``./models/<training_id>/<beginning_date>/``: ``dates_info.json``, one
    weights file per epoch, ``best.hdf5`` (the network's weights after
    fitting) and ``history.json``.

    Args:
        beginning_date: start date passed to ``data_producer.get``; also
            used (zero-padded to 8 digits) as the output folder name.
        training_id: integer id of the repetition; used (zero-padded to
            2 digits) as the parent folder name.
        epochs: maximum number of epochs passed to ``fit``.
        batch_size: batch size applied to both datasets.
        early_stopping_patience: patience of the early-stopping callback.

    Returns:
        ``(history, net, dates_info)`` on success, or ``None`` when
        obtaining the data raised ``ValueError``.
    """
    # Obtain the datasets for this window; a ValueError is reported and
    # turned into a None result rather than propagated.
    try:
        train, val, dates_info = data_producer.get(beginning_date)
    except ValueError as err:
        print(beginning_date, err)
        return None

    print_dates_info(dates_info)

    # Build the network together with the metrics tracked during training.
    tracked_metrics = [tf.keras.metrics.RootMeanSquaredError(),
                       UpDownAccuracy()]
    net = AlphaNetV3(metrics=tracked_metrics)
    model = net.model()

    # Output folder for this (training_id, beginning_date) pair.
    folder_path = f"./models/{training_id:02d}/{beginning_date:08d}/"
    try_mkdirs(folder_path)

    # Record the date ranges used for this run.
    with open(folder_path + "dates_info.json", "w") as dates_file:
        json.dump(dates_info, dates_file)

    # Callbacks: stop early on val_loss, and save the weights of every epoch
    # into a file named after the epoch number and its validation loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=early_stopping_patience,
        restore_best_weights=True,
    )
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=folder_path + "{epoch:04d}-{val_loss:.6f}.hdf5",
        save_freq="epoch",
        save_weights_only=True,
    )

    training_set = train.batch(batch_size).cache()
    validation_set = val.batch(batch_size).cache()
    history = model.fit(training_set,
                        validation_data=validation_set,
                        epochs=epochs,
                        callbacks=[early_stopping, checkpoint])

    # Save the weights the network holds after fitting.
    net.save_weights(folder_path + "best.hdf5")

    # Persist the per-epoch training history.
    with open(folder_path + "history.json", "w") as history_file:
        json.dump(history.history, history_file)

    return history, net, dates_info

## Training

In [None]:
# Start dates of the rolling training windows.
ROLLING_BEGINNING_LIST = [
    20110131, 20110731,
    20120131, 20120731,
    20130131, 20130731,
    20140131, 20140731,
    20150131, 20150731,
]

# Ids of the repeated training runs.
TRAINING_ID = list(range(10))

# Huatai originally trained each model ten times and averaged the results.
for tid in TRAINING_ID:
    for beginning in ROLLING_BEGINNING_LIST:
        returned = do_training(beginning_date=beginning, training_id=tid)
        if returned is None:
            # do_training could not obtain data for this start date.
            continue
        history = returned[0]
        # Plot the per-epoch curves of this run, titled by its start date.
        pd.DataFrame(history.history).plot()
        plt.title(str(beginning))
        plt.show()