In [2]:
# -*- coding: utf-8 -*-

from __context__ import *
from __future__ import print_function, unicode_literals, division

import pandas as pd
import numpy as np
import time
from datetime import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorlayer as tl
from IPython.core.display import clear_output

sess = None

In [3]:
def get_data_delta(df, lookforward=5):
    op, hi, lo, cl, pre_cl = df[['open', 'high', 'low', 'new_price', 'pre_close']].values.T
    d_arr = np.stack([op, hi, lo, cl], 1)
    d_arr = ((d_arr/pre_cl.reshape(-1, 1) - 1) * 1000).astype(int) + 100
    d_oc = (d_arr[:, 3] - d_arr[:, 0]).reshape(-1, 1)
    expr = np.full(d_oc.shape, np.nan)
    expr[:-lookforward, 0] = cl[lookforward:] / cl[:-lookforward] - 1
    out_df = pd.DataFrame(np.hstack([d_arr, d_oc]), index=df.index, 
                          columns=['d_open', 'd_high', 'd_low', 'd_close', 'd_oc'])
    out_df.loc[:, 'expr03'] = expr
    return out_df.dropna()

def compile_data(data_d):
    channel_1 = np.zeros(shape=(data_d.shape[0], 201))
    channel_2 = np.zeros(shape=(data_d.shape[0], 201))
    idx_begin = np.arange(data_d.shape[0]) * 201

    lb1 = data_d[['d_open', 'd_close']].min(1).values
    ub1 = data_d[['d_open', 'd_close']].max(1).values + 1
    channel_1 = np.zeros(shape=(data_d.shape[0], 201))
    idx_to_fill_chan1 = [np.arange(begi, endi) for begi, endi in zip(idx_begin + lb1, idx_begin + ub1)]
    np.put(channel_1, np.hstack(idx_to_fill_chan1), 1)

    lb2 = data_d['d_low'].values
    ub2 = data_d['d_high'].values + 1
    channel_2 = np.zeros(shape=(data_d.shape[0], 201))
    idx_to_fill_chan2 = [np.arange(begi, endi) for begi, endi in zip(idx_begin + lb2, idx_begin + ub2)]
    np.put(channel_2, np.hstack(idx_to_fill_chan2), 1)

    features = np.stack((channel_1, channel_2), axis=-1)
    # print('feature shape', features.shape)
    return features

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def get_data_batch_2d(features, ys, time_step):
    features_T = features.transpose((1, 2, 0))
    features_T_rol = rolling_window(features_T, time_step)
    data_x = features_T_rol.transpose(2, 3, 0, 1)
    data_y = ys[time_step-1:]
    return np.array(data_x), np.array(data_y)

def get_training_data(quota, batch_size=100, period_step=20, predict_period=5):
    data2 = quota.copy()
    predict_data2 = get_data_delta(data2, lookforward=predict_period)

    predict_features2 = compile_data(predict_data2)
    _ys2 = predict_data2.loc[:, 'expr03'].values
    predict_x2, predict_dump2 = get_data_batch_2d(predict_features2, _ys2, period_step)
    rest = batch_size - (predict_x2.shape[0] % batch_size)
    # print('predict_x:', predict_x2.shape)

    predict_x2 = np.concatenate([predict_x2, predict_x2[:rest]])
    predict_dump2 = np.concatenate([predict_dump2, predict_dump2[:rest]])
    return predict_x2, predict_dump2

## 构建数据

## 获取行情

In [137]:
begdate = 20080601
enddate = 20180601
code = '000002.SZ'
predict_period=5

DB = gcrl.db('internal')
DB.query_init({'code': code, 'begin': begdate, 'end': enddate, 'fields': '*'})
DB.query.update({'dtype':'cycle', 'n_cycle': 1, 'freq': pycoraldb.D})
rawdf = DB.getBar(4)
rawdf['close'] = rawdf['new_price'].copy()
# rawdf = db.getBar(code, 20180501, 20180601).toDataFrame()
# gclean = gfc.clean_rdata(rawdf)
# clean_df = gclean.remove_by_time('stock')

In [487]:
def compile_data(rawdf, window=3, lookforward=1, verify=True):
    # get X
    df = rawdf.loc[rawdf['status']==0, ['open', 'high', 'low', 'new_price', 'new_volume']].dropna()
    delta = df.pct_change().dropna().values
    # delta = delta[~np.isinf(delta).any(1)] * 10000 + 10000   
    X = (10000 * (delta + 1)).astype(int)
    # get y
    rtn = rawdf.loc[rawdf['status']==0, 'new_price'].pct_change(lookforward).dropna().values
    y = (10000 * (rtn + 1)).astype(int)
    # Generat snapshots with targets
    X_snaps = rolling_window(X.T, window).transpose(1, 2, 0)
    X_snaps = X_snaps.reshape(X_snaps.shape+(1,))[1:]
    y_target = y[window:]
    if verify:
        print('Before rolling: ', X.shape, y.shape)
        assert X.shape[0] == y.shape[0]
        print('After rolling: ', X_snaps.shape, y_target.shape)
        assert X_snaps.shape[0] == y_target.shape[0]
    return X_snaps, y_target

In [488]:
def batch_trimmer(X, y, batch_size=100):
    assert X.shape[0] == y.shape[0]
    trim_n = (X.shape[0] // batch_size) * batch_size
    return X[:trim_n], y[:trim_n]

In [489]:
data_x, data_y = compile_data(rawdf, window=3, lookforward=1, verify=False)
data_x, data_y = batch_trimmer(data_x, data_y, 100)

In [490]:
training_count = int(data_x.shape[0] * 0.8)
test_count = data_x.shape[0] - training_count

training_x = data_x[:training_count]
training_y = data_y[:training_count]
test_x = data_x[training_count:]
test_y = data_y[training_count:]

print('训练集x', training_x.shape)
print('测试集x', test_x.shape)
print('总共x', data_x.shape[0])
assert(training_x.shape[0] + test_x.shape[0] == data_x.shape[0])

训练集x (1760, 3, 5, 1)
测试集x (440, 3, 5, 1)
总共x 2200


In [491]:
X_train, y_train, X_test, y_test  = training_x.copy(), training_y.copy(), test_x.copy(), test_y.copy()

In [492]:
y_train = np.where(y_train > 10500, 1, np.where(y_train < 9700, 2, 0)).astype(int)
y_test = np.where(y_test > 10500, 1, np.where(y_test < 9700, 2, 0)).astype(int)

In [493]:
y_test[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [516]:
def model(x, is_train=True, reuse=False):
    # In BNN, all the layers inputs are binary, with the exception of the first layer.
    # ref: https://github.com/itayhubara/BinaryNet.tf/blob/master/models/BNN_cifar10.py
    with tf.variable_scope("binarynet", reuse=reuse):
        
        net = tl.layers.InputLayer(x, name='input')
 
        net = tl.layers.Conv2d(net, 32, (2, 2), (1, 1), act=tf.nn.relu, padding='SAME', name='cnn2d')
        # net = tl.layers.MaxPool2d(net, (2, 2), (2, 2), padding='SAME', name='pool0')
        net = tl.layers.BatchNormLayer(net, act=tl.act.htanh, is_train=is_train, name='bn0')
        
        net = tl.layers.BinaryConv2d(net, 32, (2, 2), (1, 1),
                                     padding='SAME', b_init=None, name='bcnn1')
        net = tl.layers.MaxPool2d(net, (2, 2), (2, 2), padding='SAME', name='pool1')
        net = tl.layers.BatchNormLayer(net, act=tl.act.htanh, is_train=is_train, name='bn1')

        net = tl.layers.BinaryConv2d(net, 64, (2, 2), (1, 1),
                                     padding='SAME', b_init=None, name='bcnn2')
        net = tl.layers.MaxPool2d(net, (2, 2), (2, 2), padding='SAME', name='pool2')
        net = tl.layers.BatchNormLayer(net, act=tl.act.htanh, is_train=is_train, name='bn2')

        net = tl.layers.FlattenLayer(net)
        # net = tl.layers.DropoutLayer(net, 0.8, True, is_train, name='drop1')
        # net = tl.layers.SignLayer(net)
        net = tl.layers.BinaryDenseLayer(net, 5, b_init=None, name='dense')
        net = tl.layers.BatchNormLayer(net, act=tl.act.htanh, is_train=is_train, name='bn3')

        # net = tl.layers.DropoutLayer(net, 0.8, True, is_train, name='drop2')
        # net = tl.layers.SignLayer(net)
        net = tl.layers.BinaryDenseLayer(net, 3, b_init=None, name='bout')
        net = tl.layers.BatchNormLayer(net, act=tf.nn.sigmoid, is_train=is_train, name='bno')
    return net

In [517]:
sess = None
if sess == None:
    sess = tf.InteractiveSession()
elif sess._closed == False:
    tf.reset_default_graph()
    sess.close()
    sess = tf.InteractiveSession()



In [518]:
tf.reset_default_graph()
tf.InteractiveSession.close(sess)
sess.close()
sess = tf.InteractiveSession()



In [519]:
# define inferences
batch_size = 100
x = tf.placeholder(tf.float32, shape=[batch_size, 3, 5, 1])
y_ = tf.placeholder(tf.int64, shape=[batch_size])
net_train = model(x, is_train=True, reuse=False)
net_test = model(x, is_train=False, reuse=True)

[TL] InputLayer  binarynet/input: (100, 3, 5, 1)
[TL] Conv2d binarynet/cnn2d: n_filter:32 filter_size:(2, 2) strides:(1, 1) pad:SAME act:relu
[TL] BatchNormLayer bn0: decay:0.900000 epsilon:0.000010 act:hard_tanh is_train:True
[TL] BinaryConv2d bcnn1: n_filter:32 filter_size:(2, 2) strides:(1, 1) pad:SAME act:identity
[TL] MaxPool2d pool1: filter_size:(2, 2) strides:(2, 2) padding:SAME
[TL] BatchNormLayer bn1: decay:0.900000 epsilon:0.000010 act:hard_tanh is_train:True
[TL] BinaryConv2d bcnn2: n_filter:64 filter_size:(2, 2) strides:(1, 1) pad:SAME act:identity
[TL] MaxPool2d pool2: filter_size:(2, 2) strides:(2, 2) padding:SAME
[TL] BatchNormLayer bn2: decay:0.900000 epsilon:0.000010 act:hard_tanh is_train:True
[TL] FlattenLayer binarynet/flatten: 128
[TL] BinaryDenseLayer  dense: 5 identity
[TL] BatchNormLayer bn3: decay:0.900000 epsilon:0.000010 act:hard_tanh is_train:True
[TL] BinaryDenseLayer  bout: 3 identity
[TL] BatchNormLayer bno: decay:0.900000 epsilon:0.000010 act:sigmoid is_

In [520]:
# cost for training
y = net_train.outputs
cost = tl.cost.cross_entropy(y, y_, name='xentropy')

# cost and accuracy for evalution
y2 = net_test.outputs
prediction = tf.argmax(y2, 1)

cost_test = tl.cost.cross_entropy(y2, y_, name='xentropy2')
correct_prediction = tf.equal(tf.argmax(y2, 1), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# define the optimizer
train_params = tl.layers.get_variables_with_name('binarynet', True, True)
# train_op = tl.optimizers.AMSGrad(learning_rate=0.0001).minimize(cost, var_list=train_params)
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost, var_list=train_params)

# initialize all variables in the session
tl.layers.initialize_global_variables(sess)

[TL]   [*] geting variables with binarynet
[TL]   got   0: binarynet/cnn2d/kernel:0   (2, 2, 1, 32)
[TL]   got   1: binarynet/cnn2d/bias:0   (32,)
[TL]   got   2: binarynet/bn0/beta:0   (32,)
[TL]   got   3: binarynet/bn0/gamma:0   (32,)
[TL]   got   4: binarynet/bcnn1/W_conv2d:0   (2, 2, 32, 32)
[TL]   got   5: binarynet/bn1/beta:0   (32,)
[TL]   got   6: binarynet/bn1/gamma:0   (32,)
[TL]   got   7: binarynet/bcnn2/W_conv2d:0   (2, 2, 32, 64)
[TL]   got   8: binarynet/bn2/beta:0   (64,)
[TL]   got   9: binarynet/bn2/gamma:0   (64,)
[TL]   got  10: binarynet/dense/W:0   (128, 5)
[TL]   got  11: binarynet/bn3/beta:0   (5,)
[TL]   got  12: binarynet/bn3/gamma:0   (5,)
[TL]   got  13: binarynet/bout/W:0   (5, 3)
[TL]   got  14: binarynet/bno/beta:0   (3,)
[TL]   got  15: binarynet/bno/gamma:0   (3,)


In [521]:
# net_train.print_params()
# net_train.print_layers()

[TL]   param   0: binarynet/cnn2d/kernel:0 (2, 2, 1, 32)      float32_ref (mean: 0.0016860561445355415, median: 0.001863574841991067, std: 0.01633264683187008)   
[TL]   param   1: binarynet/cnn2d/bias:0 (32,)              float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
[TL]   param   2: binarynet/bn0/beta:0 (32,)              float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
[TL]   param   3: binarynet/bn0/gamma:0 (32,)              float32_ref (mean: 0.9993168711662292, median: 0.9993283152580261, std: 0.0022547768894582987)   
[TL]   param   4: binarynet/bn0/moving_mean:0 (32,)              float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
[TL]   param   5: binarynet/bn0/moving_variance:0 (32,)              float32_ref (mean: 1.0               , median: 1.0               , std: 0.0               )   
[TL]   param   6: binarynet/bcnn1/Sign:0 (2, 2, 

In [523]:
n_epoch = 10
seq_l = 5
bas = batch_size // seq_l
for epoch in range(n_epoch):
    start_time = time.time()
    for X_train_a, y_train_a in tl.iterate.seq_minibatches(X_train, y_train, batch_size=bas, seq_length=seq_l):
        sess.run(train_op, feed_dict={x: X_train_a, y_: y_train_a})

    if True:
        print("Epoch %d of %d took %fs" % (epoch + 1, n_epoch, time.time() - start_time))
        train_loss, train_acc, n_batch = 0, 0, 0
        for X_train_a, y_train_a in tl.iterate.minibatches(X_train, y_train, batch_size, shuffle=True):
            err, ac = sess.run([cost_test, acc], feed_dict={x: X_train_a, y_: y_train_a})
            train_loss += err
            train_acc += ac
            n_batch += 1
        print("   train loss: %f" % (train_loss / n_batch))
        print("   train acc: %f" % (train_acc / n_batch))
        
        val_loss, val_acc, n_batch = 0, 0, 0
#         for X_val_a, y_val_a in tl.iterate.minibatches(X_val, y_val, batch_size, shuffle=True):
#             err, ac = sess.run([cost_test, acc], feed_dict={x: X_val_a, y_: y_val_a})
#             val_loss += err
#             val_acc += ac
#             n_batch += 1
#         print("   val loss: %f" % (val_loss / n_batch))
#         print("   val acc: %f" % (val_acc / n_batch))

        print('Evaluation')
        test_loss, test_acc, n_batch = 0, 0, 0
        for X_test_a, y_test_a in tl.iterate.minibatches(X_test, y_test, batch_size, shuffle=True):
            err, ac = sess.run([cost_test, acc], feed_dict={x: X_test_a, y_: y_test_a})
            test_loss += err
            test_acc += ac
            n_batch += 1
        print("   test loss: %f" % (test_loss / n_batch))
        print("   test acc: %f" % (test_acc / n_batch))
#         clear_output()


print('Evaluation')
test_loss, test_acc, n_batch = 0, 0, 0
for X_test_a, y_test_a in tl.iterate.minibatches(X_test, y_test, batch_size, shuffle=True):
    err, ac = sess.run([cost_test, acc], feed_dict={x: X_test_a, y_: y_test_a})
    test_loss += err
    test_acc += ac
    n_batch += 1

print("   test loss: %f" % (test_loss / n_batch))
print("   test acc: %f" % (test_acc / n_batch))

Epoch 1 of 10 took 1.443077s
   train loss: 1.090999
   train acc: 0.408824
Evaluation
   test loss: 1.091313
   test acc: 0.410000
Epoch 2 of 10 took 1.317194s
   train loss: 1.093207
   train acc: 0.277059
Evaluation
   test loss: 1.094958
   test acc: 0.282500
Epoch 3 of 10 took 1.397819s
   train loss: 1.045633
   train acc: 0.546471
Evaluation
   test loss: 1.051365
   test acc: 0.522500
Epoch 4 of 10 took 1.264396s
   train loss: 1.042805
   train acc: 0.614706
Evaluation
   test loss: 1.038184
   test acc: 0.620000
Epoch 5 of 10 took 1.387047s
   train loss: 1.086467
   train acc: 0.388824
Evaluation
   test loss: 1.084534
   test acc: 0.402500
Epoch 6 of 10 took 1.158743s
   train loss: 1.046477
   train acc: 0.543529
Evaluation
   test loss: 1.046219
   test acc: 0.562500
Epoch 7 of 10 took 1.359743s
   train loss: 1.060904
   train acc: 0.542941
Evaluation
   test loss: 1.054475
   test acc: 0.560000
Epoch 8 of 10 took 1.211658s
   train loss: 1.041197
   train acc: 0.659412


In [524]:
# 预测与实际图对比
#fig, axs = plt.subplots(1,1,figsize = (25,8))

# predict_x, predict_dump = compile_data(rawdf, 3, 1, True)
# predict_x, predict_dump = batch_trimmer(predict_x, predict_dump)

real_x, real_y = batch_trimmer(X_test, y_test)
pred_y = []
for X_train_a, y_train_a in tl.iterate.minibatches(real_x, real_y, 100, shuffle=False):
#     print('batch predict')
    # pred.append(sess.run(prediction, feed_dict={x: X_train_a}))
    pred_y.append(sess.run(y2, feed_dict={x: X_train_a}))

pred_y = np.vstack(pred_y)

In [525]:
print('real_y:', real_y.shape)
# print('predict_x:', predict_x.shape)
print('pred_y:', pred_y.shape)

real_y: (400,)
pred_x: (400, 3)


In [527]:
# pred_x[:, 0] - real_y
sum(pred_y.argmax(1) == real_y) / real_y.shape[0]

0.595