In [1]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tensorflow as tf

%matplotlib inline 
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly with that font
# When Chinese text appears in a label, write it as a unicode literal: u'content'

import os
# Enumerate GPUs in PCI bus order and expose only device "0" to this process.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Star import: supplies `Datum` (used in the next cell). `get_data`, called by
# later cells, presumably comes from here as well -- TODO confirm.
from datum import *
import model
import imp

# Re-import the local `model` module so edits on disk are picked up.
# NOTE: a later cell rebinds the name `model` to a `Model` instance.
imp.reload(model)

<module 'model' from 'model.pyc'>

In [2]:
# Build the shared dataset object used by every later cell.
# The call order is kept exactly as written; the methods are defined in the
# external `datum` module, so their individual effects cannot be verified here.
data = Datum()
data.data_prepare()
# Loads stock embeddings from a file in the working directory; presumably this
# is what populates `data.embedding` / `data.list_stocks` -- TODO confirm.
data.get_embedding('embedding_all.emb')
data.supervised_data_prepare()
data.evaluation_prepare()
data.label_trend()
data.label_return()

In [14]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from sklearn import linear_model as LR
from sklearn import metrics as mt
import scipy.stats as stats

import collections
import math
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from six.moves import xrange

import datum

# Module-level constants. None of them is referenced by the code visible in
# this notebook -- NOTE(review): confirm whether they are still needed.
data_index = 0
len_stock = 3145
len_fund  = 2199

def stock2all(a):
    """Flatten a 3-D array of shape (n0, n1, n2) into 2-D of shape (n1 * n0, n2).

    The last axis is kept as columns. Rows are ordered by axis 1 first and
    axis 0 second (axis 0 varies fastest), i.e. row ``j * n0 + k`` of the
    result is ``a[k, j, :]``.
    """
    # Bring the last axis to the front, collapse the other two, then flip back
    # so the former last axis ends up as the column axis.
    return a.swapaxes(0, 2).reshape(a.shape[2], -1).T

# Tiny 3x3x3 example, handy for inspecting how stock2all orders its rows.
a = np.arange(27).reshape(3, 3, 3)
b = stock2all(a)

class Model():
    """Factor-recombination network together with its rank-IC evaluation loops.

    Call order used by the notebook:
    ``data_initial`` -> ``data_split`` -> ``factor_network`` ->
    ``training`` or ``test``.

    The graph is hard-wired to 44 input factors, a 32-dimensional stock
    embedding and 5 randomly sampled factors per training step.
    """

    def __init__(self, learning_rate_rank):
        """Store hyper-parameters.

        learning_rate_rank: the learning rate is ``10 ** -learning_rate_rank``;
            the value is also used as a suffix for the output directories.
        """
        self.save_rank = learning_rate_rank
        self.learning_rate = 1 / np.power(10, learning_rate_rank)
        self.batch_size = 10

    def data_initial(self, datum = None):
        """Attach the dataset object.

        datum: an already prepared dataset object. When omitted, a fresh
            ``datum.Datum`` is created and prepared.
        """
        if datum is not None:
            self.data = datum
            return
        # The parameter shadows the `datum` module inside this method, so the
        # original `datum.Datum()` was evaluated on None and raised
        # AttributeError. Import the module under another name instead.
        import datum as datum_module
        self.data = datum_module.Datum()
        self.data.data_prepare()
        self.data.evaluation_prepare()
        self.data.label_trend()
        self.data.label_return()
        # NOTE(review): training()/test() also read `data.embedding` and
        # `data.list_stocks`; this default path does not load an embedding
        # file -- confirm whether that must be done by the caller.

    def _stock_name(self, stock_idx):
        """Return the stock code at ``stock_idx`` as a string left-padded with '0' to 6 chars."""
        return str(self.data.code_tag[stock_idx]).zfill(6)

    def _tiled_embedding(self, stock_idx, n_days):
        """Return the stock's embedding repeated for every day, shape (embedding_dim, n_days).

        The graph concatenates the embedding with the factor matrix along
        axis 0, so both must have the same number of columns (days).
        """
        row = self.data.list_stocks.index(self._stock_name(stock_idx))
        column = np.expand_dims(self.data.embedding[row], axis=1)
        return np.repeat(column, n_days, axis=1)

    def _rank_ic(self, factors, first_day):
        """Mean Spearman rank correlation between generated factors and ``data.ar_ic``.

        factors: array indexed as [stock (in use_index order), day, factor].
        first_day: absolute day index (into ``data.ar_ic``) of ``factors[:, 0]``.
        Returns an array of shape (44, 4): one value per factor and per
        ``ar_ic`` column, averaged over the days covered by ``factors``.
        """
        n_days = factors.shape[1]
        ic = np.zeros((44, n_days, 4))
        for day in range(n_days):
            for fac in range(44):
                for id_idx in range(4):
                    rank_ic = stats.spearmanr(
                        self.data.ar_ic[self.use_index, first_day + day, id_idx],
                        factors[:, day, fac])
                    ic[fac, day, id_idx] = rank_ic[0]
        return ic.mean(axis=1)

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (and missing parents) unless it already exists."""
        if not os.path.isdir(path):
            os.makedirs(path)

    def data_split(self):
        """Derive the train/validation/test day boundaries and the usable stocks.

        Sets ``train_vali`` (half of the days), ``vali_test`` (three quarters),
        the shuffle buffers ``rank_day`` / ``rank_stock`` and ``use_index``,
        the indices of the stocks whose padded code appears in
        ``data.list_stocks``.
        """
        self.day_sample = self.data.price_data.shape[1]
        self.stock_sample = self.data.price_data.shape[0]

        self.train_vali = self.day_sample // 2
        self.vali_test = self.train_vali + self.day_sample // 4

        self.rank_day = np.array(range(self.train_vali))
        self.rank_stock = np.array(range(self.stock_sample))

        # Set membership instead of a list scan per stock.
        known_codes = set(self.data.list_stocks)
        # dtype=int keeps the array usable as an index even when it is empty.
        self.use_index = np.array(
            [idx for idx in range(self.stock_sample)
             if self._stock_name(idx) in known_codes],
            dtype=int)

    def factor_network(self):
        """Build the TF1 graph: placeholders, training loss and the evaluation op."""
        learning_rate = self.learning_rate

        # Inputs are laid out feature-major: one column per day.
        self.embedding = tf.placeholder(tf.float32, shape=[32, None], name='embedding')
        self.factor = tf.placeholder(tf.float32, shape=[44, None], name='factor')
        self.factor_index = tf.placeholder(tf.int32, shape=[5], name='factor_index')
        self.label = tf.placeholder(tf.int32, shape=[None, 2], name='lable')

        # Rows 0..43 of `weight` belong to the factors, rows 44..75 to the embedding.
        self.weight = tf.get_variable(name='weight', shape=[76, 44], initializer=tf.truncated_normal_initializer(stddev=1.0))
        self.bias = tf.get_variable(name='bias', shape=[44], initializer=tf.zeros_initializer)

        # Training path: only the 5 sampled factor rows plus all embedding rows.
        self.weight_index = tf.concat([self.factor_index, tf.constant([i for i in range(44, 76)])], axis=0)
        self.weight_select = tf.nn.embedding_lookup(self.weight, self.weight_index)
        self.factor_select = tf.nn.embedding_lookup(self.factor, self.factor_index)

        self.part_input = tf.concat([self.factor_select, self.embedding], axis=0)
        self.part_input = tf.transpose(self.part_input)
        self.hidden = tf.matmul(self.part_input, self.weight_select) + self.bias
        # Keep only the generated factors that correspond to the sampled inputs.
        self.hidden_select = tf.nn.embedding_lookup(tf.transpose(self.hidden), self.factor_index)

        all_feature = tf.concat([tf.transpose(self.factor_select), tf.transpose(self.hidden_select)], 1)
        # NOTE(review): the tanh output is fed to softmax cross-entropy as
        # logits, which bounds them to [-1, 1] -- confirm this is intended.
        self.pred = tf.contrib.layers.fully_connected(
            inputs=all_feature,
            num_outputs=2,  # hidden
            activation_fn=tf.tanh,
            weights_initializer=tf.truncated_normal_initializer(stddev=1.0),
            biases_initializer=tf.zeros_initializer()
        )
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.pred, labels=self.label))
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(self.cost)

        # Evaluation path: all 44 factors and the full weight matrix.
        self.all_input = tf.concat([self.factor, self.embedding], axis=0)
        self.all_input = tf.transpose(self.all_input)
        self.evaluation = tf.matmul(self.all_input, self.weight) + self.bias

    def training(self):
        """Train for 50 epochs, validating and checkpointing before each epoch.

        Restores the initial weights from ``model_initial2/logmodel.ckpt``.
        Each epoch first writes the validation rank-IC table to
        ``data/evaluation-2-<rank>/`` and a checkpoint to ``model-2-<rank>/``,
        then runs one pass over the shuffled training days.
        """
        epochs = 50
        batch_num = self.train_vali // self.batch_size
        saver = tf.train.Saver(max_to_keep=None)
        model_dir = 'model-2-'+str(self.save_rank)
        eval_dir = 'data/evaluation-2-'+str(self.save_rank)
        # Same filter as data_split(): only stocks that have an embedding.
        trainable = set(self.use_index.tolist())
        with tf.Session() as sess:
            saver.restore(sess, 'model_initial2/logmodel.ckpt')
            for epoch in range(epochs):
                print('epoch: {}'.format(epoch))
                # validation
                new_f_total = []
                # Iterate the usable stock indices themselves (the original
                # wrapped the index array in range(), which raises TypeError).
                for stock_idx in self.use_index:
                    feature = self.data.feature_data[stock_idx, self.train_vali:self.vali_test, :]
                    label = self.data.ar_trend[stock_idx, self.train_vali:self.vali_test, :]
                    embed = self._tiled_embedding(stock_idx, feature.shape[0])
                    factor_index = random.sample(list(range(44)), 5)
                    feed_dict = {self.embedding: embed, self.factor: feature.T, self.factor_index: factor_index, self.label: label}
                    new_f_total.append(sess.run(self.evaluation, feed_dict=feed_dict))
                new_f_total = np.array(new_f_total)
                # The factors cover days train_vali..vali_test, so ar_ic must
                # be read from the same absolute days.
                ic = self._rank_ic(new_f_total, self.train_vali)
                print(ic)
                # Create each output directory independently of the other.
                self._ensure_dir(model_dir)
                self._ensure_dir(eval_dir)
                pd.DataFrame(ic).to_csv('data/evaluation-2-{}/epoch_evaluation_{}.csv'.format(self.save_rank, epoch))
                saver.save(sess, 'model-2-{}/logmodel.ckpt'.format(self.save_rank), global_step=epoch)

                np.random.shuffle(self.rank_day)
                for batch_count in range(batch_num):#batch_num
                    print('batch_count:{}'.format(batch_count))
                    batch_days = self.rank_day[batch_count*self.batch_size:(batch_count+1)*self.batch_size]
                    loss_all = 0
                    loss_count = 0
                    np.random.shuffle(self.rank_stock)
                    for stock_idx in self.rank_stock:
                        if stock_idx not in trainable:
                            continue
                        feature = self.data.feature_data[stock_idx, batch_days, :]
                        label = self.data.ar_trend[stock_idx, batch_days, :]
                        embed = self._tiled_embedding(stock_idx, feature.shape[0])
                        # 30 gradient steps per (stock, day batch), each on a
                        # fresh random subset of 5 factors.
                        for _ in range(30):
                            factor_index = random.sample(list(range(44)), 5)
                            feed_dict = {self.embedding: embed, self.factor: feature.T, self.factor_index: factor_index, self.label: label}
                            _, loss_val = sess.run([self.optimizer, self.cost], feed_dict=feed_dict)
                            loss_all += loss_val
                            loss_count += 1
                    # Guard against a batch in which no stock was usable.
                    if loss_count:
                        print('avg_loss: {}'.format(loss_all/loss_count))


    def test(self):
        """Evaluate checkpoint 14 of ``baselineModel-5`` on the test days.

        Generates factors for every usable stock over all days, prints the
        rank-IC table for days ``vali_test..`` and its mean absolute value,
        and stores the results in ``self.factor_test`` (all days) and
        ``self.ic``.
        """
        saver = tf.train.Saver(max_to_keep=None)
        with tf.Session() as sess:
            for model_num in range(14, 15):
                saver.restore(sess, 'baselineModel-5/logmodel-{}.ckpt'.format(model_num))
                # test
                new_f_total = []
                for stock_idx in self.use_index:
                    feature = self.data.feature_data[stock_idx, :, :]
                    # The label is only fed to satisfy the placeholder; the
                    # evaluation op does not depend on it.
                    label = self.data.ar_trend[stock_idx, :5, :]
                    embed = self._tiled_embedding(stock_idx, feature.shape[0])
                    factor_index = random.sample(list(range(44)), 5)
                    feed_dict = {self.embedding: embed, self.factor: feature.T, self.factor_index: factor_index, self.label: label}
                    new_f_total.append(sess.run(self.evaluation, feed_dict=feed_dict))
                new_f_total = np.array(new_f_total)
                ic = self._rank_ic(new_f_total[:, self.vali_test:, :], self.vali_test)
                print(ic)
                print(np.abs(ic).mean())
        self.factor_test = new_f_total
        self.ic = ic

In [15]:
# Start from an empty default graph: factor_network() creates variables named
# 'weight' and 'bias' via tf.get_variable, so re-running this cell without a
# reset would try to create them a second time.
tf.reset_default_graph()
# NOTE: this rebinds `model`, which until now referred to the imported
# `model` module. The argument 3 gives a learning rate of 1 / 10**3.
model = Model(3)
model.data_initial(data)
model.data_split()
model.factor_network()

# Restores baselineModel-5/logmodel-14.ckpt and prints the test-period rank IC.
model.test()

INFO:tensorflow:Restoring parameters from baselineModel-5/logmodel-14.ckpt
[[  2.13792426e-03   2.13009647e-03   8.20993596e-04   7.97427576e-03]
 [ -4.20355192e-03  -5.75383705e-03  -7.99296683e-03  -4.80902855e-03]
 [  2.97236534e-03  -2.37738732e-03  -6.38254894e-03  -7.21020853e-03]
 [ -4.77980524e-03   2.89336733e-03   7.45160966e-03   1.40754564e-02]
 [ -1.31654402e-03  -4.26482686e-03  -7.22463500e-03  -1.49102564e-02]
 [ -1.94287668e-03   7.32067623e-03   1.43481473e-02   1.86254881e-02]
 [  5.63659528e-03   3.44502660e-03   3.03775853e-03  -5.04813771e-03]
 [ -7.87615553e-03  -6.23258417e-03  -3.79035149e-03   8.30654538e-03]
 [  7.75679175e-03   1.33586366e-02   1.56873039e-02   1.84717550e-02]
 [ -3.35472877e-03  -7.00214949e-03  -7.29523113e-03  -9.48015188e-03]
 [  2.07203506e-03  -1.65077235e-03  -4.30194369e-03  -1.19860231e-02]
 [  3.71630114e-03   5.62848036e-03   6.44692081e-03  -2.19756143e-04]
 [  6.60868236e-03   1.99460580e-03   4.50196315e-04  -4.79901392e-03]
 [

In [18]:
# Factors generated by model.test(): indexed as [usable stock, day, factor].
model.factor_test.shape

(2776, 244, 44)

In [19]:
# Logistic-regression comparison on the model's own trend labels
# (`model.data.ar_trend`): raw factors only versus raw factors plus the
# factors generated by the network.
#
# `stock2all` is the helper defined in the model-definition cell; it is reused
# here instead of being redefined. Training rows are the days before
# `train_vali`, test rows the days from `vali_test` on.
base_train_x = stock2all(data.feature_data[model.use_index, :model.train_vali, :])
base_test_x = stock2all(data.feature_data[model.use_index, model.vali_test:, :])
train_y = np.argmax(stock2all(model.data.ar_trend[model.use_index, :model.train_vali, :]), axis=1)
test_y = np.argmax(stock2all(model.data.ar_trend[model.use_index, model.vali_test:, :]), axis=1)

# First pass: raw factors only. Second pass: generated factors prepended.
for use_generated_factors in (False, True):
    if use_generated_factors:
        train_x = np.concatenate([stock2all(model.factor_test[:, :model.train_vali, :]), base_train_x], axis=1)
        test_x = np.concatenate([stock2all(model.factor_test[:, model.vali_test:, :]), base_test_x], axis=1)
    else:
        train_x, test_x = base_train_x, base_test_x

    # Default solver settings, unlike the later cells which raise max_iter.
    lr = LR.LogisticRegression()
    lr.fit(train_x, train_y)
    predict_y = lr.predict(test_x)
    # Kept for interactive inspection; not printed.
    cf_matrix = mt.confusion_matrix(test_y, predict_y)

    print('accuracy', mt.accuracy_score(test_y, predict_y))
    print('f1_score', mt.f1_score(test_y, predict_y))



accuracy 0.548217744602
f1_score 0.576735179396
accuracy 0.543959937639
f1_score 0.582563947329


In [20]:
# One-hot trend labels per stock and day: column 1 is set when the value
# `horizon` days later is strictly higher than the current one, column 0
# otherwise.
n_days = 244
horizon = 3
ar_trend = np.zeros((len(data.code_tag), n_days, 2))
for count, code in enumerate(data.code_tag):
    # NOTE(review): the first return value is presumably the price series and
    # the second the features (unused here) -- confirm against get_data.
    a_p, a_f = get_data(20160101, 20180000, code)
    a_p = a_p[:n_days + horizon]
    for day in range(n_days):
        went_up = a_p[day + horizon] > a_p[day]
        # True selects column 1 ("up"), False selects column 0.
        ar_trend[count, day, 1 if went_up else 0] = 1

In [21]:
# Logistic-regression comparison on the notebook-level `ar_trend` labels:
# raw factors only versus raw factors plus the factors generated by the
# network.
#
# `stock2all` is the helper defined in the model-definition cell; it is reused
# here instead of being redefined. Training rows are the days before
# `train_vali`, test rows the days from `vali_test` on.
base_train_x = stock2all(data.feature_data[model.use_index, :model.train_vali, :])
base_test_x = stock2all(data.feature_data[model.use_index, model.vali_test:, :])
train_y = np.argmax(stock2all(ar_trend[model.use_index, :model.train_vali, :]), axis=1)
test_y = np.argmax(stock2all(ar_trend[model.use_index, model.vali_test:, :]), axis=1)

# First pass: raw factors only. Second pass: generated factors prepended.
for use_generated_factors in (False, True):
    if use_generated_factors:
        train_x = np.concatenate([stock2all(model.factor_test[:, :model.train_vali, :]), base_train_x], axis=1)
        test_x = np.concatenate([stock2all(model.factor_test[:, model.vali_test:, :]), base_test_x], axis=1)
    else:
        train_x, test_x = base_train_x, base_test_x

    lr = LR.LogisticRegression(max_iter=5000)
    lr.fit(train_x, train_y)
    predict_y = lr.predict(test_x)
    # Kept for interactive inspection; not printed.
    cf_matrix = mt.confusion_matrix(test_y, predict_y)

    print('accuracy', mt.accuracy_score(test_y, predict_y))
    print('f1_score', mt.f1_score(test_y, predict_y))



accuracy 0.55588888364
f1_score 0.57293263825
accuracy 0.552085793925
f1_score 0.579052524086


In [22]:
# One-hot trend labels per stock and day: column 1 is set when the value
# `horizon` days later is strictly higher than the current one, column 0
# otherwise.
n_days = 244
horizon = 10
ar_trend = np.zeros((len(data.code_tag), n_days, 2))
for count, code in enumerate(data.code_tag):
    # NOTE(review): the first return value is presumably the price series and
    # the second the features (unused here) -- confirm against get_data.
    a_p, a_f = get_data(20160101, 20180000, code)
    a_p = a_p[:n_days + horizon]
    for day in range(n_days):
        went_up = a_p[day + horizon] > a_p[day]
        # True selects column 1 ("up"), False selects column 0.
        ar_trend[count, day, 1 if went_up else 0] = 1

In [23]:
# Logistic-regression comparison on the notebook-level `ar_trend` labels:
# raw factors only versus raw factors plus the factors generated by the
# network.
#
# `stock2all` is the helper defined in the model-definition cell; it is reused
# here instead of being redefined. Training rows are the days before
# `train_vali`, test rows the days from `vali_test` on.
base_train_x = stock2all(data.feature_data[model.use_index, :model.train_vali, :])
base_test_x = stock2all(data.feature_data[model.use_index, model.vali_test:, :])
train_y = np.argmax(stock2all(ar_trend[model.use_index, :model.train_vali, :]), axis=1)
test_y = np.argmax(stock2all(ar_trend[model.use_index, model.vali_test:, :]), axis=1)

# First pass: raw factors only. Second pass: generated factors prepended.
for use_generated_factors in (False, True):
    if use_generated_factors:
        train_x = np.concatenate([stock2all(model.factor_test[:, :model.train_vali, :]), base_train_x], axis=1)
        test_x = np.concatenate([stock2all(model.factor_test[:, model.vali_test:, :]), base_test_x], axis=1)
    else:
        train_x, test_x = base_train_x, base_test_x

    lr = LR.LogisticRegression(max_iter=5000)
    lr.fit(train_x, train_y)
    predict_y = lr.predict(test_x)
    # Kept for interactive inspection; not printed.
    cf_matrix = mt.confusion_matrix(test_y, predict_y)

    print('accuracy', mt.accuracy_score(test_y, predict_y))
    print('f1_score', mt.f1_score(test_y, predict_y))



accuracy 0.534304577881
f1_score 0.596721982152
accuracy 0.529993622148
f1_score 0.599932642669


In [45]:
# Write the absolute IC table to simple_ic.csv in the working directory.
# NOTE(review): nothing in the Model class shown in this notebook sets
# `final_ic` (its test() stores the table as `model.ic`); this cell relies on
# kernel state from code that is not visible here -- confirm the source.
pd.DataFrame(np.abs(model.final_ic)).to_csv('simple_ic.csv')

In [26]:
# Move the step suffix of each checkpoint file in front of the extension,
# e.g. `logmodel.ckpt-14.index` -> `logmodel-14.ckpt.index`, which is the name
# pattern Model.test() restores from.
ckpt_dir = 'baselineModel-5'
for filename in os.listdir(ckpt_dir):
    piece_name = filename.split('.')
    # Skip anything that is not `logmodel.<ext>-<step>.<suffix...>`; files that
    # were already renamed start with `logmodel-<step>` and are skipped too.
    if len(piece_name) < 3 or piece_name[0] != 'logmodel':
        continue
    extension, dash, step = piece_name[1].partition('-')
    if not dash:
        # No step suffix to move (the original indexing raised IndexError here).
        continue
    # Keep every trailing piece so multi-dot suffixes are not truncated.
    new_name = '.'.join([piece_name[0]+'-'+step, extension] + piece_name[2:])
    os.rename(os.path.join(ckpt_dir, filename), os.path.join(ckpt_dir, new_name))

In [28]:
# Move the step suffix of each checkpoint file in front of the extension,
# e.g. `logmodel.ckpt-14.index` -> `logmodel-14.ckpt.index`.
ckpt_dir = 'baselineModel-4'
for filename in os.listdir(ckpt_dir):
    piece_name = filename.split('.')
    # Skip anything that is not `logmodel.<ext>-<step>.<suffix...>`; files that
    # were already renamed start with `logmodel-<step>` and are skipped too.
    if len(piece_name) < 3 or piece_name[0] != 'logmodel':
        continue
    extension, dash, step = piece_name[1].partition('-')
    if not dash:
        # No step suffix to move (the original indexing raised IndexError here).
        continue
    # Keep every trailing piece so multi-dot suffixes are not truncated.
    new_name = '.'.join([piece_name[0]+'-'+step, extension] + piece_name[2:])
    os.rename(os.path.join(ckpt_dir, filename), os.path.join(ckpt_dir, new_name))

In [25]:
# Move the step suffix of each checkpoint file in front of the extension,
# e.g. `logmodel.ckpt-14.index` -> `logmodel-14.ckpt.index`.
ckpt_dir = 'model-1-4'
for filename in os.listdir(ckpt_dir):
    piece_name = filename.split('.')
    # Skip anything that is not `logmodel.<ext>-<step>.<suffix...>`; files that
    # were already renamed start with `logmodel-<step>` and are skipped too.
    if len(piece_name) < 3 or piece_name[0] != 'logmodel':
        continue
    extension, dash, step = piece_name[1].partition('-')
    if not dash:
        # No step suffix to move (the original indexing raised IndexError here).
        continue
    # Keep every trailing piece so multi-dot suffixes are not truncated.
    new_name = '.'.join([piece_name[0]+'-'+step, extension] + piece_name[2:])
    os.rename(os.path.join(ckpt_dir, filename), os.path.join(ckpt_dir, new_name))

In [48]:
# Mean absolute IC over all rows for column index 3.
# NOTE(review): `final_ic` is not set by the Model class shown in this
# notebook; this cell depends on kernel state defined elsewhere -- confirm.
np.abs(model.final_ic)[:, 3].mean()

0.010319475452220507