In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional,TimeDistributed, concatenate
from keras.layers import GlobalMaxPool1D, GlobalAvgPool1D, Masking
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import TensorBoard
from keras import metrics

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
import scipy
import sys

In [3]:
from process_data import load_data,get_rank_of_size
from data_util import unpack_news_data,DATE_INTERVAL_NEWS,MAX_NEWS_NUM,EMBEDDING_SIZE

In [4]:
rank = get_rank_of_size()

In [5]:
data = load_data(rank[:4])

data of FB is loaded
data of AAPL is loaded
data of GOOGL is loaded
data of AMZN is loaded


In [11]:
data['GOOGL']['train'][0][0][0].toarray().shape()

array([[ 0.11183549, -0.22789039,  0.11041027, ..., -0.49506664,
         0.68054837, -0.03859842],
       [-0.08882826, -0.28037468,  0.47644088, ..., -0.1863821 ,
         0.09091951, -0.27588627],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# baseline model

In [12]:
def build_baseline_model(code='Default'):
    news_input = Input(shape=(DATE_INTERVAL_NEWS,MAX_NEWS_NUM,EMBEDDING_SIZE))
    day_layer = Bidirectional(GRU(30,return_sequences=True))
    inv_layer = Bidirectional(GRU(30,return_sequences=True))
    x = TimeDistributed(day_layer)(news_input)
    x = TimeDistributed(Dropout(0.5))(x)
    x = TimeDistributed(GlobalMaxPool1D())(x)
    x = inv_layer(x)
    x = Dropout(0.5)(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(2, activation='softmax')(x)
    model = Model(inputs=news_input,outputs=x)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

In [13]:
model = build_baseline_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [14]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 7, 50, 768)        0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 7, 50, 60)         143820    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 7, 50, 60)         0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 7, 60)             0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 7, 60)             16380     
_________________________________________________________________
dropout_2 (Dropout)          (None, 7, 60)             0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 60)                0         
__________

In [15]:
for i in range(4):
    model.fit(x=unpack_news_data(data[rank[i]]['train'][0]),y=data[rank[i]]['train'][2],\
              batch_size=16,epochs=20,verbose=1,\
    validation_data=(unpack_news_data(data[rank[i]]['val'][0]),data[rank[i]]['val'][2]),\
    callbacks=[TensorBoard(log_dir='model_log/')])

(1143, 7, 50, 768)
(56, 7, 50, 768)
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1143 samples, validate on 56 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
(1148, 7, 50, 768)
(56, 7, 50, 768)
Train on 1148 samples, validate on 56 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

KeyboardInterrupt: 

In [None]:
model.evaluate(x=x_test, y=y_test,batch_size=16)

# NUMERICAL MODEL

In [None]:
numerical_timestep = 20 #  correspond to the 'size' of  the window
attribute_num = 5 # Open/High/Low/AdjClose/Volume 

def build_numerical_model(code='Default'):
    numerical_input = Input(shape=(numerical_timestep,attribute_num))
    x = GRU(50,return_sequences=True)(numerical_input)
    x = Dropout(0.4)(x)
    x = GRU(50)(x)
    x = Dropout(0.4)(x)
    x = Dense(10)(x)
    x = Dense(2,activation='softmax')(x)
    model = Model(inputs=numerical_input,outputs=x)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

In [None]:
nmodel = build_numerical_model()

In [None]:
print(nmodel.summary())

In [None]:
nmodel.fit(x=data['AMZN']['train'][1],y=data['AMZN']['train'][2],batch_size=16,epochs=50,verbose=1,\
          validation_data=(data['AMZN']['val'][1],data['AMZN']['val'][2]),\
          callbacks=[TensorBoard(log_dir='num_log/')])