In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Activation, GRU
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import Bidirectional,TimeDistributed, concatenate
from keras.layers import GlobalMaxPool1D, GlobalAvgPool1D, Masking
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras.callbacks import TensorBoard
from keras import metrics

In [None]:
import pandas as pd
import numpy as np
import scipy
import sys

In [None]:
from process_data import load_data,get_rank_of_size
from layers import AttentionLayer,MyMeanPool
from callbacks import MetricsEx
from data_util import unpack_news_data,data_generator,DATE_INTERVAL_NEWS,MAX_NEWS_NUM,EMBEDDING_SIZE

In [None]:
# Ranking of ticker codes provided by process_data.get_rank_of_size
# (presumably ordered by company size — confirm in process_data).
rank = get_rank_of_size()
# Drop GOOG so it is not among the codes sliced from the ranking later on.
rank.remove('GOOG')

In [None]:
# Inspect the ranking after GOOG has been removed.
print(rank)

In [None]:
# Load the data for the first 20 codes of the ranking; later cells index the
# result as data[code][split][i] with split in 'train' / 'val' / 'test'.
data = load_data(rank[:20])

In [None]:
# sparse matrix: the news entries expose .toarray(); densify one entry of the
# first training sample of the first-ranked code to look at its shape.
data[rank[0]]['train'][0][0][0].toarray().shape

# News model

In [None]:
def build_model(code='Default'):
    """Build and compile the news-only classifier.

    Input shape is (DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE).
    Layer stack, in order:
      1. TimeDistributed Masking (mask value 0.), AttentionLayer and Dense(100),
         each applied per step of the first input axis;
      2. a bidirectional GRU (50 units per direction) returning sequences;
      3. a second AttentionLayer followed by Dropout(0.5);
      4. a 2-unit softmax output.

    ``code`` is accepted but not used inside the function.

    Returns the compiled ``Model`` (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    news_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE))

    # Per-step processing of the news tensor.
    masked = TimeDistributed(Masking(mask_value=0.))(news_input)
    step_repr = TimeDistributed(AttentionLayer())(masked)
    step_repr = TimeDistributed(Dense(100))(step_repr)

    # Sequence processing across the steps.
    gru_out = Bidirectional(GRU(50, return_sequences=True))(step_repr)
    summary_vec = AttentionLayer()(gru_out)
    summary_vec = Dropout(0.5)(summary_vec)

    class_probs = Dense(2, activation='softmax')(summary_vec)

    model = Model(inputs=news_input, outputs=class_probs)
    compile_args = dict(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
    model.compile(**compile_args)
    return model

In [None]:
print(model.summary())

In [None]:
model = build_model()

In [None]:
# Training setup for the news model: pool the training split of every loaded
# code. Index 0 of a split is the news input, index 2 the labels.
b_size = 32
x_train = np.concatenate(
    [data[code]['train'][0] for code in data],
    axis=0,
)
y_train = np.concatenate(
    [data[code]['train'][2] for code in data],
    axis=0,
)
cbs = [
    TensorBoard(log_dir='model_log/'),
    MetricsEx('f1'),
]
# Ceiling division: number of batches needed to cover x_train once.
steps = -(-len(x_train) // b_size)

In [None]:
# Train the news model from the batch generator. `steps` is already a number
# of batches per epoch, so it is passed as `steps_per_epoch` (the Keras 2
# argument); `samples_per_epoch` is the deprecated Keras 1 spelling.
# Validation uses AMZN's 'val' split only.
model.fit_generator(generator=data_generator(b_size, (x_train, y_train), unpack_news_data),
                    steps_per_epoch=steps,
                    validation_data=(unpack_news_data(data['AMZN']['val'][0]), data['AMZN']['val'][2]),
                    callbacks=cbs,
                    epochs=10,
                    verbose=1,
                    )

In [None]:
# Evaluate the trained news model on the 'val' split of the third code in the
# ranking. The original cell referenced `model2`, which is never defined in
# this notebook; the model built and trained above is `model`.
model.evaluate(x=unpack_news_data(data[rank[2]]['val'][0]),
               y=data[rank[2]]['val'][2],
               batch_size=32)

# Numerical model

In [None]:
numerical_timestep = 20  # number of time steps, i.e. the 'size' of the window
attribute_num = 5  # Open/High/Low/AdjClose/Volume

def build_numerical_model(code='Default'):
    """Build and compile the numerical-only classifier.

    Input shape is (numerical_timestep, attribute_num). The network stacks
    two GRU(100) layers (the first returns sequences, the second only its
    final output), each followed by Dropout(0.5), and ends in a 2-unit
    softmax.

    ``code`` is accepted but not used inside the function.

    Returns the compiled ``Model`` (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    numerical_input = Input(shape=(numerical_timestep, attribute_num))

    hidden_seq = GRU(100, return_sequences=True)(numerical_input)
    hidden_seq = Dropout(0.5)(hidden_seq)

    final_state = GRU(100)(hidden_seq)
    final_state = Dropout(0.5)(final_state)

    class_probs = Dense(2, activation='softmax')(final_state)

    model = Model(inputs=numerical_input, outputs=class_probs)
    compile_args = dict(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
    model.compile(**compile_args)
    return model

In [None]:
# Instantiate the compiled numerical-only model.
nmodel = build_numerical_model()

In [None]:
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appends a stray "None" line to the cell output.
nmodel.summary()

In [None]:
# Training setup for the numerical model: pool the training split of every
# loaded code. Index 1 of a split is the numerical input, index 2 the labels.
# NOTE: this rebinds b_size / x_train / y_train / cbs / steps from the news
# model section.
b_size = 32
x_train = np.concatenate(
    [data[code]['train'][1] for code in data],
    axis=0,
)
y_train = np.concatenate(
    [data[code]['train'][2] for code in data],
    axis=0,
)
cbs = [
    TensorBoard(log_dir='num_log/'),
    MetricsEx('f1'),
]
# Ceiling division: number of batches needed to cover x_train once.
steps = -(-len(x_train) // b_size)

In [None]:
# Train the numerical model from the batch generator. `steps` is already a
# number of batches per epoch, so it is passed as `steps_per_epoch` (the
# Keras 2 argument); `samples_per_epoch` is the deprecated Keras 1 spelling.
# Validation uses FB's 'val' split only.
nmodel.fit_generator(generator=data_generator(b_size, (x_train, y_train)),
                     steps_per_epoch=steps,
                     validation_data=(data['FB']['val'][1], data['FB']['val'][2]),
                     callbacks=cbs,
                     epochs=30,
                     verbose=1,
                     )

# Hybrid model

In [None]:
numerical_timestep = 20  # number of time steps, i.e. the 'size' of the window
attribute_num = 5  # Open/High/Low/AdjClose/Volume

def build_hybrid_model():
    """Build and compile the two-input (news + numerical) classifier.

    Inputs, in the order expected by the returned model:
      1. textual input of shape (DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE)
      2. numerical input of shape (numerical_timestep, attribute_num)

    Textual branch: TimeDistributed Masking / AttentionLayer / Dropout(0.2) /
    Dense(100), a bidirectional GRU(50) returning sequences, AttentionLayer,
    Dropout(0.5) and Dense(10).
    Numerical branch: GRU(100) returning sequences, Dropout(0.5), GRU(100),
    Dropout(0.5) and Dense(50).
    The two branch outputs are concatenated and fed to a 2-unit softmax.

    Returns the compiled ``Model`` (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    numerical_input = Input(shape=(numerical_timestep, attribute_num))
    textual_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE))

    # --- textual branch ---
    text_branch = TimeDistributed(Masking(mask_value=0.))(textual_input)
    text_branch = TimeDistributed(AttentionLayer())(text_branch)
    text_branch = TimeDistributed(Dropout(0.2))(text_branch)
    text_branch = TimeDistributed(Dense(100))(text_branch)
    text_branch = Bidirectional(GRU(50, return_sequences=True))(text_branch)
    text_branch = AttentionLayer()(text_branch)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = Dense(10)(text_branch)

    # --- numerical branch ---
    num_branch = GRU(100, return_sequences=True)(numerical_input)
    num_branch = Dropout(0.5)(num_branch)
    num_branch = GRU(100)(num_branch)
    num_branch = Dropout(0.5)(num_branch)
    num_branch = Dense(50)(num_branch)

    # --- fusion and classification ---
    merged = concatenate([text_branch, num_branch])
    class_probs = Dense(2, activation='softmax')(merged)

    model = Model(inputs=[textual_input, numerical_input], outputs=class_probs)
    compile_args = dict(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
    model.compile(**compile_args)
    return model

In [None]:
# Instantiate the compiled hybrid (news + numerical) model.
hmodel = build_hybrid_model()

In [None]:
# Training / validation setup for the hybrid model, pooled over every loaded
# code. Split index 0 is the news input, 1 the numerical input, 2 the labels.
b_size = 64
x1_train = np.concatenate([data[key]['train'][0] for key in data], axis=0)
x2_train = np.concatenate([data[key]['train'][1] for key in data], axis=0)
# Validation news data is unpacked up front because it is passed to Keras
# directly rather than through data_generator.
x1_val = unpack_news_data(np.concatenate([data[key]['val'][0] for key in data], axis=0))
x2_val = np.concatenate([data[key]['val'][1] for key in data], axis=0)
y_val = np.concatenate([data[key]['val'][2] for key in data], axis=0)
y_train = np.concatenate([data[key]['train'][2] for key in data], axis=0)
# Use a dedicated log directory: 'num_log/' already receives the numerical
# model's TensorBoard events, and sharing it would mix the two runs.
cbs = [TensorBoard(log_dir='hybrid_log/')]
# Ceiling division: number of batches needed to cover the training set once.
steps = (len(x1_train) + b_size - 1) // b_size

In [None]:
# Train the hybrid model from the batch generator. `steps` is already a
# number of batches per epoch, so it is passed as `steps_per_epoch` (the
# Keras 2 argument); `samples_per_epoch` is the deprecated Keras 1 spelling.
hmodel.fit_generator(generator=data_generator(b_size, (x1_train, x2_train, y_train), unpack_news_data),
                     steps_per_epoch=steps,
                     validation_data=([x1_val, x2_val], y_val),
                     callbacks=cbs,
                     epochs=5,
                     verbose=1,
                     )

In [None]:
# Evaluate the hybrid model on each code's 'test' split and report the
# per-code accuracy together with the mean over the codes evaluated.
eval_codes = rank[:20]
result_dict = {}
accuracies = []
for code in eval_codes:
    test_split = data[code]['test']
    result = hmodel.evaluate(x=[unpack_news_data(test_split[0]), test_split[1]],
                             y=test_split[2], batch_size=32)
    # The model is compiled with metrics=['accuracy'], so evaluate() returns
    # [loss, accuracy]; keep the accuracy.
    result_dict[code] = result[1]
    accuracies.append(result[1])
# Divide by the number of codes actually evaluated instead of a hard-coded 20,
# so the mean stays correct when the ranking holds fewer than 20 codes. With
# no codes at all, no 'avg' entry is written (previously a KeyError).
if accuracies:
    result_dict['avg'] = sum(accuracies) / len(accuracies)
print(result_dict)

# Random Guess