In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Relative path of the CSV with the GOOGL price rows; it is read into `num_df` by pd.read_csv.
NUM_DATA_FILE = 'data2/prices/stockPrices_GOOGL.csv'

In [None]:
# Load the price table, parse the 'Date' column as datetimes and order the
# rows chronologically (the original row index labels are kept as-is).
num_df = (
    pd.read_csv(NUM_DATA_FILE)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

In [None]:
# Preview the first rows to check that the file loaded and to see the column order.
num_df.head()

In [None]:
# Plot one column against the date, skipping the first 1000 rows of the sorted table.
attribute = 'Open'
recent = num_df[1000:]
plt.plot(recent['Date'], recent[attribute])
plt.xticks(rotation=45)
plt.xlabel('Day')
plt.ylabel(attribute)
plt.show()

In [None]:
# Divide the data chronologically into three groups:
#   train: Date < 2018-09-01
#   dev:   2018-09-01 <= Date < 2019-01-01
#   test:  Date >= 2019-01-01
dev_start = pd.Timestamp(2018,9,1)
test_start = pd.Timestamp(2019,1,1)
dates = num_df['Date']
num_train = num_df[dates < dev_start].values # train_set
num_dev = num_df[(dates >= dev_start) & (dates < test_start)].values # development_set
num_test = num_df[dates >= test_start].values # test_set

In [None]:
# normalize the input data to make RNN work better
def normalize(arr2d):
    """Min-max scale a 2-D block of numeric features into the range [0, 1].

    The columns are split into two groups. Every column except the last
    shares one scaler that is fitted on all of their values pooled together,
    so those columns keep their relative levels. The last column gets a
    scaler of its own. In this notebook the leading columns are prices
    (Open, High, Low, AdjClose) and the last column is Volume.

    Args:
        arr2d: 2-D numpy array (rows x features) convertible to float64.

    Returns:
        A new float64 array with the same shape as ``arr2d`` holding the
        scaled values.
    """
    data = arr2d.astype('float64')
    last = data.shape[1] - 1

    # Pool every leading (price) column into one long single-feature column.
    price_block = data[:, 0:last]
    pooled_prices = price_block.reshape(-1, 1)
    volume_column = data[:, last].reshape(-1, 1)

    # Fit both scalers before transforming anything.
    price_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    price_scaler.fit(pooled_prices)
    volume_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    volume_scaler.fit(volume_column)

    # Scaling is element-wise, so transforming the pooled column and folding
    # it back gives the same numbers as transforming each column in turn.
    scaled_prices = price_scaler.transform(pooled_prices).reshape(price_block.shape)
    scaled_volume = volume_scaler.transform(volume_column)
    return np.concatenate((scaled_prices, scaled_volume), axis=1)

def get_x_by_sw(data_set,size=20):
    """Build normalized sliding-window inputs keyed by the row that follows each window.

    For every row index ``stop`` that has at least ``size`` rows before it,
    the ``size`` rows immediately preceding it (``stop`` itself excluded)
    form one window. Columns 0 and 4 are removed from the window ('Date' and
    'Close' in this notebook's table) and the rest is scaled by ``normalize``.

    Args:
        data_set: 2-D array of rows; column 0 of a row is used as the key.
        size: number of consecutive rows in each window.

    Returns:
        dict mapping ``data_set[stop][0]`` to the normalized window array.
        Empty when ``data_set`` has no more than ``size`` rows.
    """
    windows = {}
    start = 0  # inclusive left edge; the right edge `stop` is exclusive
    for stop in range(len(data_set)):
        if stop - start < size:
            # Not enough history before this row yet.
            continue
        features = np.delete(data_set[start:stop], [0, 4], axis=1)  # remove 'Date' and 'Close'
        windows[data_set[stop][0]] = normalize(features)
        start += 1
    return windows

def get_y(data_set):
    """Build one-hot direction labels keyed by column 0 of each row.

    For every row after the first, the change rate of column 1 (the open
    price in this notebook) relative to the previous row is computed as
    ``row[1] / previous_row[1] - 1``.

    Args:
        data_set: sequence of rows; column 0 is the key, column 1 the price.

    Returns:
        dict mapping ``row[0]`` to ``[1, 0]`` when the rate is <= 0 and to
        ``[0, 1]`` otherwise. The first row never gets a label.
    """
    labels = {}
    for prev_row, row in zip(data_set, data_set[1:]):
        # How to define the change rate?
        # Currently: rate = open_price[i] / open_price[i-1] - 1
        rate = row[1] / prev_row[1] - 1
        labels[row[0]] = [1, 0] if rate <= 0 else [0, 1]
    return labels

def match_xy(x_dict,y_dict):
    """Pair up inputs and labels that share a key.

    Keys are visited in the iteration order of ``x_dict``; keys that are
    missing from ``y_dict`` are skipped.

    Args:
        x_dict: mapping from key to input sample.
        y_dict: mapping from key to label.

    Returns:
        Tuple ``(x_arr, y_arr)`` of numpy arrays built from the matched
        samples and labels, aligned index by index.
    """
    shared_keys = [key for key in x_dict.keys() if key in y_dict]
    x_arr = np.array([x_dict[key] for key in shared_keys])
    y_arr = np.array([y_dict[key] for key in shared_keys])
    return (x_arr, y_arr)

def get_xy(data_set):
    """Turn one data split into aligned model inputs and labels.

    Builds the sliding-window inputs with ``get_x_by_sw`` (default window
    size) and the labels with ``get_y``, then keeps only the keys present in
    both via ``match_xy``.

    Args:
        data_set: 2-D array of rows for one split.

    Returns:
        Tuple ``(x_arr, y_arr)`` as returned by ``match_xy``.
    """
    return match_xy(get_x_by_sw(data_set), get_y(data_set))

In [None]:
# Build the (inputs, labels) arrays for each split; each split is windowed on its own.
x_train, y_train = get_xy(num_train)
x_dev, y_dev = get_xy(num_dev)
x_test, y_test = get_xy(num_test)

In [None]:
# import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, TimeDistributed,concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
numerical_timestep = 20 # must match the 'size' of the sliding window in get_x_by_sw
attribute_num = 5 # Open/High/Low/AdjClose/Volume

def build_model(code='Default'):
    """Build and compile the two-layer GRU classifier.

    Architecture: Input of shape (numerical_timestep, attribute_num) ->
    GRU(50, returning the full sequence) -> Dropout(0.2) -> GRU(50) ->
    Dropout(0.2) -> Dense(2, softmax).

    Args:
        code: currently unused; kept so existing callers keep working.

    Returns:
        A compiled Keras ``Model`` using the Adam optimizer and reporting
        accuracy.
    """
    numerical_input = Input(shape=(numerical_timestep, attribute_num))
    x = GRU(50, return_sequences=True)(numerical_input)
    x = Dropout(0.2)(x)
    x = GRU(50)(x)
    x = Dropout(0.2)(x)
    x = Dense(2, activation='softmax')(x)
    model = Model(inputs=numerical_input, outputs=x)
    # The targets are one-hot pairs ([1,0] / [0,1]) and the head is a 2-way
    # softmax, so categorical cross-entropy is the matching loss. It also
    # makes the 'accuracy' metric resolve to categorical accuracy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Instantiate the compiled GRU classifier with its default settings.
model = build_model()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

In [None]:
# Train on the training split; the development split is passed as validation data.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=40,
    verbose=1,
    validation_data=(x_dev, y_dev),
)

In [None]:
# Score the trained model on the test split (loss plus the compiled accuracy metric).
model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=16,
)