In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Relative path of the CSV file with the numeric price data read by this notebook.
NUM_DATA_FILE = "data2/prices/stockPrices_AMZN.csv"

In [None]:
# Read the price table, convert the 'Date' column to datetimes and order the
# rows chronologically. Built as one chain so re-running the cell always
# starts from the file contents.
num_df = (
    pd.read_csv(NUM_DATA_FILE)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

In [None]:
# Preview the first five rows of the date-sorted frame.
num_df.head(n=5)

In [None]:
# Plot one column of the price table against the date, skipping the first
# 1000 rows of the sorted frame.
attribute = 'Open'
recent_rows = num_df.iloc[1000:]

fig, ax = plt.subplots()
ax.plot(recent_rows['Date'], recent_rows[attribute])
ax.tick_params(axis='x', labelrotation=45)  # slanted date labels stay readable
ax.set_xlabel('Day')
ax.set_ylabel(attribute)
plt.show()

In [None]:
# Divide the data chronologically into three groups: train < dev < test.
#
# The previous version selected the development rows with
# `Date >= 2019-09-01` from rows already restricted to `Date < 2019-01-01`,
# a condition no row can satisfy, so num_dev was always empty and num_train
# silently received every pre-2019 row.
#
# NOTE(review): the test cut-off (2019-01-01) is kept exactly as written; the
# development cut-off of 2018-09-01 is an assumption (same month, one year
# earlier) -- confirm the intended boundary.
DEV_START = pd.Timestamp(2018, 9, 1)
TEST_START = pd.Timestamp(2019, 1, 1)

num_test = num_df[num_df['Date'] >= TEST_START].values  # test_set
num_dev = num_df[
    (num_df['Date'] >= DEV_START) & (num_df['Date'] < TEST_START)
].values  # development_set
num_train = num_df[num_df['Date'] < DEV_START].values  # train_set

In [None]:
# normalize the input data to make RNN work better
def normalize(arr2d):
    """Min-max scale every column of a 2-D array into the range [0, 1].

    MinMaxScaler scales each feature (column) independently, so fitting it
    once on the whole matrix gives the same values as fitting one scaler per
    column and concatenating the results, without the repeated array copies.

    Parameters
    ----------
    arr2d : numpy.ndarray
        2-D array whose entries can be cast to float64. The input is not
        modified; `astype` produces a new array.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as `arr2d`. An input with zero
        columns is returned as an empty float64 array of that shape
        (the earlier implementation returned None in that case).
    """
    arr2d = arr2d.astype('float64')
    if arr2d.shape[1] == 0:
        # No columns to scale; skip the scaler and return the empty array.
        return arr2d
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
    return scaler.fit_transform(arr2d)

def get_x_by_sw(data_set,size=20):
    """Build sliding-window inputs keyed by the row that follows each window.

    For every row index `end` from `size` up to the last row, the `size` rows
    immediately before it (`data_set[end-size:end]`, so row `end` itself is
    excluded) are taken, column 0 is dropped and the remainder is passed
    through `normalize`. The result is stored under the column-0 value of
    row `end`.

    Column 0 is presumably the date -- confirm against the CSV layout.

    Returns a dict {row[end][0]: normalized window}; it is empty when
    `data_set` has no more than `size` rows.
    """
    windows = {}
    for end in range(size, len(data_set)):
        begin = end - size  # window covers [begin, end)
        features = np.delete(data_set[begin:end], 0, axis=1)
        windows[data_set[end][0]] = normalize(features)
    return windows

def get_y(data_set):
    """Compute the row-over-row relative change of column 1.

    For every row after the first, the value is
    `row[1] / previous_row[1] - 1`, stored under that row's column-0 value.
    The first row has no predecessor and therefore gets no entry.

    Column 0 is presumably the date and column 1 a price column -- confirm
    against the CSV layout.
    """
    return {
        data_set[pos][0]: data_set[pos][1] / data_set[pos - 1][1] - 1
        for pos in range(1, len(data_set))
    }

def match_xy(x_dict,y_dict):
    """Pair up inputs and targets that share a key.

    Keys are visited in the iteration order of `x_dict`; a key that is
    missing from `y_dict` is skipped. Returns a tuple `(x_arr, y_arr)` of
    numpy arrays whose i-th entries belong to the same key.
    """
    shared_keys = [key for key in x_dict if key in y_dict]
    x_arr = np.array([x_dict[key] for key in shared_keys])
    y_arr = np.array([y_dict[key] for key in shared_keys])
    return (x_arr, y_arr)

def get_xy(data_set, size=20):
    """Turn one data split into aligned (inputs, targets) arrays.

    Parameters
    ----------
    data_set : 2-D array of rows, as accepted by `get_x_by_sw` and `get_y`.
    size : int, default 20
        Sliding-window length forwarded to `get_x_by_sw`. The default matches
        the window length that was used implicitly before this parameter
        existed, so existing calls behave the same.

    Returns
    -------
    tuple
        `(x_arr, y_arr)` as produced by `match_xy`: only keys present in
        both the window dict and the target dict are kept.
    """
    x_dict = get_x_by_sw(data_set, size=size)
    y_dict = get_y(data_set)
    return match_xy(x_dict, y_dict)

In [None]:
# Build the aligned (inputs, targets) arrays for each of the three splits.
x_train, y_train = get_xy(num_train)
x_dev, y_dev = get_xy(num_dev)
x_test, y_test = get_xy(num_test)