# 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import datetime
import os
import matplotlib.pyplot as plt
import tensorflow as tf
# Report the installed TensorFlow version in the cell output.
print(tf.__version__)

# Share of all windows reserved for validation and for testing, in percent;
# whatever remains (80 %) is used for training.
valid_set_size_percentage = 10
test_set_size_percentage = 10

# List the contents of the parent directory, then of the working directory.
working_dir = os.getcwd()
for folder in (os.path.dirname(working_dir), working_dir):
    print(folder + ':', os.listdir(folder))

# 2. Analyze Data

In [None]:
# Read the raw price history from the local CSV export, then show the
# column summary and the first rows.
stock_csv_path = "raw_stock/microsoft_stock.csv"
df = pd.read_csv(stock_csv_path)
df.info()
df.head()

In [None]:
# Turn the 'Date' strings into timestamps so the plots get a real time axis.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Keep a plain positional index; 'Date' stays available as a regular column.
# (Assigning 'Date' as the index first would be undone by drop=True anyway.)
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# Side-by-side overview: open/close prices on the left, volume on the right.
plt.figure(figsize=(15, 5))

# left panel: prices over time
plt.subplot(1, 2, 1)
for column, colour, legend_name in (('Open', 'red', 'open'),
                                    ('Close', 'green', 'close')):
    plt.plot(df.Date, df[column].values, color=colour, label=legend_name)
plt.title('stock price')
plt.xlabel('year')
plt.ylabel('price')
plt.legend(loc='best')

# right panel: traded volume over time
plt.subplot(1, 2, 2)
plt.plot(df.Date, df['Volume'].values, color='black', label='volume')
plt.title('stock volume')
plt.xlabel('year')
plt.ylabel('volume')
plt.legend(loc='best');

# 3. Manipulate Data

In [None]:
# function for min-max normalization of stock
from sklearn.preprocessing import MinMaxScaler

def normalize_data(df):
    """Min-max normalize the 'Open', 'High', 'Low' and 'Close' columns.

    Each column is scaled on its own, i.e. with its own minimum and maximum,
    using sklearn's MinMaxScaler with default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'Open', 'High', 'Low', 'Close'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the four price columns are rescaled; any
        other columns are carried over unchanged.

    Raises
    ------
    KeyError
        If one of the four price columns is missing.
    """
    # Work on a copy: the function returns its result, so silently rewriting
    # the caller's frame as well is a surprising side effect.
    normalized = df.copy()
    for column in ('Open', 'High', 'Low', 'Close'):
        min_max_scaler = sklearn.preprocessing.MinMaxScaler()
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D before storing it as a column.
        column_values = normalized[column].values.reshape(-1, 1)
        normalized[column] = min_max_scaler.fit_transform(column_values).ravel()
    return normalized

In [None]:
# function to create train, validation, test data given stock data and sequence length
def load_data(stock, seq_len, valid_pct=None, test_pct=None):
    """Cut a price table into sliding windows and split them chronologically.

    Every window holds ``seq_len`` consecutive rows of ``stock``. The first
    ``seq_len - 1`` rows of a window are the model input, the last row is the
    prediction target. Windows are split in order (no shuffling) into
    train / validation / test.

    Parameters
    ----------
    stock : pandas.DataFrame
        One row per time step, one column per feature.
    seq_len : int
        Number of rows per window (inputs plus the one target row).
    valid_pct, test_pct : float, optional
        Percentage of all windows used for validation / testing. When omitted,
        the module-level ``valid_set_size_percentage`` and
        ``test_set_size_percentage`` are used.

    Returns
    -------
    list
        ``[x_train, y_train, x_valid, y_valid, x_test, y_test]`` where each
        ``x_*`` has shape (n, seq_len - 1, n_features) and each ``y_*`` has
        shape (n, n_features).

    Raises
    ------
    ValueError
        If ``stock`` has too few rows to build a single window.
    """
    if valid_pct is None:
        valid_pct = valid_set_size_percentage
    if test_pct is None:
        test_pct = test_set_size_percentage

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    data_raw = stock.to_numpy()

    # NOTE(review): this yields len - seq_len windows, so the very last row of
    # `stock` is never used. One more window would fit; the count is kept
    # as-is so that split sizes stay the same as before.
    n_windows = len(data_raw) - seq_len
    if n_windows <= 0:
        raise ValueError(
            'need more than seq_len=%d rows to build a window, got %d'
            % (seq_len, len(data_raw)))

    data = np.array([data_raw[start:start + seq_len]
                     for start in range(n_windows)])

    valid_set_size = int(np.round(valid_pct / 100 * data.shape[0]))
    test_set_size = int(np.round(test_pct / 100 * data.shape[0]))
    train_set_size = data.shape[0] - (valid_set_size + test_set_size)
    valid_end = train_set_size + valid_set_size

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :]

    x_valid = data[train_set_size:valid_end, :-1, :]
    y_valid = data[train_set_size:valid_end, -1, :]

    x_test = data[valid_end:, :-1, :]
    y_test = data[valid_end:, -1, :]

    return [x_train, y_train, x_valid, y_valid, x_test, y_test]

In [None]:
# Keep only the price columns the model is trained on.
# drop() returns a new frame, so `df` itself keeps all of its columns.
# The columns are named via the `columns=` keyword because passing the axis
# positionally (drop(labels, 1)) raises TypeError on pandas >= 2.0.
df_stock = df.drop(columns=['Date', 'Volume', 'Dividends', 'Stock Splits',
                            'Change', '%Change', 'Result'])

# Remaining column names, in order; column position = feature index later on.
cols = list(df_stock.columns.values)
print('df_stock.columns.values = ', cols)

# Min-max normalize the price columns.
df_stock_norm = normalize_data(df_stock)

# Cut the normalized table into windows of seq_len rows and split them into
# train / validation / test sets.
seq_len = 20  # rows per window; the last row of each window is the target
X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(df_stock_norm, seq_len)

# Report the shape of every split.
for split_name, split_array in (('X_train', X_train), ('y_train', y_train),
                                ('X_valid', X_valid), ('y_valid', y_valid),
                                ('X_test', X_test), ('y_test', y_test)):
    print(split_name + '.shape = ', split_array.shape)

In [None]:
# Plot the four normalized price series against the original dates.
plt.figure(figsize=(15, 5))
for column, colour, legend_name in (('Open', 'red', 'open'),
                                    ('Close', 'green', 'close'),
                                    ('Low', 'blue', 'low'),
                                    ('High', 'black', 'high')):
    plt.plot(df.Date, df_stock_norm[column].values, color=colour, label=legend_name)
plt.title('stock')
plt.xlabel('year')
plt.ylabel('normalized price')
plt.legend(loc='best')
plt.show()

# 4. Model and Validate Data

In [None]:
# Cursor state shared with get_next_batch: the position inside the current
# epoch and a shuffled ordering of the training-sample indices.
index_in_epoch = 0
perm_array = np.arange(X_train.shape[0])
np.random.shuffle(perm_array)


def get_next_batch(batch_size):
    """Return the next mini-batch of training inputs and targets.

    Samples are served in the order given by the module-level ``perm_array``.
    When fewer than ``batch_size`` unseen samples remain, the permutation is
    reshuffled and the batch is taken from its start; the leftover samples of
    the previous ordering are skipped.

    Parameters
    ----------
    batch_size : int
        Number of samples to return.

    Returns
    -------
    tuple
        ``(X_train[picked], y_train[picked])`` for the selected indices.
    """
    global index_in_epoch

    first = index_in_epoch
    last = first + batch_size
    if last > X_train.shape[0]:
        # epoch exhausted: new random order, restart from the front
        np.random.shuffle(perm_array)
        first, last = 0, batch_size
    index_in_epoch = last

    picked = perm_array[first:last]
    return X_train[picked], y_train[picked]

In [None]:
import tensorflow.compat.v1 as tf_v1

# The model below uses the TF1 graph API (placeholders, sessions), so TF2
# behaviour has to be switched off before any graph node is created.
tf_v1.disable_v2_behavior()

# --- dataset sizes ---
train_set_size = X_train.shape[0]
test_set_size = X_test.shape[0]

# --- network shape ---
n_steps = seq_len - 1  # time steps per sample: a window minus its target row
n_inputs = 4           # features per time step
n_outputs = 4          # values predicted per sample
n_neurons = 200        # units per recurrent layer
n_layers = 2           # number of stacked recurrent layers

# --- optimisation ---
learning_rate = 0.001
batch_size = 50
n_epochs = 100

# Start from an empty default graph.
tf_v1.reset_default_graph()

# Graph inputs: a batch of input sequences and the matching target rows.
X = tf_v1.placeholder(tf_v1.float32, [None, n_steps, n_inputs])
y = tf_v1.placeholder(tf_v1.float32, [None, n_outputs])


In [None]:
from tensorflow.keras.layers import Dense
# Recurrent stack: n_layers basic RNN cells with n_neurons units each and
# ELU activation.
#
# Alternatives from the same tf_v1.nn.rnn_cell module (the tf.contrib.rnn
# spellings used in earlier versions of this notebook are gone in TF2):
#   BasicLSTMCell(num_units=n_neurons, activation=tf_v1.nn.elu)
#   LSTMCell(num_units=n_neurons, activation=tf_v1.nn.leaky_relu,
#            use_peepholes=True)
#   GRUCell(num_units=n_neurons, activation=tf_v1.nn.leaky_relu)
layers = [
    tf_v1.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf_v1.nn.elu)
    for _layer_idx in range(n_layers)
]
                                           

In [None]:
# Chain the per-layer cells into one multi-layer recurrent cell and unroll it
# over the time axis of X (X is [batch, n_steps, n_inputs]).
multi_layer_cell = tf_v1.nn.rnn_cell.MultiRNNCell(layers)
rnn_outputs, states = tf_v1.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

# Flatten to one row of n_neurons values per (sample, time step) so that a
# single dense layer can map every step's output to n_outputs values.
stacked_rnn_outputs = tf_v1.reshape(rnn_outputs, [-1, n_neurons]) 
stacked_outputs = tf_v1.layers.dense(stacked_rnn_outputs, n_outputs)
# Restore the [batch, n_steps, n_outputs] layout.
outputs = tf_v1.reshape(stacked_outputs, [-1, n_steps, n_outputs])
outputs = outputs[:,n_steps-1,:] # keep only last output of sequence
                                              

In [None]:
# Loss: mean squared error between the last-step prediction and the target.
loss = tf_v1.reduce_mean(tf_v1.square(outputs - y))
optimizer = tf_v1.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

# Total number of mini-batch updates for n_epochs passes over the data.
n_iterations = int(n_epochs * train_set_size / batch_size)
# Log every 5 epochs. Clamped to at least 1: for a training set smaller than
# batch_size / 5 the unclamped value is 0 and the modulo below would raise
# ZeroDivisionError.
report_every = max(1, int(5 * train_set_size / batch_size))

# run graph
with tf_v1.Session() as sess:
    sess.run(tf_v1.global_variables_initializer())
    for iteration in range(n_iterations):
        X_batch, y_batch = get_next_batch(batch_size)  # fetch the next training batch
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % report_every == 0:
            # evaluate on the full train and validation sets
            mse_train = loss.eval(feed_dict={X: X_train, y: y_train})
            mse_valid = loss.eval(feed_dict={X: X_valid, y: y_valid})
            print('%.2f epochs: MSE train/valid = %.6f/%.6f'%(
                iteration*batch_size/train_set_size, mse_train, mse_valid))

    # Predictions are computed while the session (and with it the trained
    # variable values) is still open.
    y_train_pred = sess.run(outputs, feed_dict={X: X_train})
    y_valid_pred = sess.run(outputs, feed_dict={X: X_valid})
    y_test_pred = sess.run(outputs, feed_dict={X: X_test})

# 5. Predictions

In [None]:
# Inspect the shape of the training-set predictions
# (one row per training sample, one column per predicted value).
y_train_pred.shape

In [None]:
ft = 0 # 0 = open, 1 = highest, 2 = lowest, 3 = close

## show predictions

# x positions of the three splits on a common sample axis: train first, then
# validation, then test. The test range is derived from the validation size,
# so the plot also works when the validation and test splits differ in length.
n_train = y_train.shape[0]
n_valid = y_valid.shape[0]
n_test = y_test.shape[0]
train_days = np.arange(n_train)
valid_days = np.arange(n_train, n_train + n_valid)
test_days = np.arange(n_train + n_valid, n_train + n_valid + n_test)

plt.figure(figsize=(15, 5))

# left panel: targets and predictions for all three splits
plt.subplot(1,2,1)
plt.plot(train_days, y_train[:,ft], color='blue', label='train target')
plt.plot(valid_days, y_valid[:,ft], color='gray', label='valid target')
plt.plot(test_days, y_test[:,ft], color='black', label='test target')
plt.plot(train_days, y_train_pred[:,ft], color='red', label='train prediction')
plt.plot(valid_days, y_valid_pred[:,ft], color='orange', label='valid prediction')
plt.plot(test_days, y_test_pred[:,ft], color='green', label='test prediction')
plt.title('past and future stock prices')
plt.xlabel('days [2014-7-18]')
plt.ylabel('normalized price')
plt.legend(loc='best')

# right panel: zoom on the test split, at the same x positions as on the left
plt.subplot(1,2,2)
plt.plot(test_days, y_test[:,ft], color='black', label='test target')
plt.plot(test_days, y_test_pred[:,ft], color='green', label='test prediction')
plt.title('future stock prices')
plt.xlabel('days [2014-7-18]')
plt.ylabel('normalized price')
plt.legend(loc='best');

In [None]:
# Directional accuracy: the share of samples for which the predicted
# (close - open) has the same sign as the target (close - open).
#
# The column positions are looked up by name. With the column order used in
# this notebook (0 = open, 1 = highest, 2 = lowest, 3 = close) the previously
# hard-coded indices 1 and 0 compared high - open, not close - open.
#
# NOTE(review): every column is min-max normalized on its own, so the sign of
# the normalized difference need not match the sign of the raw price change.
open_idx = cols.index('Open')
close_idx = cols.index('Close')


def _sign_accuracy(y_true, y_pred):
    """Return the fraction of rows where close - open has the same sign."""
    true_sign = np.sign(y_true[:, close_idx] - y_true[:, open_idx])
    pred_sign = np.sign(y_pred[:, close_idx] - y_pred[:, open_idx])
    return np.sum(np.equal(true_sign, pred_sign).astype(int)) / y_true.shape[0]


corr_price_development_train = _sign_accuracy(y_train, y_train_pred)
corr_price_development_valid = _sign_accuracy(y_valid, y_valid_pred)
corr_price_development_test = _sign_accuracy(y_test, y_test_pred)

print('correct sign prediction for close - open price for train/valid/test: %.2f/%.2f/%.2f'%(
    corr_price_development_train, corr_price_development_valid, corr_price_development_test))