# RNN Vector Version

### Step 1: Import all the packages needed

In [1]:
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib as mp
import argparse
import os, sys
import csv
import math
import time
import matplotlib.pyplot as pl

#### Step 2.1: setting all global parameters -- sec 1 data parameters

In [2]:
# Wall-clock start; the last cell of the notebook reports the elapsed time.
time1 = time.time()
# CSV holding the raw series (read in the data-loading step)
data_path = './input.csv'
# Train/test split, counted in days. test_days is derived so that the three
# numbers can never drift out of sync when one of them is edited.
total_days = 350
train_days = 280
test_days = total_days - train_days
# Placeholder; overwritten with the real series length once the data is loaded
data_length = 0

#### Step 2.2: setting all global parameters -- sec 2 network configuration

In [3]:
num_epoches = 10   # number of training iterations (one random minibatch each)
n_steps = 4*7      # sequence length: past days fed to the RNN for one prediction
batch_size = 70    # samples per minibatch
feature_size = 48  # width of one time step (one row/day of the reshaped series)
n_hidden = 10      # LSTM units per layer
num_layers = 2     # number of stacked LSTM layers
# The network predicts a full feature vector, so the output width has to
# match the width of the target rows.
n_output = feature_size

### Step 3: Loading data

In [4]:
dataframe = pd.read_csv(data_path)
dat = np.array(dataframe)
date_list = dat[:,1]
# Drop the first two cols (index and date) and force a numeric dtype: a frame
# that mixes a date column with numbers may otherwise convert to an object array.
dat = dat[:,2:].astype(float)
nrows,ncols = dat.shape
# Flatten row by row into one long series of shape (1, nrows*ncols)
data = dat.reshape((1,nrows*ncols))
data_length = data.shape[1]

train_len = train_days*feature_size
test_len = test_days*feature_size
# Fail early with a readable message instead of an opaque reshape/broadcast error
if data_length < train_len+test_len:
    raise ValueError(
        "input holds {} points but {} are required ({} train + {} test days of {} points)".format(
            data_length, train_len+test_len, train_days, test_days, feature_size))

# construct training data: one row per day, feature_size points per row
train_data = data[0,0:train_len]
train_data = train_data.reshape([train_days,feature_size])

# construct testing data
## test size = input_size + test days size: the first test target needs the
## n_steps rows before it as input, so the last n_steps training rows are
## prepended as context.
context_len = n_steps*feature_size
test_data = np.zeros(test_len+context_len)
test_data[context_len:] = data[0,train_len:train_len+test_len]
test_data[0:context_len] = data[0,train_len-context_len:train_len]
test_data = test_data.reshape([test_days+n_steps,feature_size])
print("train data shape: {}, test data shape: {}".format(train_data.shape,test_data.shape))

train data shape: (280, 48), test data shape: (98, 48)


In [5]:
# check data: overlay the first test_days rows of the training set on the test
# period. Sizes come from the configuration instead of hard-coded 48/70/28/98,
# so the plot stays correct when the settings change.
xxx = np.arange(0,feature_size*test_days)
pl.plot(xxx,train_data[0:test_days,:].reshape([-1,1]),label = "train",color = "red")
# test_data starts with n_steps context rows taken from the training set; skip them
pl.plot(xxx,test_data[n_steps:n_steps+test_days,:].reshape([-1,1]),label = "test",color = "purple")
pl.xlabel("time step")
pl.title("Training sample vs. test period")
pl.grid()
pl.legend()
pl.show()

### Step 4: define data generating function code. 
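The functions below generate one batch of `n_batch` sequence samples. Each sample is a time series of `steps` consecutive time steps, where every time step is a `feature_size`-dimensional floating-point vector; the target of a sample is the vector of the time step that immediately follows its window. Inputs and outputs are: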

inputs:
----n_batch: number of samples in a batch
----steps: the sequence length (number of time steps) of each sample
----feature_size: dimension of the data at a single time step (read from the global setting, not passed as an argument)

outputs:
----X inputs, shape (n_batch, steps, feature_size)
----Y targets, shape (n_batch, feature_size)

In [6]:
def train_data_gen(steps = 28, n_batch = 70):
    """Draw a random training minibatch from the global `train_data`.

    For every sample a target row is picked uniformly at random from
    [steps, train_days); the `steps` rows preceding it form the input window.

    Args:
        steps: number of consecutive rows in each input window.
        n_batch: number of samples to draw.

    Returns:
        Tuple (X, Y): X has shape (n_batch, steps, feature_size) and Y has
        shape (n_batch, feature_size).
    """
    batch_x = np.zeros((n_batch,steps,feature_size))
    batch_y = np.zeros((n_batch,feature_size))
    for row in range(n_batch):
        # the lower bound `steps` guarantees a full window before the target
        target_day = np.random.randint(steps,train_days)
        window_start = target_day-steps
        batch_y[row] = train_data[target_day,:]
        batch_x[row,:,:] = train_data[window_start:target_day,:]
    return (batch_x,batch_y)

In [7]:
def test_data_gen(steps = 28, n_batch = 70):
    X = np.zeros((n_batch,steps,feature_size))
    Y = np.zeros((n_batch,feature_size))
    #for each n, compute X and correct y values
    for n in range(n_batch):
        # update y
        Y[n] = test_data[steps+n,:]
        # update X from index-steps to index-1
        X[n,:,:] = test_data[n:n+steps,:]
    return (X,Y)

### Step 5: construct RNN model

In [9]:
# Graph inputs:
#   x      - batch of input windows, (batch, n_steps, feature_size)
#   istate - initial state of the stacked LSTM, num_layers*2*n_hidden wide
#            (presumably cell state + hidden output per layer -- confirm
#            against the rnn_cell version in use)
#   y      - target vectors, (batch, n_output)
x = tf.placeholder("float", shape=[None, n_steps, feature_size])
istate = tf.placeholder("float", shape=[None, num_layers*2*n_hidden])
y = tf.placeholder("float", shape=[None, n_output])


# Trainable parameters, randomly initialised:
#   'hidden' projects each input step into the LSTM width,
#   'out' maps the last LSTM output to the prediction.
weights = dict(
    hidden=tf.Variable(tf.random_normal([feature_size, n_hidden])),
    out=tf.Variable(tf.random_normal([n_hidden, n_output])),
)
biases = dict(
    hidden=tf.Variable(tf.random_normal([n_hidden])),
    out=tf.Variable(tf.random_normal([n_output])),
)

In [10]:
def RNN(_X, _istate, _weights, _biases):
    """Build the prediction graph: linear projection -> stacked LSTM -> linear output.

    Args:
        _X: input tensor of shape (batch_size, n_steps, feature_size).
        _istate: initial state passed to the stacked LSTM.
        _weights: dict holding the 'hidden' and 'out' weight matrices.
        _biases: dict holding the 'hidden' and 'out' bias vectors.

    Returns:
        Tensor produced by applying the 'out' linear layer to the LSTM output
        of the last time step.
    """
    # Go time-major, (n_steps, batch_size, feature_size), then collapse the
    # first two axes so a single matmul projects every time step at once.
    time_major = tf.transpose(_X, [1, 0, 2])
    flat_steps = tf.reshape(time_major, [-1, feature_size])
    projected = tf.matmul(flat_steps, _weights['hidden']) + _biases['hidden']

    # num_layers LSTM layers of n_hidden units stacked on top of each other
    lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
    stacked_lstm_cell = rnn_cell.MultiRNNCell([lstm_cell]*num_layers)

    # rnn.rnn consumes a Python list with one (batch_size, n_hidden) tensor per step
    step_inputs = tf.split(0, n_steps, projected)
    outputs, _ = rnn.rnn(stacked_lstm_cell, step_inputs, initial_state=_istate)

    # Only the output of the final time step feeds the prediction layer
    last_output = outputs[-1]
    return tf.matmul(last_output, _weights['out']) + _biases['out']

In [11]:
# Prediction op for the current batch
pred = RNN(x, istate, weights, biases)

# Loss: mean squared error over every element of the batch
cost = tf.reduce_mean(tf.pow(pred - y, 2))

# Parameter-update op. NOTE: despite its name, `optimizer` holds the training
# op returned by minimize(), not the optimizer object itself.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.005, decay=0.3).minimize(cost)

### Step 6: generate validation data

In [12]:
# Fixed evaluation batch covering the whole test period
x_val, y_val = test_data_gen(n_steps, batch_size)
print("test data: x_val shape - {}; y_val shape - {}".format(x_val.shape, y_val.shape))

test data: x_val shape - (70, 28, 48); y_val shape - (70, 48)


### Step 7: run rnn network

In [13]:
### Execute
# Op that initialises every variable in the graph
init = tf.initialize_all_variables()
outp = []  # not used by this cell; kept defined in case another cell refers to it
with tf.Session() as sess:
    # Summary of the loss for TensorBoard, merged into a single op
    tf.scalar_summary("loss", cost)
    merged_summary_op = tf.merge_all_summaries()

    # TensorBoard log writer; logs go to /tmp/tensorflow_logs
    summary_writer = tf.train.SummaryWriter('/tmp/tensorflow_logs', graph_def=sess.graph_def)

    # initialize all variables in the model
    sess.run(init)

    # The LSTM always starts from an all-zero state, and the validation batch
    # never changes, so both are built once instead of on every sess.run call.
    zero_state = np.zeros((batch_size,num_layers*2*n_hidden))
    val_feed = {x:x_val,y:y_val,istate:zero_state}
    try:
        for k in range(num_epoches):
            # One random minibatch per iteration; X already has shape
            # (batch_size, n_steps, feature_size), so no reshape is needed.
            X,Y = train_data_gen(n_steps,batch_size)
            train_feed = {x:X,y:Y,istate:zero_state}

            # perform an update on the parameters
            sess.run(optimizer,feed_dict=train_feed)

            # Metrics are evaluated after the update. Fetching several tensors
            # in one run gives the same values as separate runs, because the
            # parameters do not change in between, and saves forward passes.
            loss1, outp_train = sess.run([cost, pred], feed_dict=train_feed)
            loss2, outp_test, summary_str = sess.run(
                [cost, pred, merged_summary_op], feed_dict=val_feed)

            # Write logs at every iteration
            summary_writer.add_summary(summary_str, k)
            print("Iter " + str(k) + ", Minibatch Loss ---- Train = " + "{:.6f}".format(loss1) + "; Test = " + "{:.6f}".format(loss2))
    finally:
        # Flush and release the event file even if training is interrupted
        summary_writer.close()

Iter 0, Minibatch Loss ---- Train = 1.106178; Test = 1.198850
Iter 1, Minibatch Loss ---- Train = 1.052538; Test = 1.158478
Iter 2, Minibatch Loss ---- Train = 1.002313; Test = 1.109926
Iter 3, Minibatch Loss ---- Train = 0.933537; Test = 1.053651
Iter 4, Minibatch Loss ---- Train = 0.873730; Test = 0.990542
Iter 5, Minibatch Loss ---- Train = 0.804134; Test = 0.926600
Iter 6, Minibatch Loss ---- Train = 0.736910; Test = 0.868168
Iter 7, Minibatch Loss ---- Train = 0.669794; Test = 0.817586
Iter 8, Minibatch Loss ---- Train = 0.640141; Test = 0.775528
Iter 9, Minibatch Loss ---- Train = 0.597042; Test = 0.748017


### Step 8: Evaluation

In [14]:
# Flatten the predictions of the last iteration into a single column
out = np.array(outp_test).reshape([-1, 1])
out.shape

(3360, 1)

In [15]:
# Flatten the targets into a single column that lines up with `out`.
# astype() converts the values; assigning to `.dtype` would instead reinterpret
# the raw bytes in place, which corrupts the data (or raises) whenever the
# array is not already float64.
# NOTE: this rebinds y_val to the flattened shape, so the training cell cannot
# be re-run after this one without regenerating the validation batch.
y_val = y_val.reshape([-1,1]).astype(float)
y_val.shape

(3360, 1)

In [16]:
# 2x2 correlation matrix of predictions vs. targets (each transposed to a
# single row so it is treated as one variable), then its element-wise square
R = np.corrcoef(out.T, y_val.T)
RR = np.square(R)

In [17]:
# final R: correlation matrix between predictions and targets
# (the off-diagonal entry is the correlation coefficient itself)
R

array([[ 1.       ,  0.1195277],
       [ 0.1195277,  1.       ]])

In [18]:
# final R-square: element-wise square of the correlation matrix R
# (the off-diagonal entry is the squared correlation coefficient)
RR

array([[ 1.        ,  0.01428687],
       [ 0.01428687,  1.        ]])

In [19]:
# Predicted vs. actual series over the whole test period
xxx = np.arange(test_len)
pl.plot(xxx, out, color="red", label="predict")
pl.plot(xxx, y_val, color="purple", label="reality")
pl.legend()
pl.grid()
pl.show()

In [20]:
# final MSE: mean of the squared prediction errors
sq = (out - y_val) ** 2
np.mean(sq)

0.748017512806611

In [21]:
# run time: seconds elapsed since time1 was recorded in the parameter cell
time2 = time.time()
print(time2 - time1)

47.5641560555
