In [67]:
%matplotlib qt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits import mplot3d
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import time

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


## Define Functions

In [123]:
def getLogReturns(dataList):
    """Return the consecutive log returns of a price sequence.

    Element ``k`` of the result is ``log(dataList[k + 1]) - log(dataList[k])``,
    so the result has one element fewer than the input (and is empty for
    inputs of length 0 or 1).

    Parameters
    ----------
    dataList : sequence of numbers
        Indexable price series.

    Returns
    -------
    list
        The log returns, in input order.
    """
    return [
        np.log(dataList[k + 1]) - np.log(dataList[k])
        for k in range(len(dataList) - 1)
    ]

def pricesToLogReturns(data):
    """Convert a table of prices into a table of log returns.

    Each column is treated as one price series; row ``t`` of the result holds
    ``log(price[t + 1]) - log(price[t])`` for every column.

    The computation is vectorised (one ``np.log`` and one ``np.diff`` over the
    whole table) instead of looping over every price in Python.

    Parameters
    ----------
    data : pandas.DataFrame
        Prices, one row per time step and one column per series.

    Returns
    -------
    pandas.DataFrame
        Log returns with one row fewer than ``data``. Rows and columns carry
        default integer labels (the input's column names are not kept).
    """
    log_prices = np.log(np.asarray(data, dtype=float))
    # Difference along the time axis (rows): log(p[t+1]) - log(p[t])
    return pd.DataFrame(np.diff(log_prices, axis=0))

def getPrincipleComponents(componentCount, inputData):
    """Fit a PCA on ``inputData`` and return the data projected onto it.

    Parameters
    ----------
    componentCount
        Passed to ``PCA`` as ``n_components``.
    inputData
        Data handed to ``PCA.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        The output of ``fit_transform`` wrapped in a DataFrame with default
        integer labels.
    """
    projected = PCA(n_components=componentCount).fit_transform(inputData)
    return pd.DataFrame(data=projected)

def plotPCData(pcData, ax=None):
    """Draw every column of ``pcData`` as its own line on a single set of axes.

    Parameters
    ----------
    pcData : pandas.DataFrame
        Table whose columns are plotted one by one.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new 9x6 inch figure is created, which
        is the behaviour existing callers rely on.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on, so callers can add titles and labels
        without going through pyplot's "current figure" state.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(9, 6))
    for col in range(pcData.shape[1]):
        ax.plot(pcData.iloc[:, col])
    return ax


## Training on PCA

In [145]:
plt.close('all')

# Import data
num_comp = 5 # Choose number of PCA components
raw = pd.read_csv('data_stocks.csv')
testSet = raw.iloc[0:1000, 1:] # First 1000 rows; get rid of date column
testSet = pricesToLogReturns(testSet)
testSet = testSet.values

# Calculate PCA components from the predictor columns only. Column 0 is the
# regression target (S&P 500 according to the original note), so it must not
# contribute to the features that are later used to predict it.
# NOTE(review): the PCA is still fitted on every row, including the rows that
# later become the test/validation splits -- consider fitting on the training
# rows only.
temp1 = getPrincipleComponents(num_comp, testSet[:, 1:])

# Combine the target (column 0 of the log returns) with the PCA components
data = np.zeros((temp1.shape[0], num_comp+1))
data[:, 0] = testSet[:, 0]
data[:, 1:] = temp1.values

# plotPCData opens its own figure, so no figure is created here (doing so left
# an empty window behind); grab the figure it drew on and label that one.
plotPCData(temp1)
fig3 = plt.gcf()
ax3 = plt.gca()
ax3.set_title("PCA Components vs. Time")
ax3.set_xlabel("Time Index")
ax3.set_ylabel("Log Return Value")
plt.show()

# Dimensions of dataset
n = data.shape[0]
p = data.shape[1]


In [146]:
# NOTE(review): this cell is a line-for-line copy of the training cell of the
# "all data" experiment; consider extracting a shared train-and-plot function.

# Chronological train / test / validation split (70% / 20% / 10%).
# The bounds are half-open, so every split starts exactly where the previous
# one ends (the former "+ 1" offsets silently dropped one row per boundary).
train_start = 0
train_end = int(np.floor(0.7*n))
test_start = train_end
test_end = int(np.floor(0.9*n))
valid_start = test_end
valid_end = n

data_train = data[train_start:train_end, :]
data_test = data[test_start:test_end, :]
data_valid = data[valid_start:valid_end, :]

# Scale every split with the statistics learned from the training split only
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)
data_valid = scaler.transform(data_valid)  # previously left unscaled

# Build X and y: column 0 is the target, the remaining columns are the inputs
X_train = data_train[:, 1:]
y_train = data_train[:, 0]
X_test = data_test[:, 1:]
y_test = data_test[:, 0]
X_valid = data_valid[:, 1:]
y_valid = data_valid[:, 0]

# Number of input features (columns of X)
n_stocks = X_train.shape[1]

# Neurons per hidden layer
n_neurons_1 = 1024
n_neurons_2 = 512
n_neurons_3 = 256
n_neurons_4 = 128

# Session
# NOTE(review): the session is never closed and the default graph is never
# reset, so re-running this cell keeps adding nodes to the same graph.
net = tf.Session()

# Placeholders: X is (batch, n_stocks), Y is (batch,)
X = tf.placeholder(dtype=tf.float32, shape=[None, n_stocks])
Y = tf.placeholder(dtype=tf.float32, shape=[None])

# Initializers
sigma = 1
weight_initializer = tf.variance_scaling_initializer(mode="fan_avg", distribution="uniform", scale=sigma)
bias_initializer = tf.zeros_initializer()

# Hidden weights
W_hidden_1 = tf.Variable(weight_initializer([n_stocks, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))
W_hidden_3 = tf.Variable(weight_initializer([n_neurons_2, n_neurons_3]))
bias_hidden_3 = tf.Variable(bias_initializer([n_neurons_3]))
W_hidden_4 = tf.Variable(weight_initializer([n_neurons_3, n_neurons_4]))
bias_hidden_4 = tf.Variable(bias_initializer([n_neurons_4]))

# Output weights
W_out = tf.Variable(weight_initializer([n_neurons_4, 1]))
bias_out = tf.Variable(bias_initializer([1]))

# Hidden layers: four fully connected ReLU layers
hidden_1 = tf.nn.relu(tf.add(tf.matmul(X, W_hidden_1), bias_hidden_1))
hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W_hidden_2), bias_hidden_2))
hidden_3 = tf.nn.relu(tf.add(tf.matmul(hidden_2, W_hidden_3), bias_hidden_3))
hidden_4 = tf.nn.relu(tf.add(tf.matmul(hidden_3, W_hidden_4), bias_hidden_4))

# Output layer, transposed from (batch, 1) to (1, batch) so that it lines up
# with the 1-D target Y in the cost function
out = tf.transpose(tf.add(tf.matmul(hidden_4, W_out), bias_out))

# Cost function
mse = tf.reduce_mean(tf.squared_difference(out, Y))

# Optimizer (Adam with its default hyper-parameters)
opt = tf.train.AdamOptimizer().minimize(mse)

# Init
net.run(tf.global_variables_initializer())

# Setup live plot; the second line is a placeholder that is overwritten with
# the network's predictions while training runs
plt.ion()
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
line1, = ax1.plot(y_test, label="Actual Test Data")
line2, = ax1.plot(y_test * 0.5, label="Predicted Test Data")
ax1.set_xlabel("Index")
ax1.set_ylabel("Log Return Value")
ax1.legend(loc="upper left")
plt.show()

# Fit neural net
batch_size = 256
mse_train = []
mse_test = []
mse_valid = []

# Run
epochs = 100
counter = 0  # number of progress reports made so far
for e in range(epochs):

    # Shuffle training data
    shuffle_indices = np.random.permutation(len(y_train))
    X_train = X_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    # Minibatch training (a trailing partial batch is skipped)
    for i in range(len(y_train) // batch_size):
        start = i * batch_size
        batch_x = X_train[start:start + batch_size]
        batch_y = y_train[start:start + batch_size]
        # Run optimizer with batch
        net.run(opt, feed_dict={X: batch_x, Y: batch_y})

        # Show progress every 50th batch
        if i % 50 == 0:
            counter += 1
            # MSE on all three splits
            mse_train.append(net.run(mse, feed_dict={X: X_train, Y: y_train}))
            mse_test.append(net.run(mse, feed_dict={X: X_test, Y: y_test}))
            mse_valid.append(net.run(mse, feed_dict={X: X_valid, Y: y_valid}))
            # Prediction; `out` is (1, n_samples), flatten it for plotting
            pred = net.run(out, feed_dict={X: X_test}).ravel()

            if counter == 1:
                initial_pred = pred

            line2.set_ydata(pred)
            # Keep the predictions inside the visible range
            ax1.relim()
            ax1.autoscale_view()
            ax1.set_title('Epoch ' + str(e) + ', Batch ' + str(i))
            plt.pause(0.01)

plt.ioff()

# Predictions of the fully trained network. These used to be captured inside
# the loop, only when the report counter happened to equal `epochs`, and via
# an undefined name (`y_valid_pred`), which raised a NameError.
final_pred = net.run(out, feed_dict={X: X_test}).ravel()
final_pred_valid = net.run(out, feed_dict={X: X_valid}).ravel()

fig4, ax4 = plt.subplots()
ax4.plot(y_test, label="Actual Test Data")
ax4.plot(initial_pred, label="Predicted Test Data")
ax4.set_title("Predicted Test Data vs. Actual Test Data (Initial Plot)")
ax4.set_xlabel("Index")
ax4.set_ylabel("Log Return Value")
ax4.legend(loc="upper left")
plt.show()

fig5, ax5 = plt.subplots()
ax5.plot(y_test, label="Actual Test Data")
ax5.plot(final_pred, label="Predicted Test Data")
ax5.set_title("Predicted Test Data vs. Actual Test Data (Final Plot)")
ax5.set_xlabel("Index")
ax5.set_ylabel("Log Return Value")
ax5.legend(loc="upper left")
plt.show()

fig2, ax2 = plt.subplots()
ax2.plot(mse_train, label="Training Error")
ax2.plot(mse_test, label="Testing Error")
ax2.set_title("Training and Testing Error")
ax2.set_xlabel("Index")
ax2.set_ylabel("Mean Squared Error")
ax2.legend(loc="upper left")
plt.show()


## Training on all data (500 stocks)

In [153]:
plt.close('all')

# Import data
raw = pd.read_csv('data_stocks.csv')
testSet = raw.iloc[0:1000, 1:]  # First 1000 rows, leading column dropped
log_returns = pricesToLogReturns(testSet)

# plotPCData opens its own figure, so no figure is created here (doing so left
# an empty window behind); grab the figure it drew on and label that one.
plotPCData(log_returns)
fig3 = plt.gcf()
ax3 = plt.gca()
ax3.set_title("All 500 Stocks vs. Time")
ax3.set_xlabel("Time Index")
ax3.set_ylabel("Log Return Value")
plt.show()

# Dimensions of dataset
n = log_returns.shape[0]
p = log_returns.shape[1]

# The training cell works on the plain array
data = log_returns.values


In [154]:
# NOTE(review): this cell is a line-for-line copy of the training cell of the
# PCA experiment; consider extracting a shared train-and-plot function.

# Chronological train / test / validation split (70% / 20% / 10%).
# The bounds are half-open, so every split starts exactly where the previous
# one ends (the former "+ 1" offsets silently dropped one row per boundary).
train_start = 0
train_end = int(np.floor(0.7*n))
test_start = train_end
test_end = int(np.floor(0.9*n))
valid_start = test_end
valid_end = n

data_train = data[train_start:train_end, :]
data_test = data[test_start:test_end, :]
data_valid = data[valid_start:valid_end, :]

# Scale every split with the statistics learned from the training split only
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)
data_valid = scaler.transform(data_valid)  # previously left unscaled

# Build X and y: column 0 is the target, the remaining columns are the inputs
X_train = data_train[:, 1:]
y_train = data_train[:, 0]
X_test = data_test[:, 1:]
y_test = data_test[:, 0]
X_valid = data_valid[:, 1:]
y_valid = data_valid[:, 0]

# Number of stocks in training data (columns of X)
n_stocks = X_train.shape[1]

# Neurons per hidden layer
n_neurons_1 = 1024
n_neurons_2 = 512
n_neurons_3 = 256
n_neurons_4 = 128

# Session
# NOTE(review): the session is never closed and the default graph is never
# reset, so re-running this cell keeps adding nodes to the same graph.
net = tf.Session()

# Placeholders: X is (batch, n_stocks), Y is (batch,)
X = tf.placeholder(dtype=tf.float32, shape=[None, n_stocks])
Y = tf.placeholder(dtype=tf.float32, shape=[None])

# Initializers
sigma = 1
weight_initializer = tf.variance_scaling_initializer(mode="fan_avg", distribution="uniform", scale=sigma)
bias_initializer = tf.zeros_initializer()

# Hidden weights
W_hidden_1 = tf.Variable(weight_initializer([n_stocks, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))
W_hidden_3 = tf.Variable(weight_initializer([n_neurons_2, n_neurons_3]))
bias_hidden_3 = tf.Variable(bias_initializer([n_neurons_3]))
W_hidden_4 = tf.Variable(weight_initializer([n_neurons_3, n_neurons_4]))
bias_hidden_4 = tf.Variable(bias_initializer([n_neurons_4]))

# Output weights
W_out = tf.Variable(weight_initializer([n_neurons_4, 1]))
bias_out = tf.Variable(bias_initializer([1]))

# Hidden layers: four fully connected ReLU layers
hidden_1 = tf.nn.relu(tf.add(tf.matmul(X, W_hidden_1), bias_hidden_1))
hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W_hidden_2), bias_hidden_2))
hidden_3 = tf.nn.relu(tf.add(tf.matmul(hidden_2, W_hidden_3), bias_hidden_3))
hidden_4 = tf.nn.relu(tf.add(tf.matmul(hidden_3, W_hidden_4), bias_hidden_4))

# Output layer, transposed from (batch, 1) to (1, batch) so that it lines up
# with the 1-D target Y in the cost function
out = tf.transpose(tf.add(tf.matmul(hidden_4, W_out), bias_out))

# Cost function
mse = tf.reduce_mean(tf.squared_difference(out, Y))

# Optimizer (Adam with its default hyper-parameters)
opt = tf.train.AdamOptimizer().minimize(mse)

# Init
net.run(tf.global_variables_initializer())

# Setup live plot; the second line is a placeholder that is overwritten with
# the network's predictions while training runs
plt.ion()
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
line1, = ax1.plot(y_test, label="Actual Test Data")
line2, = ax1.plot(y_test * 0.5, label="Predicted Test Data")
ax1.set_xlabel("Index")
ax1.set_ylabel("Log Return Value")
ax1.legend(loc="upper left")
plt.show()

# Fit neural net
batch_size = 256
mse_train = []
mse_test = []
mse_valid = []

# Run
epochs = 100
counter = 0  # number of progress reports made so far
for e in range(epochs):

    # Shuffle training data
    shuffle_indices = np.random.permutation(len(y_train))
    X_train = X_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    # Minibatch training (a trailing partial batch is skipped)
    for i in range(len(y_train) // batch_size):
        start = i * batch_size
        batch_x = X_train[start:start + batch_size]
        batch_y = y_train[start:start + batch_size]
        # Run optimizer with batch
        net.run(opt, feed_dict={X: batch_x, Y: batch_y})

        # Show progress every 50th batch
        if i % 50 == 0:
            counter += 1
            # MSE on all three splits
            mse_train.append(net.run(mse, feed_dict={X: X_train, Y: y_train}))
            mse_test.append(net.run(mse, feed_dict={X: X_test, Y: y_test}))
            mse_valid.append(net.run(mse, feed_dict={X: X_valid, Y: y_valid}))
            # Prediction; `out` is (1, n_samples), flatten it for plotting
            pred = net.run(out, feed_dict={X: X_test}).ravel()

            if counter == 1:
                initial_pred = pred

            line2.set_ydata(pred)
            # Keep the predictions inside the visible range
            ax1.relim()
            ax1.autoscale_view()
            ax1.set_title('Epoch ' + str(e) + ', Batch ' + str(i))
            plt.pause(0.01)

plt.ioff()

# Predictions of the fully trained network. These used to be captured inside
# the loop, only when the report counter happened to equal `epochs`, and via
# an undefined name (`y_valid_pred`), which raised a NameError.
final_pred = net.run(out, feed_dict={X: X_test}).ravel()
final_pred_valid = net.run(out, feed_dict={X: X_valid}).ravel()

fig4, ax4 = plt.subplots()
ax4.plot(y_test, label="Actual Test Data")
ax4.plot(initial_pred, label="Predicted Test Data")
ax4.set_title("Predicted Test Data vs. Actual Test Data (Initial Plot)")
ax4.set_xlabel("Index")
ax4.set_ylabel("Log Return Value")
ax4.legend(loc="upper left")
plt.show()

fig5, ax5 = plt.subplots()
ax5.plot(y_test, label="Actual Test Data")
ax5.plot(final_pred, label="Predicted Test Data")
ax5.set_title("Predicted Test Data vs. Actual Test Data (Final Plot)")
ax5.set_xlabel("Index")
ax5.set_ylabel("Log Return Value")
ax5.legend(loc="upper left")
plt.show()

fig2, ax2 = plt.subplots()
ax2.plot(mse_train, label="Training Error")
ax2.plot(mse_test, label="Testing Error")
ax2.set_title("Training and Testing Error")
ax2.set_xlabel("Index")
ax2.set_ylabel("Mean Squared Error")
ax2.legend(loc="upper left")
plt.show()
