In [1]:
import numpy as np
import get
import matplotlib.pyplot as plt
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.losses import MeanSquaredError

In [2]:
import numpy

In [3]:
# Start a small local dask cluster: two workers with two threads each,
# and a 1GB memory cap per worker.
client = Client(
    n_workers=2,
    threads_per_worker=2,
    memory_limit='1GB',
)
client  # bare last expression so the notebook renders the cluster summary

0,1
Client  Scheduler: tcp://127.0.0.1:33219  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB


# Before We Get Started

First, we need some basic helper functions and the daily S&P data from the past several years.

In [4]:
# Getting the S&P data is pretty straightforward: ask the `get` helper for the
# full daily SPY history (presumably a pandas DataFrame, since it is handed to
# dd.from_pandas below -- confirm against the helper).
spy_daily = get.daily('SPY', outputsize='full')

# Moving to dask so things move faster; the partition count is simply the
# number of columns in the fetched frame.
SPY = dd.from_pandas(spy_daily, npartitions=len(spy_daily.columns))

In [5]:
# Take a quick look at the first few rows to see which columns we have to work with
SPY.head()

Unnamed: 0,timestamp,open,high,low,close,adjusted_close,volume,dividend_amount,split_coefficient
0,2020-09-11,335.82,336.97,331.0,334.06,334.06,84680194,0.0,1.0
1,2020-09-10,341.82,342.53,332.85,333.89,333.89,90569548,0.0,1.0
2,2020-09-09,337.55,342.46,336.61,339.79,339.79,91462290,0.0,1.0
3,2020-09-08,336.71,342.64,332.88,333.21,333.21,114465322,0.0,1.0
4,2020-09-04,346.13,347.83,334.87,342.57,342.57,139156281,0.0,1.0


In [6]:
# Let's see when the history starts so we get a better grip on what we have to
# work with. I'd like to use '89, the dot-bomb crash, '08, and '18.
#
# The rows arrive newest-first, so the row with the LARGEST index label is the
# OLDEST trading day (despite the `max_date` name).
#
# Use dask's own reduction for the largest label: the builtin max() would walk
# the whole index element by element on the client.
last_label = SPY.index.max().compute()
max_date = SPY.loc[SPY.index == last_label].compute()
print(max_date.timestamp)

5249    1999-11-01
Name: timestamp, dtype: object


This means we will be able to trade on all of those crashes except '89. I was able to figure out which dates to use with another application.
* Dot-Bomb: 8-28-2000 - 7-9-2007
* '08: 10-1-2007 - 2-4-2013
* '18: 9-24-2018 - 4-29-2019


In [7]:
# Let's make this less code: list each period once as a (start, end) pair of
# ISO dates and expand the pairs into the dicts the rest of the notebook reads.
_crash_spans = (
    ('2000-08-28', '2007-07-09'),  # dot-bomb
    ('2007-10-01', '2013-02-04'),  # '08
    ('2018-09-24', '2019-04-29'),  # '18
    ('2020-03-17', '2020-03-30'),  # Testing data. Life is easier if I just process it now
)
date_ranges = [
    {'start': first_day, 'end': last_day}
    for first_day, last_day in _crash_spans
]

In [8]:
# Build one supervised dataset per date range: sliding 5-row windows of the
# scaled open/high/low/volume columns, each paired with a scaled adjusted close.
crash_data = []
for crash in date_ranges:
    # Keep only the rows inside [start, end] and index them by timestamp.
    in_range = (SPY.timestamp >= crash['start']) & (SPY.timestamp <= crash['end'])
    selected = SPY[in_range]
    selected = selected.set_index(selected.timestamp)

    # Select the data we want. Column order matters below: columns 0-3 are the
    # inputs and column 4 (adjusted_close) is the target.
    wanted_cols = ['open', 'high', 'low', 'volume', 'adjusted_close']
    raw_values = selected[wanted_cols].to_dask_array().compute()

    # A fresh scaler is fitted for every range, so each range is squeezed into
    # 0-1 independently of the others.
    sc = MinMaxScaler(feature_range=(0, 1))
    scaled_SPY = sc.fit_transform(raw_values)

    # Timestep of 5 days: the window covers rows i-5 .. i-1.
    # NOTE(review): the label is taken from row i+1, so row i is skipped --
    # confirm that this two-rows-ahead target is intended.
    anchor_rows = range(5, scaled_SPY.shape[0] - 1)
    x_t = np.array([scaled_SPY[i - 5:i, 0:4] for i in anchor_rows])
    y_t = np.array([scaled_SPY[i + 1, 4] for i in anchor_rows])
    print(x_t.shape)
    print(y_t.shape)
    crash_data.append({'x_t': x_t, 'y_t': y_t})

# The last date range is the testing period; pull it out of the training list.
test_data = crash_data.pop()

(1717, 5, 4)
(1717,)
(1340, 5, 4)
(1340,)
(143, 5, 4)
(143,)
(4, 5, 4)
(4,)


In [9]:
%matplotlib qt
#check that I did that right
for crash in crash_data:
    plt.figure()
    plt.plot(crash['x_t'][0])
    plt.plot(crash['y_t'])
    plt.title('SPY Behavior')
    plt.ylabel('Price (USD)')
    plt.xlabel('Days')
    plt.legend(['Open', 'High'])

Time to create the predictor. This architecture took quite a bit of fiddling, but it finally works. The h5 file should be in this repo if you would like to use the network yourself.

In [10]:
# LSTM regressor: takes a window of 5 timesteps x 4 features and emits a single
# sigmoid-squashed value (the target was min-max scaled into 0-1).
layer_stack = [
    LSTM(50, input_shape=(5, 4)),
    Dropout(0.2),
    Dense(50, activation='sigmoid', use_bias=False),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layers=layer_stack)

In [11]:
# Train with the Adam optimizer against a mean-squared-error objective.
model.compile(
    optimizer='Adam',
    loss=MeanSquaredError(),
)

In [12]:
# NOTE(review): the first layer already declares input_shape, so this explicit
# build is presumably redundant -- confirm before removing.
model.build()

In [None]:
# Cycle through the training ranges over and over, fitting the same model on
# each in turn. In total this asks for N_SWEEPS * len(crash_data) *
# EPOCHS_PER_FIT epochs, so expect to interrupt it by hand.
N_SWEEPS = 1000
EPOCHS_PER_FIT = 200
for sweep in range(N_SWEEPS):
    for crash_set in crash_data:
        model.fit(
            crash_set['x_t'],
            crash_set['y_t'],
            verbose=2,
            epochs=EPOCHS_PER_FIT,
        )

Epoch 1/200
54/54 - 0s - loss: 0.0468
Epoch 2/200
54/54 - 0s - loss: 0.0147
Epoch 3/200
54/54 - 0s - loss: 0.0095
Epoch 4/200
54/54 - 0s - loss: 0.0073
Epoch 5/200
54/54 - 0s - loss: 0.0064
Epoch 6/200
54/54 - 0s - loss: 0.0047
Epoch 7/200
54/54 - 0s - loss: 0.0039
Epoch 8/200
54/54 - 0s - loss: 0.0037
Epoch 9/200
54/54 - 0s - loss: 0.0035
Epoch 10/200
54/54 - 0s - loss: 0.0030
Epoch 11/200
54/54 - 0s - loss: 0.0030
Epoch 12/200
54/54 - 0s - loss: 0.0028
Epoch 13/200
54/54 - 0s - loss: 0.0028
Epoch 14/200
54/54 - 0s - loss: 0.0028
Epoch 15/200
54/54 - 0s - loss: 0.0029
Epoch 16/200
54/54 - 0s - loss: 0.0026
Epoch 17/200
54/54 - 0s - loss: 0.0026
Epoch 18/200
54/54 - 0s - loss: 0.0024
Epoch 19/200
54/54 - 0s - loss: 0.0025
Epoch 20/200
54/54 - 0s - loss: 0.0024
Epoch 21/200
54/54 - 0s - loss: 0.0024
Epoch 22/200
54/54 - 0s - loss: 0.0024
Epoch 23/200
54/54 - 0s - loss: 0.0025
Epoch 24/200
54/54 - 0s - loss: 0.0024
Epoch 25/200
54/54 - 0s - loss: 0.0024
Epoch 26/200
54/54 - 0s - loss: 0.

So what should the price be today?

# Using this model:
    Originally this model worked during the crash; however, I just redid it to make the code look a little nicer.
    As this model aimed to predict the V, the best move would likely be to run the model over the days when the V became noticeable,
    that is, 3-17-2020 through 3-30-2020.

In [None]:
#We already pulled this data from above

In [None]:
# Unpack the held-out period into its input windows and matching targets.
test_x, test_y = test_data['x_t'], test_data['y_t']

In [None]:
# Run the network over the held-out input windows; the outputs stay on the
# same 0-1 scale as the training targets because of the final sigmoid layer.
pred = model.predict(test_x)

In [None]:
# Overlay the network's predictions on the actual (scaled) adjusted close for
# the held-out period.
plt.figure()
plt.plot(pred)
plt.plot(test_y)
plt.legend(['Prediction', 'Actual'])
# The horizontal axis is the sample (day) index and the vertical axis is the
# scaled price; the two labels were previously attached to the wrong axes.
plt.xlabel('Day')
plt.ylabel('Price (scaled on 0-1 for better training)')

# Pretty close, enough to predict that there would be a V


Using this notebook during the crash allowed me to figure out when the market was about to rebound.