In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from datetime import datetime, timedelta

In [2]:
from scipy import stats
from scipy.stats import norm

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Customize matplotlib default settings
matplotlib.rcParams.update({'font.size': 16})
plt.rcParams["figure.figsize"] = (20,10)

In [4]:
# plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

## 1. Get stock data

In [5]:
! ls ../input

google_stock_data.csv


In [6]:
# read in the stock data
# Load the daily GOOGL price/volume history from the CSV in the input directory.
stock_data = pd.read_csv("../input/google_stock_data.csv")

# Preview the first rows to check the columns and value formats.
stock_data.head()

Unnamed: 0,date,open,high,low,close,volume
0,2004-08-19,100.01,104.06,95.96,100.335,44659000.0
1,2004-08-20,101.01,109.08,100.5,108.31,22834300.0
2,2004-08-23,110.76,113.48,109.05,109.4,18256100.0
3,2004-08-24,111.24,111.6,103.57,104.87,15247300.0
4,2004-08-25,104.76,108.0,103.88,106.0,9188600.0


Alpha Vantage metadata for the dataset above:  
{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',
 '2. Symbol': 'GOOGL',
 '3. Last Refreshed': '2019-04-17 16:00:01',
 '4. Output Size': 'Full size',
 '5. Time Zone': 'US/Eastern'}

## 2. Data exploration

In [7]:
stock_data.describe(include='all')

Unnamed: 0,date,open,high,low,close,volume
count,3691,3691.0,3691.0,3691.0,3691.0,3691.0
unique,3691,,,,,
top,2018-10-15,,,,,
freq,1,,,,,
mean,,635.167468,641.086776,628.712554,634.970676,7387645.0
std,,259.306949,260.807566,257.59584,259.319778,8095545.0
min,,99.09,101.74,95.96,100.01,38459.0
25%,,469.795,474.105,464.68,469.975,2091950.0
50%,,580.0,585.0,574.19,580.11,4645500.0
75%,,794.975,799.955,788.355,793.485,9610600.0


In [8]:
print("Date range:", stock_data.date.min(), "to", stock_data.date.max())

Date range: 2004-08-19 to 2019-04-17


Let's visualize the various price measures over time.

In [9]:
# Index the frame by trading date so later cells can filter rows by date string.
# The guard keeps the cell idempotent: once "date" is already the index,
# re-running would otherwise raise a KeyError.
if "date" in stock_data.columns:
    stock_data = stock_data.set_index("date")

In [10]:
# Marker styling shared by the four daily price series.
marker_size = 3
price_colors = {"high": "red", "low": "blue", "open": "#cc33ff", "close": "#8000ff"}


def make_stock_price_trace(price_type):
    """Return a plotly scatter trace for one price column of ``stock_data``.

    ``price_type`` must be a key of ``price_colors`` and a column of
    ``stock_data``; the legend label is coloured to match the markers.
    """
    colour = price_colors[price_type]
    legend_label = "<b><span style='color:" + colour + "'>" + price_type + "</span></b>"
    return go.Scatter(
        x=stock_data.index,
        y=stock_data[price_type],
        mode="markers",
        name=legend_label,
        marker={"color": colour, "size": marker_size},
        text=stock_data.index,
    )


layout = {
    "title": 'GOOGL stock daily prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}

# One trace per price type, in the order the colours are declared.
traces = [make_stock_price_trace(price_type) for price_type in price_colors]

fig = {"data": traces, "layout": layout}
iplot(fig)

The high, low, open and close prices follow each other pretty closely. We'll pick just one of them, the close price, and focus on it in further analyses. 

Another dimension to the data is the daily trading volume. Let's visualize it together with the close price, so that we can see how both volume and price of Google stock have changed over time. 
In the chart below, the grey circle sizes reflect the relative daily trading volume size. The circles are transparent, so that when they overlap on the chart they form darker grey/black blobs, which highlight periods of consistently high trading volumes.

In [11]:
# Hover label per trading day: close price, volume in millions, and the date.
volume_in_millions = (stock_data.volume/1000000).round(2).astype(str)
bubble_hover = ("close: " + stock_data.close.astype(str)
                + "<br>volume: " + volume_in_millions + "M"
                + "<br>date: " + stock_data.index)

# Bubble area is proportional to daily volume; sizeref scales the largest
# bubble to a 30px reference diameter.
bubble_marker = {
    "size": stock_data.volume,
    "sizemode": 'area',
    "sizeref": 2.*max(stock_data.volume)/(30.**2),
    "sizemin": 1,
    "color": 'black',
    "opacity": 0.2,
    "line": {"width": 0},
}

trace_bubble = go.Scatter(
    x=stock_data.index,
    y=stock_data.close,
    mode="markers",
    name="volume",
    marker=bubble_marker,
    hovertext=bubble_hover,
    hoverinfo='text',
)

bubble_title = ('GOOGL stock daily close prices and volumes'
                + '<br><br><span style="font-size: 16px; color: darkgrey">Bubble size = Daily volume</span>')
layout = {
    "title": bubble_title,
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Price'},
    "autosize": False,
    "width": 980,
    "height": 700,
}
fig = {"data": [trace_bubble], "layout": layout}
iplot(fig)

In the chart above, we can see a bird's-eye view of the Google stock price history. In the early days of Google, its stock traded at relatively low prices and high volumes. In 2008-2009 we can see a large dip in price, likely explained by the Great Recession. After that the stock price quickly recovered, but the trading volume gradually decreased.

Then something unusual happened early in 2014 to the stock's price: it instantly dropped by half. On April 3rd 2014, the Alphabet (GOOGL) stock split in a 1998 to 1000 ratio. After the split, for each 1000 shares of GOOGL stock owned pre-split, a shareholder now owned 1998 shares. This is very close to a 2-to-1 split ratio, so we'll use it as an approximation in the analyses here. So, for each 1 share of Google stock owned pre-split, a shareholder now owned 2 shares. ([Source](https://www.stocksplithistory.com/alphabet/))

The Google stock split was first announced in early 2012 ([source](https://www.fool.com/investing/2017/05/10/alphabet-stock-split-will-the-google-parent-ever-s.aspx)), so the market had plenty of time to adjust and incorporate the news about the split into the stock price. 

Stock splits can be done for a variety of reasons. One popular reason is to keep the stock price low, allowing for greater liquidity (i.e. more people are able to buy and trade the stock at the lower prices). In this case, since greater liquidity is a desirable quality of a stock, especially for smaller companies, the market may react to the stock split news with a short-term price rally for the stock. But the Google stock split was allegedly done for a different reason. A class action lawsuit argued that it was done to allow Google's top executives to retain majority voting while selling off some of their stock ([source](https://www.fool.com/investing/2017/05/10/alphabet-stock-split-will-the-google-parent-ever-s.aspx)). Given the large size of the company, it's unlikely that the Google stock split has significantly affected the stock price movements in the long run. We can treat the stock price drop on the day of the split as an outlier, and exclude it from the data by dividing the pre-split prices by two.


**Let's adjust the close prices for the stock split on April 3rd, 2014**  

First, let's take a look at what the stock split looks like in the raw data.

In [12]:
#stock_data.head()
# The split for GOOGL took place on April 03, 2014. (source: https://www.stocksplithistory.com/alphabet/)
stock_data.loc[stock_data.index < '2014-04-05'].tail()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-03-31,1130.4,1135.0,1112.85,1114.51,3880700.0
2014-04-01,1120.27,1137.5,1118.0,1134.89,2181000.0
2014-04-02,1141.9,1144.8,1124.0,1135.1,4168000.0
2014-04-03,573.39,588.3,566.01,571.5,4018300.0
2014-04-04,578.55,579.7173,544.494,545.25,5363700.0


In [13]:
# The 2014-04-03 split (1998:1000) is approximated as 2-for-1: halve pre-split
# prices and double pre-split volumes so the whole series is on post-split terms.
SPLIT_DATE = '2014-04-03'
SPLIT_RATIO = 2
pre_split = stock_data.index < SPLIT_DATE  # boolean mask over the date-string index

# Assign through a single .loc[rows, column] call. The chained form
# df[col].loc[mask] = ... may write to a temporary copy and is a silent no-op
# under pandas copy-on-write.
stock_data["close_split_adj"] = stock_data["close"]
stock_data.loc[pre_split, "close_split_adj"] = stock_data.loc[pre_split, "close"] / SPLIT_RATIO

stock_data["volume_split_adj"] = stock_data["volume"]
stock_data.loc[pre_split, "volume_split_adj"] = stock_data.loc[pre_split, "volume"] * SPLIT_RATIO

In [14]:
# Verify the adjusted values
stock_data.loc[stock_data.index < '2014-04-05'].tail()

Unnamed: 0_level_0,open,high,low,close,volume,close_split_adj,volume_split_adj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-03-31,1130.4,1135.0,1112.85,1114.51,3880700.0,557.255,7761400.0
2014-04-01,1120.27,1137.5,1118.0,1134.89,2181000.0,567.445,4362000.0
2014-04-02,1141.9,1144.8,1124.0,1135.1,4168000.0,567.55,8336000.0
2014-04-03,573.39,588.3,566.01,571.5,4018300.0,571.5,4018300.0
2014-04-04,578.55,579.7173,544.494,545.25,5363700.0,545.25,5363700.0


Let's plot the adjusted and unadjusted close prices.

In [15]:
# Raw close (red) overlaid with the split-adjusted close (blue), both as
# half-transparent lines so the overlap after the split stays visible.
trace_close = go.Scatter(
    x=stock_data.index,
    y=stock_data.close,
    mode="lines",
    name="close",
    opacity=0.5,
    marker={"color": 'red'},
    text=stock_data.index,
)

trace_close_split_adj = go.Scatter(
    x=stock_data.index,
    y=stock_data.close_split_adj,
    mode="lines",
    name="split-adjusted close",
    opacity=0.5,
    marker={"color": 'blue'},
    text=stock_data.index,
)

layout = {
    "title": 'GOOGL stock daily close prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_close, trace_close_split_adj], "layout": layout}
iplot(fig)

With the split adjustment, we can see a fairly steady increasing trend in the Google stock price after the Great Recession.

In [16]:
# Hover label per trading day, built from the split-adjusted close and volume.
adj_volume_in_millions = (stock_data.volume_split_adj/1000000).round(2).astype(str)
adj_bubble_hover = ("close: " + stock_data.close_split_adj.astype(str)
                    + "<br>volume: " + adj_volume_in_millions + "M"
                    + "<br>date: " + stock_data.index)

# Bubble area is proportional to the split-adjusted daily volume.
adj_bubble_marker = {
    "size": stock_data.volume_split_adj,
    "sizemode": 'area',
    "sizeref": 2.*max(stock_data.volume_split_adj)/(30.**2),
    "sizemin": 1,
    "color": 'black',
    "opacity": 0.2,
    "line": {"width": 0},
}

trace_bubble = go.Scatter(
    x=stock_data.index,
    y=stock_data.close_split_adj,
    mode="markers",
    name="volume",
    marker=adj_bubble_marker,
    hovertext=adj_bubble_hover,
    hoverinfo='text',
)

adj_bubble_title = ('GOOGL stock daily close prices and volumes, split-adjusted'
                    + '<br><br><span style="font-size: 16px; color: darkgrey">Bubble size = Daily volume</span>')
layout = {
    "title": adj_bubble_title,
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Price'},
    "autosize": False,
    "width": 980,
    "height": 700,
}
fig = {"data": [trace_bubble], "layout": layout}
iplot(fig)

To get the best stock close price predictions, we would probably want to use the close prices adjusted for the stock split. But it would also be interesting to see the impact of the known outlier on model performances.  

## 3. Framing the problem  

The point of stock price prediction is to make (or at least avoid losing) money. So, to make the stock price analyses a bit more concrete here, let's frame the stock price prediction as an investment problem.  

**Here's a very simple investment problem:**  
Let's say I get a monthly paycheck on the 1st of the month, from which I have \$100 to invest into Google stock each month. And let's say that I happen to think that Google stock is a good long-term investment, and I am not interested in short-term trading of the stock. I just want to regularly invest the $100 in Google stock, but I also know that the stock market is volatile, so I want to avoid buying the stock at peak prices.

I can invest the \$100 into Google stock at any point during the month, or not invest at all. If I know ahead of time that a given month will be bad for buying Google stock, then I can buy a 30-day CD ([Certificate of Deposit](https://www.nerdwallet.com/blog/banking/cd-certificate-of-deposit/)) instead, which will give me a return rate of 0.1% (roughly equivalent to 1.2% APY), or 10 cents return on the $100 after 30 days. It's a very small return, but it's guaranteed that I will get it and will not lose the money. A 30-day CD is like a savings account, but you can't withdraw the money for 30 days. If I buy a 30-day CD, then after the 30 days I can use that money to again either buy Google stock or buy a CD again.

Given my investment budget, preferences and options, I need to come up with an investment strategy. At the first of the month, I need to decide whether I want to buy Google stock at some point that month, or buy the safer 30-day CD instead. If I decide to buy Google stock that month, I want to pick the best day for buying it during the month, and I can update my preferred buying day as new information comes in until I actually buy the stock.

**Approach:**  
To devise the investment strategy, I'll need a forecast of Google stock price movements for the upcoming month. If the forecast says at the first of the month that the stock price will fall significantly over the month, then I will avoid investing in Google that month and buy the 30-day CD instead. Otherwise, I'll look for the day with the lowest forecasted price and plan to buy the stock on that date. Until I actually buy the stock, I will update my forecast every day to see if I should change my preferred buying date.  

To measure the performance of my forecast and my investment strategy, I will use a simple moving average as a baseline, and compare ARIMA and LSTM models as my potential forecast models of choice. To keep things simple, I'll stick to univariate models of the Google stock adjusted close price time series data.

We'll use the data prior to January 2018 as the training dataset, and the data from January 2018 and onwards as the test dataset.



## 4. Baseline: a simple moving average  

A baseline model gives us something to compare the performance of our models of interest to.  
For a baseline, we'll use a simple moving average of stock prices over the prior N days to predict tomorrow's price:  
y(t) = ( y(t-1) + y(t-2) + ... + y(t-N) ) / N

In [17]:
adj_close = stock_data.close_split_adj.copy()
baseline_test =  stock_data.close_split_adj.loc[stock_data.index >= '2018-01-01'].copy()

In [18]:
adj_close.loc[stock_data.index < '2018-01-05'].tail()

date
2017-12-28    1055.95
2017-12-29    1053.40
2018-01-02    1073.21
2018-01-03    1091.52
2018-01-04    1095.76
Name: close_split_adj, dtype: float64

In [19]:
adj_close.describe()

count    3691.000000
mean      460.684594
std       311.562335
min        50.005000
25%       234.987500
50%       308.890000
75%       656.145000
max      1285.500000
Name: close_split_adj, dtype: float64

In [20]:
baseline_test.describe()

count     325.000000
mean     1127.095538
std        67.050075
min       984.670000
25%      1075.920000
50%      1119.200000
75%      1179.560000
max      1285.500000
Name: close_split_adj, dtype: float64

In [21]:
# Baseline forecast: the prediction for day t is the mean of the MA_WINDOW
# closes *before* t. rolling().mean() at row t includes day t's own close, so
# shift by one row to keep the observed price out of its own forecast.
MA_WINDOW = 20
moving_average = adj_close.rolling(window=MA_WINDOW).mean().shift(1)

In [22]:
# Moving-average line (red) over the individual adjusted closes (blue dots),
# across the full history.
trace_MA = go.Scatter(
    x=moving_average.index,
    y=moving_average.values,
    mode="lines",
    name="Moving average",
    opacity=0.5,
    marker={"color": 'red'},
    text=stock_data.index,
)

trace_adj_close = go.Scatter(
    x=adj_close.index,
    y=adj_close.values,
    mode="markers",
    name="Adjusted close",
    opacity=0.5,
    marker={"color": 'blue', "size": 3},
    text=stock_data.index,
)

layout = {
    "title": 'GOOGL stock daily close price moving average over 20-day window',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_MA, trace_adj_close], "layout": layout}
iplot(fig)

In [23]:
# Restrict the moving average to the test period (2018 onwards).
moving_average_test = moving_average.loc[moving_average.index >= '2018-01-01']

# Hover text must come from the same rows as x/y. Passing the full
# stock_data.index here would label the 2018+ points with 2004 dates.
trace_MA_test = go.Scatter(
    x=moving_average_test.index,
    y=moving_average_test.values,
    mode="lines",
    name="Moving average",
    opacity=0.5,
    marker={"color": 'red'},
    text=moving_average_test.index,
)

trace_adj_close_test = go.Scatter(
    x=baseline_test.index,
    y=baseline_test.values,
    mode="markers",
    name="Adjusted close",
    opacity=0.5,
    marker={"color": 'blue', "size": 3},
    text=baseline_test.index,
)

layout = {
    "title": 'GOOGL stock daily close price moving average over 20-day window',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_MA_test, trace_adj_close_test], "layout": layout}
iplot(fig)

In [24]:
# Recursive multi-step moving-average forecast over the test period.
# At the first test row of every calendar month the window is re-seeded with
# the observed closes of the previous row's month; within the month each
# forecast is fed back into the window in place of the oldest value.
recursive_multistep_MA = []

for i in range(len(baseline_test)):
    dt = datetime.strptime(baseline_test.index[i], '%Y-%m-%d')
    curr_row_month = dt.month
    curr_row_year = dt.year

    if i == 0:
        # Seed from the calendar month before the first test date. Stepping back
        # from the 1st of the month always lands in the previous month, whereas
        # "dt - 20 days" stays in the same month for dates after the 20th and
        # would leave the window undefined.
        day_in_last_month = dt.replace(day=1) - timedelta(days=1)
        last_row_month = day_in_last_month.month
        last_row_year = day_in_last_month.year

    if last_row_month != curr_row_month:
        # restart the recursive series window
        last_month_start_str = f'{last_row_year}-{last_row_month:02}-01'
        curr_month_start_str = f'{curr_row_year}-{curr_row_month:02}-01'
        recursive_series_window = adj_close.loc[(adj_close.index >= last_month_start_str)
                                                & (adj_close.index < curr_month_start_str)].values.tolist()

    last_row_month = curr_row_month
    last_row_year = curr_row_year

    # make a forecast and store it
    rolling_avg = np.mean(recursive_series_window)
    recursive_multistep_MA.append(rolling_avg)

    # update the moving window with the forecasted value
    recursive_series_window.pop(0)
    recursive_series_window.append(rolling_avg)

In [25]:
# Restrict the moving average to the test period (2018 onwards).
moving_average_test = moving_average.loc[moving_average.index >= '2018-01-01']

# Hover text for every trace is taken from the same rows as its x values.
# The full stock_data.index would mislabel the test-period points.
trace_multistepMA_test = go.Scatter(
    x=baseline_test.index,
    y=recursive_multistep_MA,
    mode="markers",
    name="Multistep moving average",
    opacity=0.5,
    marker={"color": 'green'},
    text=baseline_test.index,
)

trace_MA_test = go.Scatter(
    x=moving_average_test.index,
    y=moving_average_test.values,
    mode="lines",
    name="Moving average",
    opacity=0.5,
    marker={"color": 'red'},
    text=moving_average_test.index,
)

trace_adj_close_test = go.Scatter(
    x=baseline_test.index,
    y=baseline_test.values,
    mode="markers",
    name="Adjusted close",
    opacity=0.5,
    marker={"color": 'blue', "size": 3},
    text=baseline_test.index,
)

layout = {
    "title": 'GOOGL stock daily close price moving average over 20-day window',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_MA_test, trace_adj_close_test, trace_multistepMA_test], "layout": layout}
iplot(fig)

In [28]:

def make_multistep_MA_forecast(updated_days=0, test_series=None, history=None):
    """Recursive multi-step moving-average forecast, re-seeded every month.

    At the first row of each calendar month in ``test_series`` the averaging
    window is reset to the observed values of ``history`` from the start of the
    previous row's month up to (not including) the start of the current month.
    Each forecast is the mean of the window; the oldest value is then dropped
    and replaced by the observed price for the first ``updated_days`` rows of
    the month, and by the forecast itself afterwards.

    Parameters
    ----------
    updated_days : int
        Number of rows at the start of each month for which the observed price
        (rather than the forecast) is fed back into the window.
    test_series : pandas.Series, optional
        Series to forecast, indexed by 'YYYY-MM-DD' strings. Defaults to the
        notebook-level ``baseline_test``.
    history : pandas.Series, optional
        Observed prices used to seed the window, indexed by 'YYYY-MM-DD'
        strings. Defaults to the notebook-level ``adj_close``.

    Returns
    -------
    list
        One forecast per row of ``test_series``.

    Raises
    ------
    ValueError
        If ``history`` has no rows for the month needed to seed a window.
    """
    if test_series is None:
        test_series = baseline_test
    if history is None:
        history = adj_close

    forecasts = []
    window = []
    prev_month_key = None  # (year, month) of the previously processed row
    days_into_month = 0

    for date_str, observed in zip(test_series.index, test_series.values):
        dt = datetime.strptime(date_str, '%Y-%m-%d')
        month_key = (dt.year, dt.month)

        if month_key != prev_month_key:
            if prev_month_key is None:
                # First row: seed from the preceding calendar month. Stepping
                # back from the 1st works for any day of the month, whereas
                # "dt - 20 days" left the window undefined after the 20th.
                last_of_prev = dt.replace(day=1) - timedelta(days=1)
                prev_month_key = (last_of_prev.year, last_of_prev.month)

            # restart the recursive series window
            window_start = f'{prev_month_key[0]}-{prev_month_key[1]:02}-01'
            window_end = f'{dt.year}-{dt.month:02}-01'
            in_window = (history.index >= window_start) & (history.index < window_end)
            window = history.loc[in_window].values.tolist()
            if not window:
                raise ValueError(
                    f"no history between {window_start} and {window_end} to seed the forecast window")
            days_into_month = 0

        prev_month_key = month_key

        # make a forecast and store it
        forecast = np.mean(window)
        forecasts.append(forecast)

        # slide the window: drop the oldest value, then append either the
        # observed price (while still inside the update period) or the forecast
        window.pop(0)
        if days_into_month < updated_days:
            window.append(observed)
        else:
            window.append(forecast)
        days_into_month += 1

    return forecasts
    

In [42]:
test = make_multistep_MA_forecast(updated_days=20)

In [43]:
# Compare the one-step moving average, both recursive multi-step variants and
# the observed adjusted close over the test period.
test_period_ma = moving_average.loc[moving_average.index >= '2018-01-01']
test_dates = baseline_test.index

trace_multistepMA_test_2 = go.Scatter(
    x=test_dates,
    y=test,
    mode="markers",
    name="Multistep moving average 2",
    opacity=0.5,
    marker={"color": 'magenta'},
    text=test_dates,
)

trace_multistepMA_test = go.Scatter(
    x=test_dates,
    y=recursive_multistep_MA,
    mode="markers",
    name="Multistep moving average",
    opacity=0.5,
    marker={"color": 'green'},
    text=test_dates,
)

trace_MA_test = go.Scatter(
    x=test_period_ma.index,
    y=test_period_ma.values,
    mode="lines",
    name="Moving average",
    opacity=0.5,
    marker={"color": 'red'},
    text=test_dates,
)

trace_adj_close_test = go.Scatter(
    x=test_dates,
    y=baseline_test.values,
    mode="markers",
    name="Adjusted close",
    opacity=0.5,
    marker={"color": 'blue', "size": 3},
    text=test_dates,
)

layout = {
    "title": 'GOOGL stock daily close price moving average over 20-day window',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
    "legend": {"orientation": "h"},
}
fig = {
    "data": [trace_MA_test, trace_adj_close_test, trace_multistepMA_test, trace_multistepMA_test_2],
    "layout": layout,
}
iplot(fig)

## 5. Machine learning with LSTM

In [None]:
stock_data.sort_index(inplace=True)

In [None]:
stock_data.head(20)

In [None]:
close_prices = stock_data.reset_index().close.copy()
close_prices.head()

#### Split the close price time series into windows

In [None]:
# Slice the close-price series into overlapping windows of `window_length`
# consecutive trading days, one window per possible start position.
window_length = 30

# "+ 1" keeps the final window, the one ending on the most recent close.
# Without it the last target is the second-to-last row, and code that later
# aligns targets with the tail of the date index is off by one day.
n_windows = len(close_prices) - window_length + 1
windows = [
    close_prices.iloc[start:start + window_length].copy().reset_index(drop=True)
    for start in range(n_windows)
]

In [None]:
len(windows)

In [None]:
len(close_prices)

#### Normalize the windows

In [None]:
# Express every price in a window as its relative change from the window's
# first price, so each window starts at 0.
normalized_windows = []
for price_window in windows:
    base_price = price_window[0]
    normalized_windows.append([(price - base_price) / base_price for price in price_window])
        

#### Split the data into train and test sets

In [None]:
normalized_windows = np.array(normalized_windows)

In [None]:
normalized_windows.shape

In [None]:
# Setting a 90%/10% train/test split (chronological: the last 10% of windows are the test set)
ntrainrows = round(len(normalized_windows) * 0.9)

# Copy before shuffling: a plain slice is a view, so shuffling it would also
# reorder the rows of normalized_windows itself. A fixed seed makes the
# shuffle reproducible across kernel restarts.
train = normalized_windows[:ntrainrows, :].copy()
np.random.RandomState(42).shuffle(train)

# The first window_length - 1 values are the inputs, the last value is the target.
x_train = train[:, :-1]
y_train = train[:, -1]

x_test = normalized_windows[ntrainrows:, :-1]
y_test = normalized_windows[ntrainrows:, -1]

# Reshape the inputs to (samples, timesteps, 1) as passed to the LSTM below.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [None]:
len(normalized_windows) - ntrainrows

#### Build an LSTM model

In [None]:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

In [None]:
# The Sequential model is a linear stack of layers.
# We can create it layer by layer.
model = Sequential()

# First LSTM layer: 30 units, one feature per timestep, any sequence length.
# Uses the Keras 2 argument names (`units`, `input_shape`) in place of the
# legacy Keras 1 `output_dim` / `input_dim`.
model.add(LSTM(
    units=30,
    input_shape=(None, 1),
    return_sequences=True))
model.add(Dropout(0.2))

# Second LSTM layer returns only its final state.
model.add(LSTM(
    100,
    return_sequences=False))
model.add(Dropout(0.2))

# Single linear output: the normalized value of the next day.
model.add(Dense(
    units=1))

model.add(Activation('linear'))

model.compile(loss='mse', optimizer='rmsprop')

#### Train the model

In [None]:
# Train for a single pass over the data, holding out 5% of the training
# windows for validation. `epochs` is the Keras 2 name of the legacy `nb_epoch`.
model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=1,
    validation_split=0.05)

#### Get model predictions

In [None]:
# Recursive multi-step prediction: every `prediction_len` test rows, start from
# the observed input window and predict `prediction_len` days ahead, feeding
# each prediction back in as the newest input value.
window_size = 30
prediction_len = window_size

# Floor division: only full horizons that fit inside the test set. round()
# could round up and produce more predictions than there are test rows.
n_prediction_seqs = len(x_test) // prediction_len

prediction_seqs = []
for seq_idx in range(n_prediction_seqs):
    curr_frame = x_test[seq_idx * prediction_len]
    predicted = []
    for _ in range(prediction_len):
        predict_next_day = model.predict(curr_frame[np.newaxis, :, :])[0, 0]
        predicted.append(predict_next_day)

        # slide the frame: drop the oldest value, append the new prediction
        curr_frame = np.append(curr_frame[1:], [[predict_next_day]], axis=0)

    prediction_seqs.append(predicted)

In [None]:
y_test.shape

In [None]:
# Test days left over after the last full prediction horizon: total test rows
# minus the rows covered by predictions. (Taking the remainder modulo the
# *number* of sequences only matches this by coincidence.)
y_test.shape[0] - n_prediction_seqs * prediction_len

#### Visualize the predictions

In [None]:
# Dates for the test rows: the tail of the date index, one per test target.
dates_test = stock_data.index[-len(y_test):]

# Predicted normalized y values, flattened from per-sequence lists into one
# chronological list.
y_predicted_normalized = [p for seq in prediction_seqs for p in seq]

In [None]:
len(prediction_seqs)

In [None]:
# Normalized test data (observed y values)
trace_close_norm = go.Scatter(
    x=dates_test,
    y=y_test,
    mode="markers",
    name="observed",
    marker={"color": 'grey'},
    text=dates_test,
)

# Predictions only cover the full prediction horizons, so there are fewer of
# them than test dates; trim the dates so x, y and hover text line up.
dates_predicted = dates_test[:len(y_predicted_normalized)]
trace_predicted_norm = go.Scatter(
    x=dates_predicted,
    y=y_predicted_normalized,
    mode="markers",
    name="predicted",
    marker={"color": 'blue'},
    text=dates_predicted,
)

layout = {
    "title": 'Normalized 30-day window changes in GOOGL Stock Daily Prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price changes'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_close_norm, trace_predicted_norm], "layout": layout}
iplot(fig)

In [None]:
x_test.shape

In [None]:
# Number of test rows that actually received a prediction.
n_predicted = len(y_predicted_normalized)

# Test rows beyond the last full prediction horizon.
leftover_days = y_test.shape[0] - n_predicted

# Raw (un-normalized) test windows that have a prediction. Slicing with an
# explicit end index also works when leftover_days is 0, where
# `[ntrainrows:-0]` would return an empty array.
windows_test = np.array(windows)[ntrainrows:ntrainrows + n_predicted, :-1]

# First raw price of every test window, used to undo the normalization.
x_test_starts = list(windows_test[:, 0])

In [None]:
len(x_test_starts)

In [None]:
len(y_predicted_normalized)

In [None]:
# Denormalized predicted y values: invert (p - p0) / p0 using the first raw
# price of the window at the same position.
# NOTE(review): within a recursive sequence the input frame stays normalized
# to the start price of the sequence's first window, so that price may be the
# more consistent base for every step of the sequence — TODO confirm.
y_predicted_denormalized = [
    (y_predicted_normalized[k] + 1) * x_test_starts[k]
    for k in range(len(y_predicted_normalized))
]

In [None]:
y_predicted_denormalized[:10]

In [None]:
# Observed close prices over the test period (original price scale)
trace_close_test = go.Scatter(
    x=dates_test,
    y=stock_data.close[-len(y_test):],
    mode="lines",
    name="observed",
    marker={"color": 'grey'},
    text=dates_test,
)

# Predictions only cover the full prediction horizons; trim the dates so x, y
# and hover text have the same length.
dates_predicted = dates_test[:len(y_predicted_denormalized)]
trace_predicted_denorm = go.Scatter(
    x=dates_predicted,
    y=y_predicted_denormalized,
    mode="lines",
    name="predicted",
    marker={"color": 'blue'},
    text=dates_predicted,
)

layout = {
    "title": 'Predicted and observed GOOGL Stock Daily Prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_close_test, trace_predicted_denorm], "layout": layout}
iplot(fig)

#### Model performance evaluation

In [None]:
# Mean squared error of the predicted normalized movements of the daily close price.
# Compare only the test rows that have a prediction; the count is derived from
# the predictions instead of a hard-coded number of leftover days.
n_predicted = len(y_predicted_normalized)
mse_norm = np.mean((y_test[:n_predicted] - np.asarray(y_predicted_normalized))**2)
mse_norm

In [None]:
# Mean squared error of the predicted daily close price.
# Take the observed closes for the test period and keep only the rows that
# have a prediction, instead of hard-coding the number of leftover days.
n_predicted = len(y_predicted_denormalized)
observed_close = stock_data.close.iloc[-len(y_test):].values[:n_predicted]
mse_denorm = np.mean((observed_close - np.asarray(y_predicted_denormalized))**2)
mse_denorm

In [None]:
# use the split-adjusted close price
close_split_adj_prices = stock_data.reset_index().close_split_adj.copy()

In [None]:
close_split_adj_prices.head()

In [None]:
close_prices_split_adj = stock_data.reset_index().close_split_adj.copy()

In [None]:
# Slice the split-adjusted close series into overlapping windows of
# `window_length` consecutive trading days, one per possible start position.
window_length = 30

# "+ 1" keeps the final window, the one ending on the most recent close.
# Without it the last target is the second-to-last row, and code that later
# aligns targets with the tail of the date index is off by one day.
n_windows = len(close_prices_split_adj) - window_length + 1
windows = [
    close_prices_split_adj.iloc[start:start + window_length].copy().reset_index(drop=True)
    for start in range(n_windows)
]

In [None]:
# normalize the windows: express every price as its relative change from the
# first price of its window, so each window starts at 0
normalized_windows = []
for price_window in windows:
    base_price = price_window[0]
    normalized_windows.append([(price - base_price) / base_price for price in price_window])

In [None]:
# split the data into train and test
normalized_windows = np.array(normalized_windows)


# Setting a 90%/10% train/test split (chronological: the last 10% of windows are the test set)
ntrainrows = round(len(normalized_windows) * 0.9)

# Copy before shuffling: a plain slice is a view, so shuffling it would also
# reorder the rows of normalized_windows itself. A fixed seed makes the
# shuffle reproducible across kernel restarts.
train = normalized_windows[:ntrainrows, :].copy()
np.random.RandomState(42).shuffle(train)

# The first window_length - 1 values are the inputs, the last value is the target.
x_train = train[:, :-1]
y_train = train[:, -1]

x_test = normalized_windows[ntrainrows:, :-1]
y_test = normalized_windows[ntrainrows:, -1]

# Reshape the inputs to (samples, timesteps, 1) as passed to the LSTM below.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [None]:
# The Sequential model is a linear stack of layers.
# We can create it layer by layer.
model2 = Sequential()

# First LSTM layer: 30 units, one feature per timestep, any sequence length.
# Uses the Keras 2 argument names (`units`, `input_shape`) in place of the
# legacy Keras 1 `output_dim` / `input_dim`.
model2.add(LSTM(
    units=30,
    input_shape=(None, 1),
    return_sequences=True))
model2.add(Dropout(0.2))

# Second LSTM layer returns only its final state.
model2.add(LSTM(
    100,
    return_sequences=False))
model2.add(Dropout(0.2))

# Single linear output: the normalized value of the next day.
model2.add(Dense(
    units=1))

model2.add(Activation('linear'))

model2.compile(loss='mse', optimizer='rmsprop')

In [None]:
# train the model: a single pass over the data, holding out 5% of the training
# windows for validation. `epochs` is the Keras 2 name of the legacy `nb_epoch`.
model2.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=1,
    validation_split=0.05)

In [None]:
# get predictions
# Recursive multi-step prediction with the model trained on split-adjusted
# prices: every `prediction_len` test rows, start from the observed input
# window and predict `prediction_len` days ahead, feeding each prediction back
# in as the newest input value.

window_size = 30
prediction_len = window_size

# Floor division: only full horizons that fit inside the test set. round()
# could round up and produce more predictions than there are test rows.
n_prediction_seqs = len(x_test) // prediction_len

prediction_seqs = []
for seq_idx in range(n_prediction_seqs):
    curr_frame = x_test[seq_idx * prediction_len]
    predicted = []
    for _ in range(prediction_len):
        # Predict with model2, the network fitted on the split-adjusted
        # windows. Calling `model` here would re-evaluate the first network
        # and leave model2 unused.
        predict_next_day = model2.predict(curr_frame[np.newaxis, :, :])[0, 0]
        predicted.append(predict_next_day)

        # slide the frame: drop the oldest value, append the new prediction
        curr_frame = np.append(curr_frame[1:], [[predict_next_day]], axis=0)

    prediction_seqs.append(predicted)

In [None]:
# visualize predictions
# Dates for the test rows: the tail of the date index, one per test target.
dates_test = stock_data.index[-len(y_test):]

# Predicted normalized y values, flattened from per-sequence lists into one
# chronological list.
y_predicted_normalized = [p for seq in prediction_seqs for p in seq]

In [None]:
# Normalized test data (observed y values)
trace_close_norm = go.Scatter(
    x=dates_test,
    y=y_test,
    mode="markers",
    name="observed",
    marker={"color": 'grey'},
    text=dates_test,
)

# Predictions only cover the full prediction horizons, so there are fewer of
# them than test dates; trim the dates so x, y and hover text line up.
dates_predicted = dates_test[:len(y_predicted_normalized)]
trace_predicted_norm = go.Scatter(
    x=dates_predicted,
    y=y_predicted_normalized,
    mode="markers",
    name="predicted",
    marker={"color": 'blue'},
    text=dates_predicted,
)

layout = {
    "title": 'Normalized 30-day window changes in GOOGL Stock Daily Prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price changes'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_close_norm, trace_predicted_norm], "layout": layout}
iplot(fig)

In [None]:
# Number of test rows that actually received a prediction.
n_predicted = len(y_predicted_normalized)

# Test rows beyond the last full prediction horizon.
leftover_days = y_test.shape[0] - n_predicted

# Raw (un-normalized) test windows that have a prediction. Slicing with an
# explicit end index also works when leftover_days is 0, where
# `[ntrainrows:-0]` would return an empty array.
windows_test = np.array(windows)[ntrainrows:ntrainrows + n_predicted, :-1]

# First raw price of every test window, used to undo the normalization.
x_test_starts = list(windows_test[:, 0])

In [None]:
# Denormalized predicted y values: invert (p - p0) / p0 using the first raw
# price of the window at the same position.
# NOTE(review): within a recursive sequence the input frame stays normalized
# to the start price of the sequence's first window, so that price may be the
# more consistent base for every step of the sequence — TODO confirm.
y_predicted_denormalized = [
    (y_predicted_normalized[k] + 1) * x_test_starts[k]
    for k in range(len(y_predicted_normalized))
]

In [None]:
# Observed split-adjusted close prices over the test period (original price
# scale). This section models close_split_adj, so the observed series is taken
# from the same column.
trace_close_test = go.Scatter(
    x=dates_test,
    y=stock_data.close_split_adj[-len(y_test):],
    mode="lines",
    name="observed",
    marker={"color": 'grey'},
    text=dates_test,
)

# Predictions only cover the full prediction horizons; trim the dates so x, y
# and hover text have the same length.
dates_predicted = dates_test[:len(y_predicted_denormalized)]
trace_predicted_denorm = go.Scatter(
    x=dates_predicted,
    y=y_predicted_denormalized,
    mode="lines",
    name="predicted",
    marker={"color": 'blue'},
    text=dates_predicted,
)

layout = {
    "title": 'Predicted and observed GOOGL Stock Daily Prices',
    "xaxis": {"zeroline": False},
    "yaxis": {"title": 'Stock price'},
    "autosize": False,
    "width": 980,
    "height": 600,
}
fig = {"data": [trace_close_test, trace_predicted_denorm], "layout": layout}
iplot(fig)

In [None]:
# Model performance evaluation

# Mean squared error of the predicted normalized movements of the daily close price.
# Compare only the test rows that have a prediction; the count is derived from
# the predictions instead of a hard-coded number of leftover days.
n_predicted = len(y_predicted_normalized)
mse_norm = np.mean((y_test[:n_predicted] - np.asarray(y_predicted_normalized))**2)
mse_norm

In [None]:
# Mean squared error of the predicted daily split-adjusted close price.
# The observed values come from close_split_adj, the series this model was
# built on, and are trimmed to the rows that have a prediction instead of
# hard-coding the number of leftover days.
n_predicted = len(y_predicted_denormalized)
observed_close = stock_data.close_split_adj.iloc[-len(y_test):].values[:n_predicted]
mse_denorm = np.mean((observed_close - np.asarray(y_predicted_denormalized))**2)
mse_denorm

### Comparing to Moving Average model