In [1]:
# Normalise features
# Find the optimal parameters C (inverse regulalisation strengths)
# Find the best features to predict stock movement (up or down)
# Convert probability into absolute values (stock % increase/decrease)
# SVC, RandomForest, MinMax, LongShortTermMemory

%matplotlib inline
%pylab inline

# Nice Formatting within Jupyter Notebook
%matplotlib inline
from IPython.display import display # Allows multiple displays from a single code-cell

import os
import sys
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib
import matplotlib.pyplot as plt
import datetime
import math

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from collections import OrderedDict
from sklearn.metrics import mean_squared_error


Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
pd_raw = pd.read_csv(os.path.join(os.getcwd(), 'datasets', 'example.csv'))

In [3]:
# Convert all values in the dataframe to numbers, and ignore the date string format
pd_processed = pd_raw.iloc[5:-2 , :].reset_index(drop=True).apply(pd.to_numeric, errors='ignore') 
display(pd_processed)

Unnamed: 0,DATE,AMZN,AMZN.1,AMZN.2,AMZN.3,AMZN.4,AMZN.5,AMZN.6,AMZN.7,AMZN.8,...,AMZN.49,AMZN.50,AMZN.51,AMZN.52,AMZN.53,AMZN.54,AMZN.55,AMZN.56,AMZN.57,AMZN.58
0,01/01/2017,,,,,,,,,,...,,,,,,,,,357687.9900,
1,02/01/2017,,,,,,,,,,...,,,,,,,,,357687.9900,
2,03/01/2017,753.67,,,,,,,,,...,,,,,,,,,359500.5900,
3,04/01/2017,757.18,,,,,,,,,...,,,,,,,,,361174.8600,
4,05/01/2017,780.45,,,,,,,,,...,,,,,,,,,372274.6500,
5,06/01/2017,795.99,,,,,,,,,...,,,,,,,,,379687.2300,
6,09/01/2017,796.92,,,,,,,,,...,,,,,,,,,380130.8400,
7,10/01/2017,795.90,,,,,,,,,...,,,,,,,,,379644.3000,
8,11/01/2017,799.02,,,,,,,,,...,,,,,,,,,381132.5400,
9,12/01/2017,813.64,,,,,,,,,...,,,,,,,,,388106.2800,


In [4]:

class Company(object):
    def __init__(self, name, pdframe):
        self.name = name
        self.share_prices = pdframe[name]
        self.converted_dates = self.preprocess_dates_raw(pdframe["DATE"])
        self.indicator_names_dict = {
            name + ".1" : "Common Shares Outstanding",
            name + ".2" : "Avg. Basic Shares Outstanding",
            name + ".3" : "Avg. Diluted Shares Outstanding",
            name + ".4" : "Revenues",
            name + ".5" : "COGS",
            name + ".6" : "SG&A",
            name + ".7" : "R&D",
            name + ".8" : "EBIT",
            name + ".9" : "EBITDA",
            name + ".10" : "Interest expense, net",
            name + ".11" : "Abnormal Gains/Losses",
            name + ".12" : "Income Taxes",
            name + ".13" : "Net Income from Discontinued Op.",
            name + ".14" : "Net Profit",
            name + ".15" : "Dividends",
            name + ".16" : "Cash and Cash Equivalents",
            name + ".17" : "Receivables",
            name + ".18" : "Current Assets",
            name + ".19" : "Net PP&E",
            name + ".20" : "Intangible Assets",
            name + ".21" : "Goodwill",
            name + ".22" : "Total Noncurrent Assets",
            name + ".23" : "Total Assets",
            name + ".24" : "Short term debt",
            name + ".25" : "Accounts Payable",
            name + ".26" : "Current Liabilities",
            name + ".27" : "Long Term Debt",
            name + ".28" : "Total Noncurrent Liabilities",
            name + ".29" : "Total Liabilities",
            name + ".30" : "Preferred Equity",
            name + ".31" : "Share Capital",
            name + ".32" : "Treasury Stock",
            name + ".33" : "Retained Earnings",
            name + ".34" : "Equity Before Minorities",
            name + ".35" : "Minorities",
            name + ".36" : "Total Equity",
            name + ".37" : "Depreciation & Amortisation",
            name + ".38" : "Change in Working Capital",
            name + ".39" : "Cash From Operating Activities",
            name + ".40" : "Net Change in PP&E & Intangibles",
            name + ".41" : "Cash From Investing Activities",
            name + ".42" : "Cash From Financing Activities",
            name + ".43" : "Net Change in Cash",
            name + ".44" : "Free Cash Flow",
            name + ".45" : "Gross Margin",
            name + ".46" : "Operating Margin",
            name + ".47" : "Net Profit Margin",
            name + ".48" : "Return on Equity",
            name + ".49" : "Return on Assets",
            name + ".50" : "Current Ratio",
            name + ".51" : "Liabilities to Equity Ratio",
            name + ".52" : "Debt to Assets Ratio",
            name + ".53" : "EV / EBITDA",
            name + ".54" : "EV / Sales",
            name + ".55" : "Book to Market",
            name + ".56" : "Operating Income / EV",
            name + ".57" : "Market Capitalisation",
            name + ".58" : "Enterprise Value"
        }
        # Note all indicator data are raw, NaN can be filtered at a later stage
        self.indicator_pdframes_dict = {
            name + ".1" : pdframe[name + ".1"],
            name + ".2" : pdframe[name + ".2"],
            name + ".3" : pdframe[name + ".3"],
            name + ".4" : pdframe[name + ".4"],
            name + ".5" : pdframe[name + ".5"],
            name + ".6" : pdframe[name + ".6"],
            name + ".7" : pdframe[name + ".7"],
            name + ".8" : pdframe[name + ".8"],
            name + ".9" : pdframe[name + ".9"],
            name + ".10" : pdframe[name + ".10"],
            name + ".11" : pdframe[name + ".11"],
            name + ".12" : pdframe[name + ".12"],
            name + ".13" : pdframe[name + ".13"],
            name + ".14" : pdframe[name + ".14"],
            name + ".15" : pdframe[name + ".15"],
            name + ".16" : pdframe[name + ".16"],
            name + ".17" : pdframe[name + ".17"],
            name + ".18" : pdframe[name + ".18"],
            name + ".19" : pdframe[name + ".19"],
            name + ".20" : pdframe[name + ".20"],
            name + ".21" : pdframe[name + ".21"],
            name + ".22" : pdframe[name + ".22"],
            name + ".23" : pdframe[name + ".23"],
            name + ".24" : pdframe[name + ".24"],
            name + ".25" : pdframe[name + ".25"],
            name + ".26" : pdframe[name + ".26"],
            name + ".27" : pdframe[name + ".27"],
            name + ".28" : pdframe[name + ".28"],
            name + ".29" : pdframe[name + ".29"],
            name + ".30" : pdframe[name + ".30"],
            name + ".31" : pdframe[name + ".31"],
            name + ".32" : pdframe[name + ".32"],
            name + ".33" : pdframe[name + ".33"],
            name + ".34" : pdframe[name + ".34"],
            name + ".35" : pdframe[name + ".35"],
            name + ".36" : pdframe[name + ".36"],
            name + ".37" : pdframe[name + ".37"],
            name + ".38" : pdframe[name + ".38"],
            name + ".39" : pdframe[name + ".39"],
            name + ".40" : pdframe[name + ".40"],
            name + ".41" : pdframe[name + ".41"],
            name + ".42" : pdframe[name + ".42"],
            name + ".43" : pdframe[name + ".43"],
            name + ".44" : pdframe[name + ".44"],
            name + ".45" : pdframe[name + ".45"],
            name + ".46" : pdframe[name + ".46"],
            name + ".47" : pdframe[name + ".47"],
            name + ".48" : pdframe[name + ".48"],
            name + ".49" : pdframe[name + ".49"],
            name + ".50" : pdframe[name + ".50"],
            name + ".51" : pdframe[name + ".51"],
            name + ".52" : pdframe[name + ".52"],
            name + ".53" : pdframe[name + ".53"],
            name + ".54" : pdframe[name + ".54"],
            name + ".55" : pdframe[name + ".55"],
            name + ".56" : pdframe[name + ".56"],
            name + ".57" : pdframe[name + ".57"],
            name + ".58" : pdframe[name + ".58"]
        }
    
    def preprocess_dates_raw(self, dates_raw):
        return pd.Series(list(map(datetime.datetime.strptime, dates_raw, len(dates_raw)*['%d/%m/%Y'])))
    
    def series_rows_filter_na(self, series, columns, reset_index): 
        if reset_index == True:
            return series.dropna().reset_index(drop=True)
        else:
            return series.dropna()

    def data_frame_rows_filter_na(self, pdframe, columns, reset_index): 
        # filter weekends share prices
        if reset_index == True:
            return pdframe.dropna(subset=[columns]).reset_index(drop=True)
        else:
            return pdframe.dropna(subset=[columns])
    
    def baseline_predict(self, start_date_string=None, end_date_string=None, plot=False):
        # Filter nan values (weekend share prices are nan since they are not trading!
        non_nan_share_prices = self.series_rows_filter_na(series=self.share_prices, columns="AMZN", reset_index=False)
        non_nan_dates = self.converted_dates[non_nan_share_prices.index]
        
        #Persistence Model Forecast, basically, the same share price with 1 date timelag
        # e.g. the predicted share price at time t, is t-1
        predictions = non_nan_share_prices.shift(1)
        
        #display(predictions)
        # Special case of the first value is changed to the zero 
        predictions.set_value(predictions.index[0], non_nan_share_prices[non_nan_share_prices.index[0]])
        #display(predictions)
 
        # When no date parameters are passed
        if start_date_string == None and end_date_string == None:
            x_axis = non_nan_dates
            plt.title(self.name + "\'s share price in available dates")

        else:
            # start_date and end_date should be in the format of "31/12/2018"
            # prediction excludes the first day of the start date
            start_date = self.convert_date_string_to_datetime(start_date_string) + datetime.timedelta(days=1)
            end_date = self.convert_date_string_to_datetime(end_date_string)
            x_axis = non_nan_dates[(non_nan_dates>=start_date) & (non_nan_dates<=end_date)]
            predictions = predictions[x_axis.index]
            plt.title(self.name + "\'s baseline predicted share price between " + start_date_string + " and " + end_date_string)
        
        if plot:
            formatter = matplotlib.dates.DateFormatter('%d/%m/%Y')            
            plt.plot(x_axis, predictions, '-', label="Predicted prices")
            plt.plot(x_axis, self.share_prices[predictions.index], '-', label="Actual prices")
            ax = pyplot.gcf().axes[0] 
            ax.xaxis.set_major_formatter(formatter)
            ax.legend()
            plt.gcf().autofmt_xdate(rotation=25)
            plt.gcf().set_size_inches(15, 10)
            plt.xlabel("Time")
            plt.ylabel("Share Price")
            plt.show()
        return predictions
        
        
    def get_share_prices(self, start_date_string=None, end_date_string=None, start_delay=None):
        # Filter nan values (weekend share prices are nan since they are not trading!)
        non_nan_share_prices = self.series_rows_filter_na(series=self.share_prices, columns="AMZN", reset_index=False)
        non_nan_dates = self.converted_dates[non_nan_share_prices.index]
        
        # When no date parameters are passed
        if start_date_string == None and end_date_string == None:
            return non_nan_share_prices
        else:
            if start_delay != None:
                start_date = self.convert_date_string_to_datetime(start_date_string) + datetime.timedelta(days=start_delay)
            else:
                start_date = self.convert_date_string_to_datetime(start_date_string)
            end_date = self.convert_date_string_to_datetime(end_date_string)
            revelant_dates = non_nan_dates[(non_nan_dates>=start_date) & (non_nan_dates<=end_date)]
            relevant_share_prices = non_nan_share_prices[revelant_dates.index]
            return relevant_share_prices
    
    def convert_date_string_to_datetime(self, date_string):
        date_day, date_month, date_year = date_string.split("/")
        return datetime.datetime(int(date_year), int(date_month), int(date_day), 0, 0)

    def get_company_attribute_indicator_dict(self, indicator):
        attribute_indicator = dict()
        for i in range(1, 58+1):
            attribute_indicator[company_name + "." + str(i)] = indicator[str(i)]
            
    def show_pairplots(self):
        # Visualise the distribution of each attribute with share price
        selected_attributes = sorted(list(self.indicator_pdframes_dict.keys()))
        i = 5
        while i <= len(selected_attributes):
            sns.pairplot(data=pd_processed, x_vars=selected_attributes[i-5:i], y_vars = 'AMZN', size=3)
            i+=5

        #sns.pairplot(data=pd_processed, x_vars=selected_attributes[:10], y_vars = selected_attributes[:10], size=3)
    
    def get_correlation_between_attributes_and_price(self):
        # A) Calculating correlation coefficients between each attribtue and price
        # Get a list of attributes of the dataset except price remove
        
        selected_attributes = sorted(list(self.indicator_pdframes_dict.keys()))
        selected_attributes.remove("AMZN.1")
        selected_attributes.remove("AMZN.2")
        selected_attributes.remove("AMZN.3")
        
        index_of_report_earnings_release = self.series_rows_filter_na(
            self.indicator_pdframes_dict["AMZN.4"], "AMZN.4", reset_index=False).index

        price_correlation = dict() # dictionary to keep track the correlation of attributes with prices
        nan_corr_attributes = list()
        for attribute in selected_attributes:
            corr = np.corrcoef(self.indicator_pdframes_dict[attribute][index_of_report_earnings_release], 
                               self.share_prices[index_of_report_earnings_release])
            if math.isnan(corr[0][1]):
                nan_corr_attributes.append(self.indicator_names_dict[attribute])
            else:
                price_correlation[corr[0][1]] = attribute

        print("NaN Correlation attributes: ", ", ".join(nan_corr_attributes))
        # sort a dict by value and print out the highest correlating attributes
        correlation_rank = list(price_correlation.keys())

        for key in sorted(correlation_rank, reverse=True): # sort from highest to lowest
            print(str(round(key, 3)) + " " + price_correlation[key] + ": " + self.indicator_names_dict[price_correlation[key]])
            





In [5]:
def get_relevant_columns_for_company(name):
    columns = list()
    columns.append("DATE")
    columns.append(name)
    for i in range(1, 58+1):
        columns.append(name + "." +str(i))
    return columns

amzn = Company("AMZN", pd_processed[get_relevant_columns_for_company("AMZN")])
start_date = "01/08/2017"
end_date = "01/09/2017"


In [6]:
#amzn.get_correlation_between_attributes_and_price()
#amzn.show_pairplots()

In [7]:
class CompanyLSTM(Company):
    def __init__(self, name, pdframe):
        Company.__init__(self, name, pdframe)
        self.lstm_model = None
        self.scaler = None
        self.train_raw = None
        self.test_raw = None
    
    # adapted from https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/
    def timeseries_to_supervised(self, data, lag=1):
        df = pd.DataFrame(data)
        columns = [df.shift(i) for i in range(1, lag+1)]
        columns.append(df)
        df = pd.concat(columns, axis=1)
        df.fillna(0, inplace=True)
        #print("Timeseries to supervised")
        #display(df)
        return df
    
    # create a differenced series
    def difference(self, series, source, interval=1):
        diff = list()
        index = series.index
        for i in range(1, len(series)):
            value = series[index[i]] - series[index[i-1]]
            diff.append(value)
        
        # Last item is special case because there is no next value thus the diff is 
        # 1 size shorter than the original test_raw. We fix this by adding an additional item
        if source == "test":
            diff.append(0)
        return pd.Series(diff)
    
    # invert differenced value
    def inverse_difference(self, history, yhat, interval=1):
        #print("interval", interval)
        #display(history)
        return yhat + history.values[-interval]

    # scale train and test data to [-1, 1]
    def scale(self, train, test):
        # fit scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(train)
        # transform train
        train = train.reshape(train.shape[0], train.shape[1])
        train_scaled = scaler.transform(train)
        # transform test
        test = test.reshape(test.shape[0], test.shape[1])
        test_scaled = scaler.transform(test)
        return scaler, train_scaled, test_scaled

    # inverse scaling for a forecasted value
    def invert_scale(self, X, value):
        new_row = [x for x in X] + [value]
        array = numpy.array(new_row)
        array = array.reshape(1, len(array))
        inverted = self.scaler.inverse_transform(array)
        return inverted[0, -1]

    # fit an LSTM network to training data
    def fit_lstm(self, train, batch_size, nb_epoch, neurons):
        X, y = train[:, 0:-1], train[:, -1]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        model = Sequential()
        model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        for i in range(nb_epoch):
            model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
            model.reset_states()
        return model

    # make a one-step forecast
    def forecast_lstm(self, batch_size, X):
        X = X.reshape(1, 1, len(X))
        pred = self.lstm_model.predict(X, batch_size=batch_size)
        return pred[0,0]
    
    def preprocess_train_test_data(self, train_start_date_string, train_end_test_start_date_string, test_end_date_string):
        # transform data to be stationary
        self.train_raw = self.get_share_prices(train_start_date_string, train_end_test_start_date_string)
        train_diff_values = self.difference(self.train_raw, "train", 1)

        # To avoid overlap between training and test data, the datetime is incremented by 1 day
        self.test_raw = self.get_share_prices(train_end_test_start_date_string, test_end_date_string, start_delay=1)
        test_diff_values = self.difference(self.test_raw, "test", 1)
        
        # transform data to be supervised learning
        train_supervised_pd = self.timeseries_to_supervised(train_diff_values, 1)
        train = train_supervised_pd.values

        test_supervised_pd = self.timeseries_to_supervised(test_diff_values, 1)
        test = test_supervised_pd.values
        
        # transform the scale of the data
        scaler, train_scaled, test_scaled = self.scale(train, test)
        self.scaler = scaler
        
        print("size of train_raw data: ", len(self.train_raw))
        display(self.train_raw)
        print("size of diff train data: ", len(train_diff_values))
        display(train_diff_values)
        print("size of supervised train data: ", len(train))
        display(train)
        print("size of supervised train_scaled data: ", len(train_scaled))
        display(train_scaled)
        print("size of test_raw data: ", len(self.test_raw))
        display(self.test_raw)
        print("size of diff test data: ", len(test_diff_values))
        display(test_diff_values)
        print("size of supervised test data: ", len(test))
        display(test)
        print("size of supervised test_scaled data: ", len(test_scaled))
        display(test_scaled)
        
        return train_scaled, test_scaled
        
    def train_lstm_model(self, train_scaled):        
        # fit the model
        print("Fitting the model")
        self.lstm_model = self.fit_lstm(train_scaled, 1, 3000, 4)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
        self.lstm_model.predict(train_reshaped, batch_size=1)
        print("Finished fitting the model")
        
    def predict(self, test_scaled):
        # walk-forward validation on the test data
        predictions = pd.Series()
        test_index = self.test_raw.index.tolist()
        print("Prediction. Test_scaled")
        #predict the fist share price after the last share price in the training data
        #pred = self.forecast_lstm(1, self.train_scaled[i, 0:-1])
        for i in range(len(test_scaled)):
            # make one-step forecast
            X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
            print("X: ", X, "y: ", y)
            pred = self.forecast_lstm(1, X)
            # invert scaling
            pred = self.invert_scale(X, pred)
            # invert differencing
            pred = self.inverse_difference(self.test_raw, pred, len(test_scaled)-i)
            # store forecast
            predictions.set_value(test_index[i], pred)
            
            expected = self.invert_scale(X, y)
            expected = self.inverse_difference(self.test_raw, expected, len(test_scaled)-i)
            
            exp = self.test_raw[test_index[i]]
            print('Predicted=%f, Expected Raw = %f' % (pred, exp))
        display("len(test)", len(test_scaled))
        display("len(predictions)", len(predictions))

        return predictions
    
    def score(self, predictions):
        rmse = sqrt(mean_squared_error(self.test_raw, predictions))
        return rmse
    

In [None]:
amzn_lstm = CompanyLSTM("AMZN", pd_processed[get_relevant_columns_for_company("AMZN")])
start_train_date = "01/01/2017"
end_train_start_test_date = "01/07/2018"
end_test_date = "01/08/2018"

train_scaled, test_scaled = amzn_lstm.preprocess_train_test_data(start_train_date, end_train_start_test_date, end_test_date)
amzn_lstm.train_lstm_model(train_scaled)
predictions = amzn_lstm.predict(test_scaled)

size of train_raw data:  369


2       753.67
3       757.18
4       780.45
5       795.99
6       796.92
7       795.90
8       799.02
9       813.64
10      817.14
12      809.72
13      807.48
14      809.04
15      808.33
16      817.88
17      822.44
18      836.52
19      839.15
20      835.77
23      830.38
24      823.48
25      832.35
26      839.95
27      810.20
29      807.64
30      812.50
31      819.71
32      821.36
33      827.46
36      836.53
37      836.39
        ...   
443    1574.37
444    1585.46
445    1581.40
446    1601.86
447    1603.07
448    1610.15
450    1612.87
451    1624.89
452    1629.62
453    1641.54
456    1665.27
457    1696.35
458    1695.75
459    1689.30
460    1683.99
461    1689.12
462    1698.75
463    1704.86
464    1723.86
465    1715.97
466    1723.79
467    1734.78
468    1750.08
469    1730.22
470    1715.67
472    1663.15
473    1691.09
474    1660.51
475    1701.45
476    1699.80
Name: AMZN, Length: 369, dtype: float64

size of diff train data:  368


0       3.51
1      23.27
2      15.54
3       0.93
4      -1.02
5       3.12
6      14.62
7       3.50
8      -7.42
9      -2.24
10      1.56
11     -0.71
12      9.55
13      4.56
14     14.08
15      2.63
16     -3.38
17     -5.39
18     -6.90
19      8.87
20      7.60
21    -29.75
22     -2.56
23      4.86
24      7.21
25      1.65
26      6.10
27      9.07
28     -0.14
29      6.31
       ...  
338    -7.39
339    11.09
340    -4.06
341    20.46
342     1.21
343     7.08
344     2.72
345    12.02
346     4.73
347    11.92
348    23.73
349    31.08
350    -0.60
351    -6.45
352    -5.31
353     5.13
354     9.63
355     6.11
356    19.00
357    -7.89
358     7.82
359    10.99
360    15.30
361   -19.86
362   -14.55
363   -52.52
364    27.94
365   -30.58
366    40.94
367    -1.65
Length: 368, dtype: float64

size of supervised train data:  368


array([[ 0.0000e+00,  3.5100e+00],
       [ 3.5100e+00,  2.3270e+01],
       [ 2.3270e+01,  1.5540e+01],
       [ 1.5540e+01,  9.3000e-01],
       [ 9.3000e-01, -1.0200e+00],
       [-1.0200e+00,  3.1200e+00],
       [ 3.1200e+00,  1.4620e+01],
       [ 1.4620e+01,  3.5000e+00],
       [ 3.5000e+00, -7.4200e+00],
       [-7.4200e+00, -2.2400e+00],
       [-2.2400e+00,  1.5600e+00],
       [ 1.5600e+00, -7.1000e-01],
       [-7.1000e-01,  9.5500e+00],
       [ 9.5500e+00,  4.5600e+00],
       [ 4.5600e+00,  1.4080e+01],
       [ 1.4080e+01,  2.6300e+00],
       [ 2.6300e+00, -3.3800e+00],
       [-3.3800e+00, -5.3900e+00],
       [-5.3900e+00, -6.9000e+00],
       [-6.9000e+00,  8.8700e+00],
       [ 8.8700e+00,  7.6000e+00],
       [ 7.6000e+00, -2.9750e+01],
       [-2.9750e+01, -2.5600e+00],
       [-2.5600e+00,  4.8600e+00],
       [ 4.8600e+00,  7.2100e+00],
       [ 7.2100e+00,  1.6500e+00],
       [ 1.6500e+00,  6.1000e+00],
       [ 6.1000e+00,  9.0700e+00],
       [ 9.0700e+00,

size of supervised train_scaled data:  368


array([[-0.26080345, -0.22636975],
       [-0.22636975, -0.03252072],
       [-0.03252072, -0.10835336],
       [-0.10835336, -0.25167999],
       [-0.25167999, -0.27080983],
       [-0.27080983, -0.23019571],
       [-0.23019571, -0.11737872],
       [-0.11737872, -0.22646785],
       [-0.22646785, -0.33359494],
       [-0.33359494, -0.28277824],
       [-0.28277824, -0.24549958],
       [-0.24549958, -0.26776868],
       [-0.26776868, -0.1671163 ],
       [-0.1671163 , -0.21606906],
       [-0.21606906, -0.12267622],
       [-0.12267622, -0.2350027 ],
       [-0.2350027 , -0.29396184],
       [-0.29396184, -0.31368029],
       [-0.31368029, -0.32849365],
       [-0.32849365, -0.17378722],
       [-0.17378722, -0.18624614],
       [-0.18624614, -0.5526561 ],
       [-0.5526561 , -0.2859175 ],
       [-0.2859175 , -0.21312601],
       [-0.21312601, -0.1900721 ],
       [-0.1900721 , -0.24461667],
       [-0.24461667, -0.2009614 ],
       [-0.2009614 , -0.17182518],
       [-0.17182518,

size of test_raw data:  22


479    1713.78
480    1693.96
481    1699.73
482    1710.63
483    1739.02
484    1743.07
485    1755.00
486    1796.62
487    1813.03
488    1822.49
489    1843.93
490    1842.92
491    1812.97
492    1813.70
493    1802.00
494    1829.24
495    1863.61
496    1808.00
497    1817.27
500    1779.22
501    1777.44
502    1797.17
Name: AMZN, dtype: float64

size of diff test data:  22


0    -19.82
1      5.77
2     10.90
3     28.39
4      4.05
5     11.93
6     41.62
7     16.41
8      9.46
9     21.44
10    -1.01
11   -29.95
12     0.73
13   -11.70
14    27.24
15    34.37
16   -55.61
17     9.27
18   -38.05
19    -1.78
20    19.73
21     0.00
dtype: float64

size of supervised test data:  22


array([[  0.  , -19.82],
       [-19.82,   5.77],
       [  5.77,  10.9 ],
       [ 10.9 ,  28.39],
       [ 28.39,   4.05],
       [  4.05,  11.93],
       [ 11.93,  41.62],
       [ 41.62,  16.41],
       [ 16.41,   9.46],
       [  9.46,  21.44],
       [ 21.44,  -1.01],
       [ -1.01, -29.95],
       [-29.95,   0.73],
       [  0.73, -11.7 ],
       [-11.7 ,  27.24],
       [ 27.24,  34.37],
       [ 34.37, -55.61],
       [-55.61,   9.27],
       [  9.27, -38.05],
       [-38.05,  -1.78],
       [ -1.78,  19.73],
       [ 19.73,   0.  ]])

size of supervised test_scaled data:  22


array([[-0.26080345, -0.45524109],
       [-0.45524109, -0.20419875],
       [-0.20419875, -0.15387257],
       [-0.15387257,  0.01770736],
       [ 0.01770736, -0.22107225],
       [-0.22107225, -0.14376809],
       [-0.14376809,  0.14749595],
       [ 0.14749595, -0.09981851],
       [-0.09981851, -0.16799922],
       [-0.16799922, -0.05047334],
       [-0.05047334, -0.27071173],
       [-0.27071173, -0.55461814],
       [-0.55461814, -0.25364203],
       [-0.25364203, -0.37558248],
       [-0.37558248,  0.00642566],
       [ 0.00642566,  0.0763722 ],
       [ 0.0763722 , -0.80634718],
       [-0.80634718, -0.16986315],
       [-0.16986315, -0.63408054],
       [-0.63408054, -0.27826556],
       [-0.27826556, -0.06724874],
       [-0.06724874, -0.26080345]])

Fitting the model


In [None]:
# test
#display(amzn_lstm.train_raw)
#display(amzn_lstm.test_raw)
#display(predictions)

# report performance
print('LSTM test RMSE: %.3f' % amzn_lstm.score(predictions))
print('Baseline test RMSE: %.3f' % amzn_lstm.score(amzn_lstm.baseline_predict(end_train_start_test_date, end_test_date)))

# line plot of observed vs predicted

train_index = amzn_lstm.train_raw.index.tolist()
test_index = amzn_lstm.test_raw.index.tolist()

#display(raw_values)
formatter = matplotlib.dates.DateFormatter('%d/%m/%Y')            
plt.plot(amzn.converted_dates[test_index], predictions, ':', marker= 'x', label="Predicted prices")
plt.plot(amzn.converted_dates[set(train_index + test_index)], amzn.get_share_prices(start_train_date, end_test_date), '-', marker= ".", label="Actual prices")
ax = pyplot.gcf().axes[0] 
ax.xaxis.set_major_formatter(formatter)
ax.legend()
plt.gcf().autofmt_xdate(rotation=25)
plt.gcf().set_size_inches(15, 10)
plt.xlabel("Time")
plt.ylabel("Share Price")
plt.show()
