In [1]:
# Normalise features
# Find the optimal parameters C (inverse regulalisation strengths)
# Find the best features to predict stock movement (up or down)
# Convert probability into absolute values (stock % increase/decrease)
# SVC, RandomForest, MinMax, LongShortTermMemory

%matplotlib inline
%pylab inline

# Nice Formatting within Jupyter Notebook
%matplotlib inline
from IPython.display import display # Allows multiple displays from a single code-cell

import os
import sys
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib
import matplotlib.pyplot as plt
import datetime
import math

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from collections import OrderedDict
from sklearn.metrics import mean_squared_error


Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
pd_raw = pd.read_csv(os.path.join(os.getcwd(), 'datasets', 'example.csv'))

In [3]:
# Convert all values in the dataframe to numbers, and ignore the date string format
pd_processed = pd_raw.iloc[5:-2 , :].reset_index(drop=True).apply(pd.to_numeric, errors='ignore') 
display(pd_processed)

Unnamed: 0,DATE,AMZN,AMZN.1,AMZN.2,AMZN.3,AMZN.4,AMZN.5,AMZN.6,AMZN.7,AMZN.8,...,AMZN.49,AMZN.50,AMZN.51,AMZN.52,AMZN.53,AMZN.54,AMZN.55,AMZN.56,AMZN.57,AMZN.58
0,01/01/2017,,,,,,,,,,...,,,,,,,,,357687.9900,
1,02/01/2017,,,,,,,,,,...,,,,,,,,,357687.9900,
2,03/01/2017,753.67,,,,,,,,,...,,,,,,,,,359500.5900,
3,04/01/2017,757.18,,,,,,,,,...,,,,,,,,,361174.8600,
4,05/01/2017,780.45,,,,,,,,,...,,,,,,,,,372274.6500,
5,06/01/2017,795.99,,,,,,,,,...,,,,,,,,,379687.2300,
6,09/01/2017,796.92,,,,,,,,,...,,,,,,,,,380130.8400,
7,10/01/2017,795.90,,,,,,,,,...,,,,,,,,,379644.3000,
8,11/01/2017,799.02,,,,,,,,,...,,,,,,,,,381132.5400,
9,12/01/2017,813.64,,,,,,,,,...,,,,,,,,,388106.2800,


In [4]:

class Company(object):
    def __init__(self, name, pdframe):
        self.name = name
        self.share_prices = pdframe[name]
        self.converted_dates = self.preprocess_dates_raw(pdframe["DATE"])
        self.indicator_names_dict = {
            name + ".1" : "Common Shares Outstanding",
            name + ".2" : "Avg. Basic Shares Outstanding",
            name + ".3" : "Avg. Diluted Shares Outstanding",
            name + ".4" : "Revenues",
            name + ".5" : "COGS",
            name + ".6" : "SG&A",
            name + ".7" : "R&D",
            name + ".8" : "EBIT",
            name + ".9" : "EBITDA",
            name + ".10" : "Interest expense, net",
            name + ".11" : "Abnormal Gains/Losses",
            name + ".12" : "Income Taxes",
            name + ".13" : "Net Income from Discontinued Op.",
            name + ".14" : "Net Profit",
            name + ".15" : "Dividends",
            name + ".16" : "Cash and Cash Equivalents",
            name + ".17" : "Receivables",
            name + ".18" : "Current Assets",
            name + ".19" : "Net PP&E",
            name + ".20" : "Intangible Assets",
            name + ".21" : "Goodwill",
            name + ".22" : "Total Noncurrent Assets",
            name + ".23" : "Total Assets",
            name + ".24" : "Short term debt",
            name + ".25" : "Accounts Payable",
            name + ".26" : "Current Liabilities",
            name + ".27" : "Long Term Debt",
            name + ".28" : "Total Noncurrent Liabilities",
            name + ".29" : "Total Liabilities",
            name + ".30" : "Preferred Equity",
            name + ".31" : "Share Capital",
            name + ".32" : "Treasury Stock",
            name + ".33" : "Retained Earnings",
            name + ".34" : "Equity Before Minorities",
            name + ".35" : "Minorities",
            name + ".36" : "Total Equity",
            name + ".37" : "Depreciation & Amortisation",
            name + ".38" : "Change in Working Capital",
            name + ".39" : "Cash From Operating Activities",
            name + ".40" : "Net Change in PP&E & Intangibles",
            name + ".41" : "Cash From Investing Activities",
            name + ".42" : "Cash From Financing Activities",
            name + ".43" : "Net Change in Cash",
            name + ".44" : "Free Cash Flow",
            name + ".45" : "Gross Margin",
            name + ".46" : "Operating Margin",
            name + ".47" : "Net Profit Margin",
            name + ".48" : "Return on Equity",
            name + ".49" : "Return on Assets",
            name + ".50" : "Current Ratio",
            name + ".51" : "Liabilities to Equity Ratio",
            name + ".52" : "Debt to Assets Ratio",
            name + ".53" : "EV / EBITDA",
            name + ".54" : "EV / Sales",
            name + ".55" : "Book to Market",
            name + ".56" : "Operating Income / EV",
            name + ".57" : "Market Capitalisation",
            name + ".58" : "Enterprise Value"
        }
        # Note all indicator data are raw, NaN can be filtered at a later stage
        self.indicator_pdframes_dict = {
            name + ".1" : pdframe[name + ".1"],
            name + ".2" : pdframe[name + ".2"],
            name + ".3" : pdframe[name + ".3"],
            name + ".4" : pdframe[name + ".4"],
            name + ".5" : pdframe[name + ".5"],
            name + ".6" : pdframe[name + ".6"],
            name + ".7" : pdframe[name + ".7"],
            name + ".8" : pdframe[name + ".8"],
            name + ".9" : pdframe[name + ".9"],
            name + ".10" : pdframe[name + ".10"],
            name + ".11" : pdframe[name + ".11"],
            name + ".12" : pdframe[name + ".12"],
            name + ".13" : pdframe[name + ".13"],
            name + ".14" : pdframe[name + ".14"],
            name + ".15" : pdframe[name + ".15"],
            name + ".16" : pdframe[name + ".16"],
            name + ".17" : pdframe[name + ".17"],
            name + ".18" : pdframe[name + ".18"],
            name + ".19" : pdframe[name + ".19"],
            name + ".20" : pdframe[name + ".20"],
            name + ".21" : pdframe[name + ".21"],
            name + ".22" : pdframe[name + ".22"],
            name + ".23" : pdframe[name + ".23"],
            name + ".24" : pdframe[name + ".24"],
            name + ".25" : pdframe[name + ".25"],
            name + ".26" : pdframe[name + ".26"],
            name + ".27" : pdframe[name + ".27"],
            name + ".28" : pdframe[name + ".28"],
            name + ".29" : pdframe[name + ".29"],
            name + ".30" : pdframe[name + ".30"],
            name + ".31" : pdframe[name + ".31"],
            name + ".32" : pdframe[name + ".32"],
            name + ".33" : pdframe[name + ".33"],
            name + ".34" : pdframe[name + ".34"],
            name + ".35" : pdframe[name + ".35"],
            name + ".36" : pdframe[name + ".36"],
            name + ".37" : pdframe[name + ".37"],
            name + ".38" : pdframe[name + ".38"],
            name + ".39" : pdframe[name + ".39"],
            name + ".40" : pdframe[name + ".40"],
            name + ".41" : pdframe[name + ".41"],
            name + ".42" : pdframe[name + ".42"],
            name + ".43" : pdframe[name + ".43"],
            name + ".44" : pdframe[name + ".44"],
            name + ".45" : pdframe[name + ".45"],
            name + ".46" : pdframe[name + ".46"],
            name + ".47" : pdframe[name + ".47"],
            name + ".48" : pdframe[name + ".48"],
            name + ".49" : pdframe[name + ".49"],
            name + ".50" : pdframe[name + ".50"],
            name + ".51" : pdframe[name + ".51"],
            name + ".52" : pdframe[name + ".52"],
            name + ".53" : pdframe[name + ".53"],
            name + ".54" : pdframe[name + ".54"],
            name + ".55" : pdframe[name + ".55"],
            name + ".56" : pdframe[name + ".56"],
            name + ".57" : pdframe[name + ".57"],
            name + ".58" : pdframe[name + ".58"]
        }
    
    def preprocess_dates_raw(self, dates_raw):
        return pd.Series(list(map(datetime.datetime.strptime, dates_raw, len(dates_raw)*['%d/%m/%Y'])))
    
    def series_rows_filter_na(self, series, columns, reset_index): 
        if reset_index == True:
            return series.dropna().reset_index(drop=True)
        else:
            return series.dropna()

    def data_frame_rows_filter_na(self, pdframe, columns, reset_index): 
        # filter weekends share prices
        if reset_index == True:
            return pdframe.dropna(subset=[columns]).reset_index(drop=True)
        else:
            return pdframe.dropna(subset=[columns])
    
    def baseline_predict(self, start_date_string=None, end_date_string=None, plot=False):
        # Filter nan values (weekend share prices are nan since they are not trading!
        non_nan_share_prices = self.series_rows_filter_na(series=self.share_prices, columns="AMZN", reset_index=False)
        non_nan_dates = self.converted_dates[non_nan_share_prices.index]
        
        #Persistence Model Forecast, basically, the same share price with 1 date timelag
        # e.g. the predicted share price at time t, is t-1
        predictions = non_nan_share_prices.shift(1)
        
        #display(predictions)
        # Special case of the first value is changed to the zero 
        predictions.set_value(predictions.index[0], non_nan_share_prices[non_nan_share_prices.index[0]])
        #display(predictions)
 
        # When no date parameters are passed
        if start_date_string == None and end_date_string == None:
            x_axis = non_nan_dates
            plt.title(self.name + "\'s share price in available dates")

        else:
            # start_date and end_date should be in the format of "31/12/2018"
            # prediction excludes the first day of the start date
            start_date = self.convert_date_string_to_datetime(start_date_string) + datetime.timedelta(days=1)
            end_date = self.convert_date_string_to_datetime(end_date_string)
            x_axis = non_nan_dates[(non_nan_dates>=start_date) & (non_nan_dates<=end_date)]
            predictions = predictions[x_axis.index]
            plt.title(self.name + "\'s baseline predicted share price between " + start_date_string + " and " + end_date_string)
        
        if plot:
            formatter = matplotlib.dates.DateFormatter('%d/%m/%Y')            
            plt.plot(x_axis, predictions, '-', label="Predicted prices")
            plt.plot(x_axis, self.share_prices[predictions.index], '-', label="Actual prices")
            ax = pyplot.gcf().axes[0] 
            ax.xaxis.set_major_formatter(formatter)
            ax.legend()
            plt.gcf().autofmt_xdate(rotation=25)
            plt.gcf().set_size_inches(15, 10)
            plt.xlabel("Time")
            plt.ylabel("Share Price")
            plt.show()
        return predictions
        
        
    def get_share_prices(self, start_date_string=None, end_date_string=None, start_delay=None):
        # Filter nan values (weekend share prices are nan since they are not trading!)
        non_nan_share_prices = self.series_rows_filter_na(series=self.share_prices, columns="AMZN", reset_index=False)
        non_nan_dates = self.converted_dates[non_nan_share_prices.index]
        
        # When no date parameters are passed
        if start_date_string == None and end_date_string == None:
            return non_nan_share_prices
        else:
            if start_delay != None:
                start_date = self.convert_date_string_to_datetime(start_date_string) + datetime.timedelta(days=start_delay)
            else:
                start_date = self.convert_date_string_to_datetime(start_date_string)
            end_date = self.convert_date_string_to_datetime(end_date_string)
            revelant_dates = non_nan_dates[(non_nan_dates>=start_date) & (non_nan_dates<=end_date)]
            relevant_share_prices = non_nan_share_prices[revelant_dates.index]
            return relevant_share_prices
    
    def convert_date_string_to_datetime(self, date_string):
        date_day, date_month, date_year = date_string.split("/")
        return datetime.datetime(int(date_year), int(date_month), int(date_day), 0, 0)

    def get_company_attribute_indicator_dict(self, indicator):
        attribute_indicator = dict()
        for i in range(1, 58+1):
            attribute_indicator[company_name + "." + str(i)] = indicator[str(i)]
            
    def show_pairplots(self):
        # Visualise the distribution of each attribute with share price
        selected_attributes = sorted(list(self.indicator_pdframes_dict.keys()))
        i = 5
        while i <= len(selected_attributes):
            sns.pairplot(data=pd_processed, x_vars=selected_attributes[i-5:i], y_vars = 'AMZN', size=3)
            i+=5

        #sns.pairplot(data=pd_processed, x_vars=selected_attributes[:10], y_vars = selected_attributes[:10], size=3)
    
    def get_correlation_between_attributes_and_price(self):
        # A) Calculating correlation coefficients between each attribtue and price
        # Get a list of attributes of the dataset except price remove
        
        selected_attributes = sorted(list(self.indicator_pdframes_dict.keys()))
        selected_attributes.remove("AMZN.1")
        selected_attributes.remove("AMZN.2")
        selected_attributes.remove("AMZN.3")
        
        index_of_report_earnings_release = self.series_rows_filter_na(
            self.indicator_pdframes_dict["AMZN.4"], "AMZN.4", reset_index=False).index

        price_correlation = dict() # dictionary to keep track the correlation of attributes with prices
        nan_corr_attributes = list()
        for attribute in selected_attributes:
            corr = np.corrcoef(self.indicator_pdframes_dict[attribute][index_of_report_earnings_release], 
                               self.share_prices[index_of_report_earnings_release])
            if math.isnan(corr[0][1]):
                nan_corr_attributes.append(self.indicator_names_dict[attribute])
            else:
                price_correlation[corr[0][1]] = attribute

        print("NaN Correlation attributes: ", ", ".join(nan_corr_attributes))
        # sort a dict by value and print out the highest correlating attributes
        correlation_rank = list(price_correlation.keys())

        for key in sorted(correlation_rank, reverse=True): # sort from highest to lowest
            print(str(round(key, 3)) + " " + price_correlation[key] + ": " + self.indicator_names_dict[price_correlation[key]])
            





In [5]:
def get_relevant_columns_for_company(name):
    columns = list()
    columns.append("DATE")
    columns.append(name)
    for i in range(1, 58+1):
        columns.append(name + "." +str(i))
    return columns

amzn = Company("AMZN", pd_processed[get_relevant_columns_for_company("AMZN")])
start_date = "01/08/2017"
end_date = "01/09/2017"


In [6]:
#amzn.get_correlation_between_attributes_and_price()
#amzn.show_pairplots()

In [None]:
class CompanyLSTM(Company):
    def __init__(self, name, pdframe):
        Company.__init__(self, name, pdframe)
        self.lstm_model = None
        self.scaler = None
        self.train_raw = None
        self.test_raw = None
    
    # adapted from https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/
    def timeseries_to_supervised(self, data, lag=1):
        df = pd.DataFrame(data)
        columns = [df.shift(i) for i in range(1, lag+1)]
        columns.append(df)
        df = pd.concat(columns, axis=1)
        df.fillna(0, inplace=True)
        #print("Timeseries to supervised")
        #display(df)
        return df
    
    # create a differenced series
    def difference(self, series, source, interval=1):
        diff = list()
        index = series.index
        for i in range(1, len(series)):
            value = series[index[i]] - series[index[i-1]]
            diff.append(value)
        
        # Last item is special case because there is no next value thus the diff is 
        # 1 size shorter than the original test_raw. We fix this by adding an additional item
        if source == "test":
            diff.append(0)
        return pd.Series(diff)
    
    # invert differenced value
    def inverse_difference(self, history, yhat, interval=1):
        #print("interval", interval)
        #display(history)
        return yhat + history.values[-interval]

    # scale train and test data to [-1, 1]
    def scale(self, train, test):
        # fit scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(train)
        # transform train
        train = train.reshape(train.shape[0], train.shape[1])
        train_scaled = scaler.transform(train)
        # transform test
        test = test.reshape(test.shape[0], test.shape[1])
        test_scaled = scaler.transform(test)
        return scaler, train_scaled, test_scaled

    # inverse scaling for a forecasted value
    def invert_scale(self, X, value):
        new_row = [x for x in X] + [value]
        array = numpy.array(new_row)
        array = array.reshape(1, len(array))
        inverted = self.scaler.inverse_transform(array)
        return inverted[0, -1]

    # fit an LSTM network to training data
    def fit_lstm(self, train, batch_size, nb_epoch, neurons):
        X, y = train[:, 0:-1], train[:, -1]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        model = Sequential()
        model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        for i in range(nb_epoch):
            model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
            model.reset_states()
        return model

    # make a one-step forecast
    def forecast_lstm(self, batch_size, X):
        X = X.reshape(1, 1, len(X))
        pred = self.lstm_model.predict(X, batch_size=batch_size)
        return pred[0,0]
    
    def preprocess_train_test_data(self, train_start_date_string, train_end_test_start_date_string, test_end_date_string):
        # transform data to be stationary
        self.train_raw = self.get_share_prices(train_start_date_string, train_end_test_start_date_string)
        train_diff_values = self.difference(self.train_raw, "train", 1)

        # To avoid overlap between training and test data, the datetime is incremented by 1 day
        self.test_raw = self.get_share_prices(train_end_test_start_date_string, test_end_date_string, start_delay=1)
        test_diff_values = self.difference(self.test_raw, "test", 1)
        
        # transform data to be supervised learning
        train_supervised_pd = self.timeseries_to_supervised(train_diff_values, 1)
        train = train_supervised_pd.values

        test_supervised_pd = self.timeseries_to_supervised(test_diff_values, 1)
        test = test_supervised_pd.values
        
        # transform the scale of the data
        scaler, train_scaled, test_scaled = self.scale(train, test)
        self.scaler = scaler
        
        print("size of train_raw data: ", len(self.train_raw))
        display(self.train_raw)
        print("size of diff train data: ", len(train_diff_values))
        display(train_diff_values)
        print("size of supervised train data: ", len(train))
        display(train)
        print("size of supervised train_scaled data: ", len(train_scaled))
        display(train_scaled)
        print("size of test_raw data: ", len(self.test_raw))
        display(self.test_raw)
        print("size of diff test data: ", len(test_diff_values))
        display(test_diff_values)
        print("size of supervised test data: ", len(test))
        display(test)
        print("size of supervised test_scaled data: ", len(test_scaled))
        display(test_scaled)
        
        return train_scaled, test_scaled
        
    def train_lstm_model(self, train_scaled):        
        # fit the model
        print("Fitting the model")
        self.lstm_model = self.fit_lstm(train_scaled, 1, 3000, 4)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
        self.lstm_model.predict(train_reshaped, batch_size=1)
        print("Finished fitting the model")
        
    def predict(self, test_scaled):
        # walk-forward validation on the test data
        predictions = pd.Series()
        test_index = self.test_raw.index.tolist()
        print("Prediction. Test_scaled")
        #predict the fist share price after the last share price in the training data
        #pred = self.forecast_lstm(1, self.train_scaled[i, 0:-1])
        for i in range(len(test_scaled)):
            # make one-step forecast
            X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
            print("X: ", X, "y: ", y)
            pred = self.forecast_lstm(1, X)
            # invert scaling
            pred = self.invert_scale(X, pred)
            # invert differencing
            pred = self.inverse_difference(self.test_raw, pred, len(test_scaled)-i)
            # store forecast
            predictions.set_value(test_index[i], pred)
            
            expected = self.invert_scale(X, y)
            expected = self.inverse_difference(self.test_raw, expected, len(test_scaled)-i)
            
            exp = self.test_raw[test_index[i]]
            print('Predicted=%f, Expected Raw = %f' % (pred, exp))
        display("len(test)", len(test_scaled))
        display("len(predictions)", len(predictions))

        return predictions
    
    def score(self, predictions):
        rmse = sqrt(mean_squared_error(self.test_raw, predictions))
        return rmse
    

In [None]:
amzn_lstm = CompanyLSTM("AMZN", pd_processed[get_relevant_columns_for_company("AMZN")])
start_train_date = "01/01/2017"
end_train_start_test_date = "01/07/2018"
end_test_date = "01/08/2018"

train_scaled, test_scaled = amzn_lstm.preprocess_train_test_data(start_train_date, end_train_start_test_date, end_test_date)
amzn_lstm.train_lstm_model(train_scaled)
predictions = amzn_lstm.predict(test_scaled)

size of train_raw data:  125


2       753.67
3       757.18
4       780.45
5       795.99
6       796.92
7       795.90
8       799.02
9       813.64
10      817.14
12      809.72
13      807.48
14      809.04
15      808.33
16      817.88
17      822.44
18      836.52
19      839.15
20      835.77
23      830.38
24      823.48
25      832.35
26      839.95
27      810.20
29      807.64
30      812.50
31      819.71
32      821.36
33      827.46
36      836.53
37      836.39
        ...   
124     959.84
125     970.70
126     971.54
127     980.35
128     993.38
129     995.78
133     996.70
134     994.62
135     995.95
136    1006.73
139    1011.34
140    1002.97
141    1010.07
142    1009.94
143     978.31
144     964.83
145     980.79
146     976.47
147     964.17
148     987.71
151     995.17
152     992.59
153    1002.23
154    1001.30
155    1003.74
158     993.98
159     976.78
160     990.33
161     975.93
162     968.00
Name: AMZN, Length: 125, dtype: float64

size of diff train data:  124


0       3.51
1      23.27
2      15.54
3       0.93
4      -1.02
5       3.12
6      14.62
7       3.50
8      -7.42
9      -2.24
10      1.56
11     -0.71
12      9.55
13      4.56
14     14.08
15      2.63
16     -3.38
17     -5.39
18     -6.90
19      8.87
20      7.60
21    -29.75
22     -2.56
23      4.86
24      7.21
25      1.65
26      6.10
27      9.07
28     -0.14
29      6.31
       ...  
94      1.35
95     10.86
96      0.84
97      8.81
98     13.03
99      2.40
100     0.92
101    -2.08
102     1.33
103    10.78
104     4.61
105    -8.37
106     7.10
107    -0.13
108   -31.63
109   -13.48
110    15.96
111    -4.32
112   -12.30
113    23.54
114     7.46
115    -2.58
116     9.64
117    -0.93
118     2.44
119    -9.76
120   -17.20
121    13.55
122   -14.40
123    -7.93
Length: 124, dtype: float64

size of supervised train data:  124


array([[  0.  ,   3.51],
       [  3.51,  23.27],
       [ 23.27,  15.54],
       [ 15.54,   0.93],
       [  0.93,  -1.02],
       [ -1.02,   3.12],
       [  3.12,  14.62],
       [ 14.62,   3.5 ],
       [  3.5 ,  -7.42],
       [ -7.42,  -2.24],
       [ -2.24,   1.56],
       [  1.56,  -0.71],
       [ -0.71,   9.55],
       [  9.55,   4.56],
       [  4.56,  14.08],
       [ 14.08,   2.63],
       [  2.63,  -3.38],
       [ -3.38,  -5.39],
       [ -5.39,  -6.9 ],
       [ -6.9 ,   8.87],
       [  8.87,   7.6 ],
       [  7.6 , -29.75],
       [-29.75,  -2.56],
       [ -2.56,   4.86],
       [  4.86,   7.21],
       [  7.21,   1.65],
       [  1.65,   6.1 ],
       [  6.1 ,   9.07],
       [  9.07,  -0.14],
       [ -0.14,   6.31],
       [  6.31,   1.44],
       [  1.44,   0.93],
       [  0.93,  11.37],
       [ 11.37,  -0.83],
       [ -0.83,  -3.42],
       [ -3.42,  -6.95],
       [ -6.95,   3.4 ],
       [  3.4 ,  -3.6 ],
       [ -3.6 ,   8.04],
       [  8.04,  -4.17],


size of supervised train_scaled data:  124


array([[ 0.14663767,  0.27388073],
       [ 0.27388073,  0.99021207],
       [ 0.99021207,  0.70998731],
       [ 0.70998731,  0.18035164],
       [ 0.18035164,  0.10966105],
       [ 0.10966105,  0.25974261],
       [ 0.25974261,  0.67663585],
       [ 0.67663585,  0.27351822],
       [ 0.27351822, -0.1223491 ],
       [-0.1223491 ,  0.06543411],
       [ 0.06543411,  0.20319014],
       [ 0.20319014,  0.12089904],
       [ 0.12089904,  0.49284031],
       [ 0.49284031,  0.3119449 ],
       [ 0.3119449 ,  0.65706   ],
       [ 0.65706   ,  0.24197934],
       [ 0.24197934,  0.0241073 ],
       [ 0.0241073 , -0.04875838],
       [-0.04875838, -0.10349828],
       [-0.10349828,  0.46818923],
       [ 0.46818923,  0.42214972],
       [ 0.42214972, -0.93184702],
       [-0.93184702,  0.05383361],
       [ 0.05383361,  0.32282037],
       [ 0.32282037,  0.4080116 ],
       [ 0.4080116 ,  0.20645278],
       [ 0.20645278,  0.36777234],
       [ 0.36777234,  0.47543955],
       [ 0.47543955,

size of test_raw data:  21


165     953.66
167     971.40
168     965.14
169     978.76
171     996.47
172     994.13
173    1006.51
174     999.86
175    1001.81
177    1010.04
178    1024.38
179    1026.87
180    1028.70
181    1025.67
183    1038.95
184    1039.87
185    1052.80
186    1046.00
187    1020.04
190     987.78
191     996.19
Name: AMZN, dtype: float64

size of diff test data:  20


0     17.74
1     -6.26
2     13.62
3     17.71
4     -2.34
5     12.38
6     -6.65
7      1.95
8      8.23
9     14.34
10     2.49
11     1.83
12    -3.03
13    13.28
14     0.92
15    12.93
16    -6.80
17   -25.96
18   -32.26
19     8.41
dtype: float64

size of supervised test data:  20


array([[  0.  ,  17.74],
       [ 17.74,  -6.26],
       [ -6.26,  13.62],
       [ 13.62,  17.71],
       [ 17.71,  -2.34],
       [ -2.34,  12.38],
       [ 12.38,  -6.65],
       [ -6.65,   1.95],
       [  1.95,   8.23],
       [  8.23,  14.34],
       [ 14.34,   2.49],
       [  2.49,   1.83],
       [  1.83,  -3.03],
       [ -3.03,  13.28],
       [ 13.28,   0.92],
       [  0.92,  12.93],
       [ 12.93,  -6.8 ],
       [ -6.8 , -25.96],
       [-25.96, -32.26],
       [-32.26,   8.41]])

size of supervised test_scaled data:  20


array([[ 0.14663767,  0.7897408 ],
       [ 0.7897408 , -0.08029726],
       [-0.08029726,  0.64038427],
       [ 0.64038427,  0.78865325],
       [ 0.78865325,  0.06180895],
       [ 0.06180895,  0.5954323 ],
       [ 0.5954323 , -0.09443538],
       [-0.09443538,  0.21732826],
       [ 0.21732826,  0.44498822],
       [ 0.44498822,  0.66648541],
       [ 0.66648541,  0.23690411],
       [ 0.23690411,  0.21297807],
       [ 0.21297807,  0.03679536],
       [ 0.03679536,  0.62805873],
       [ 0.62805873,  0.17998912],
       [ 0.17998912,  0.61537067],
       [ 0.61537067, -0.09987312],
       [-0.09987312, -0.79445351],
       [-0.79445351, -1.0228385 ],
       [-1.0228385 ,  0.4515135 ]])

Fitting the model


In [None]:
# test
#display(amzn_lstm.train_raw)
#display(amzn_lstm.test_raw)
#display(predictions)

# report performance
print('LSTM test RMSE: %.3f' % amzn_lstm.score(predictions))
print('Baseline test RMSE: %.3f' % amzn_lstm.score(amzn_lstm.baseline_predict(end_train_start_test_date, end_test_date)))

# line plot of observed vs predicted

train_index = amzn_lstm.train_raw.index.tolist()
test_index = amzn_lstm.test_raw.index.tolist()

#display(raw_values)
formatter = matplotlib.dates.DateFormatter('%d/%m/%Y')            
plt.plot(amzn.converted_dates[test_index], predictions, ':', marker= 'x', label="Predicted prices")
plt.plot(amzn.converted_dates[set(train_index + test_index)], amzn.get_share_prices(start_train_date, end_test_date), '-', marker= ".", label="Actual prices")
ax = pyplot.gcf().axes[0] 
ax.xaxis.set_major_formatter(formatter)
ax.legend()
plt.gcf().autofmt_xdate(rotation=25)
plt.gcf().set_size_inches(15, 10)
plt.xlabel("Time")
plt.ylabel("Share Price")
plt.show()
