In [1]:
# Normalise features
# Find the optimal parameters C (inverse regularisation strengths)
# Find the best features to predict stock movement (up or down)
# Convert probability into absolute values (stock % increase/decrease)
# SVC, RandomForest, MinMax, LongShortTermMemory

%matplotlib inline
%pylab inline

# Nice Formatting within Jupyter Notebook
%matplotlib inline
from IPython.display import display # Allows multiple displays from a single code-cell
from jupyterthemes import jtplot

#import classes
from company import Company
from onestep_baseline_company import OneStepBaselineCompany
from onestep_lstm_company import OneStepLSTMCompany
from multistep_baseline_company import MultiStepBaselineCompany
from multistep_lstm_company import MultiStepLSTMCompany

jtplot.style(theme='grade3')
jtplot.style(theme='grade3')
jtplot.style(theme='grade3')


Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
import math
import os
from time import time

import numpy as np
import pandas as pd
from alpha_vantage.techindicators import TechIndicators
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from multistep_lstm_company import MultiStepLSTMCompany



# No preprocessing in this stage, see if it is good
class MultiStepLSTMCompanyTechIndicatorsWithDifferencing(MultiStepLSTMCompany):
    def __init__(self, name, train_start_date_string, train_end_test_start_date_string, test_end_date_string,
                 n_lag, n_seq, n_epochs, n_batch, n_neurons, tech_indicators=[]):
        """Store the technical-indicator configuration, then run the parent constructor.

        All arguments except ``tech_indicators`` are forwarded unchanged to
        ``MultiStepLSTMCompany.__init__``.

        :param tech_indicators: list of Alpha Vantage indicator names (for example
            ``["sma", "rsi"]``) to use as extra input features, or the string
            ``"all"`` to use every name in ``self.all_tech_indicators``.
            The default (empty list) means "price only".
        """
        # NOTE(security): the API key used to be hard-coded only. It can now be
        # supplied through the ALPHAVANTAGE_API_KEY environment variable; the
        # literal is kept solely as a fallback so existing runs keep working and
        # should be removed/rotated before this notebook is shared.
        api_key = os.environ.get("ALPHAVANTAGE_API_KEY", '3OMS720IM6CRC3SV')
        self.tech_indicators = TechIndicators(key=api_key, output_format='pandas')
        self.all_tech_indicators = ["ad", "adosc", "adx", "adxr", "apo", "aroon", "aroonosc",
                                    "bbands", "bop", "cci", "cmo", "dema", "dx", "ema", "ht_dcperiod",
                                    "ht_dcphase", "ht_phasor", "ht_sine", "ht_trendline", "ht_trendmode",
                                    "kama", "macd", "macdext", "mama", "mfi", "midpoint", "midprice",
                                    "minus_di", "minus_dm", "mom", "natr", "obv", "plus_di", "plus_dm",
                                    "ppo", "roc", "rocr", "rsi", "sar", "sma", "stoch", "stochf", "stochrsi",
                                    "t3", "tema", "trange", "trima", "trix", "ultsoc", "willr", "wma"]
        if tech_indicators == "all":
            selected = self.all_tech_indicators
        else:
            selected = tech_indicators
        # Copy so the instance never aliases the shared default list (or the caller's list).
        self.input_tech_indicators_list = list(selected)
        # Count the resolved list: len("all") would be 3, not the number of indicators.
        self.n_indicators = len(self.input_tech_indicators_list)
        # Cache for the differenced supervised frame built by preprocess_data().
        self.supervised_pd = None
        # The attributes above are set before delegating; presumably the parent
        # constructor already needs them (TODO confirm against MultiStepLSTMCompany).
        MultiStepLSTMCompany.__init__(self, name, train_start_date_string, train_end_test_start_date_string,
                                      test_end_date_string,
                                      n_lag, n_seq, n_epochs, n_batch, n_neurons)

    def add_tech_indicators_dataframe(self, price_series, indicators):
        combined = price_series
        for ind in indicators:
            print("ind", ind)
            while True:  # try again until success
                try:
                    ind_series = self.get_indicator(ind, self.train_start_date_string, self.test_end_date_string)
                    combined = pd.concat([combined, ind_series], axis=1)
                    break
                except:
                    print("Retrying to download indicator ", ind)
                    pass

        return combined

    def get_indicator(self, ind_name, start, end):
        data, meta_data = getattr(self.tech_indicators, "get_" + ind_name)(self.name, interval="daily")
        data.index = pd.to_datetime(data.index)
        return data

    # create a differenced series
    def difference(self, pd):
        return pd.diff().dropna()

    def preprocess_data(self):
        """Build the scaled train/test matrices and store them on the instance.

        On the first call (``self.supervised_pd is None``) the price series,
        optionally joined with the configured indicators, is turned into a
        supervised frame, reduced by ``drop_irrelevant_y_var``, differenced and
        cached in ``self.supervised_pd``. Every call then filters the cached frame
        to the configured date range, splits it after ``len(self.train_raw_series)``
        rows, scales both parts and sets ``self.scaler``, ``self.train_scaled`` and
        ``self.test_scaled``.

        ``timeseries_to_supervised`` and ``get_filtered_series`` are not defined in
        this class (presumably inherited).
        """
        display("train raw series", self.train_raw_series)
        display("test raw series", self.test_raw_series)
        price_series = self.share_prices_series
        display("price data series", len(price_series), price_series)

        if self.supervised_pd is None:
            if len(self.input_tech_indicators_list) > 0:
                # add additional technical indicators
                combined = self.add_tech_indicators_dataframe(price_series, self.input_tech_indicators_list)
            else:
                combined = price_series

            framed = self.timeseries_to_supervised(combined, self.n_lag, self.n_seq)
            # keep only lagged inputs plus the price targets
            framed = self.drop_irrelevant_y_var(framed)
            display("supervised_pd original", framed)
            framed = self.difference(framed)
            display("supervised_pd after differencing", framed)
            self.supervised_pd = framed

        windowed = self.get_filtered_series(self.supervised_pd, self.train_start_date_string,
                                            self.test_end_date_string)
        # The first len(train_raw_series) rows are training rows, the remainder test rows.
        split_at = len(self.train_raw_series)
        train_part = windowed.values[:split_at]
        test_part = windowed.values[split_at:]

        display("filtered train values", windowed)

        self.scaler, train_part_scaled, test_part_scaled = self.scale(train_part, test_part)
        display("scaled train supervised", train_part_scaled)
        display("scaled test supervised", test_part_scaled)

        self.train_scaled = train_part_scaled
        self.test_scaled = test_part_scaled

    def drop_irrelevant_y_var(self, pd):
        columns_to_drop = list()
        for i in range(self.n_indicators):
            columns_to_drop.append(self.input_tech_indicators_list[i].upper() + "(t)")

        for i in range(self.n_indicators):
            for j in range(1, self.n_seq):
                columns_to_drop.append(self.input_tech_indicators_list[i].upper() + "(t+%d)" % j)
        return pd.drop(columns_to_drop, axis=1)

    # walk-forward forecast over the test set using the fitted LSTM
    def predict(self):
        """Forecast every test row with the fitted LSTM and return the inverted forecasts.

        The model state is first rebuilt via ``reset()``. Each row of
        ``self.test_scaled`` is split into its first
        ``n_lag * (n_indicators + 1)`` values (inputs) and the rest (targets,
        only printed). Raw forecasts are collected in a Series keyed by the
        matching entry of ``self.test_raw_series.index`` and finally passed
        through ``inverse_transform``.

        :return: whatever ``inverse_transform`` returns for the collected forecasts.
        """
        self.reset()
        n_inputs = self.n_lag * (self.n_indicators + 1)
        # walk-forward validation on the test data
        predictions = pd.Series()
        # Index is datetime
        test_index = self.test_raw_series.index
        print("index", test_index)
        for i in range(len(self.test_scaled)):
            # make multi-step forecast
            X, y = self.test_scaled[i, 0:n_inputs], self.test_scaled[i, n_inputs:]
            print("X: ", X, "y: ", y)
            pred = self.forecast_lstm(X)
            print("Prediction: ", pred)
            # store forecast
            print(test_index[i])
            predictions.at[test_index[i]] = pred

        # inverse transform
        # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
        full_series = pd.concat([self.train_raw_series, self.test_raw_series])
        predictions = self.inverse_transform(full_series, predictions, len(self.test_raw_series))
        print("Predictions after inverse transform")
        display(predictions)
        return predictions

    # scale train and test data to [-1, 1]
    def scale(self, train_raw, test_raw):
        """Fit a [-1, 1] min-max scaler on the training data and apply it to both sets.

        Only ``train_raw`` is used for fitting, so transformed test values may
        fall outside [-1, 1].

        :return: ``(scaler, train_scaled, test_scaled)``.
        """
        fitted = MinMaxScaler(feature_range=(-1, 1)).fit(train_raw)
        scaled_train = fitted.transform(train_raw)
        scaled_test = fitted.transform(test_raw)
        return fitted, scaled_train, scaled_test

    # fit an LSTM network to training data
    def fit_lstm(self, train):
        """Build and train a stateful LSTM followed by a Dense output layer.

        The first ``n_lag * (n_indicators + 1)`` columns of ``train`` are the
        inputs, the remaining columns the targets. Inputs are reshaped to
        ``(rows, 1, n_inputs)``. Training runs ``self.n_epochs`` single-epoch
        ``fit`` calls without shuffling, resetting the network state after each.

        :param train: 2-D array (it is sliced as ``train[:, a:b]``).
        :return: the trained ``Sequential`` model.
        """
        n_inputs = self.n_lag * (self.n_indicators + 1)
        features, targets = train[:, :n_inputs], train[:, n_inputs:]
        # [samples, timesteps, features] with a single timestep per sample
        features = features.reshape(features.shape[0], 1, features.shape[1])
        display("train X data", features)
        display("train y data", targets)

        network = Sequential()
        input_shape = (self.n_batch, features.shape[1], features.shape[2])
        network.add(LSTM(self.n_neurons, batch_input_shape=input_shape, stateful=True))
        network.add(Dense(targets.shape[1]))
        network.compile(loss='mean_squared_error', optimizer='adam')

        # One epoch per call so the state can be cleared between epochs.
        for _ in range(self.n_epochs):
            network.fit(features, targets, epochs=1, batch_size=self.n_batch, verbose=0, shuffle=False)
            network.reset_states()
        return network

    # make one forecast with an LSTM,
    def forecast_lstm(self, X):
        # reshape input pattern to [samples, timesteps, features]
        X = X.reshape(1, 1, len(X))
        # make forecast
        forecast = self.lstm_model.predict(X, batch_size=self.n_batch)
        # display("forecast", forecast)
        # convert to array
        return [x for x in forecast[0, :]]

    def reset(self):
        # forecast the entire training dataset to build up state for forecasting
        # reshape training into [samples, timesteps, features]
        self.lstm_model.reset_states()
        X, y = self.train_scaled[:, 0:self.n_lag * (self.n_indicators + 1)], self.train_scaled[:,
                                                                             self.n_lag * (self.n_indicators + 1):]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        self.lstm_model.predict(X, batch_size=self.n_batch)

    # inverse data transform on forecasts
    def inverse_transform(self, series, predictions, n_test):
        # walk-forward validation on the test data
        inverted_predictions = pd.Series()
        pred_index = predictions.index
        for i in range(len(predictions)):
            # create array from forecast
            pred = array([0 for i in range(self.n_lag * (self.n_indicators + 1))] + predictions[i])
            pred = pred.reshape(1, len(pred))
            # display("pred with place holders", pred)
            # invert scaling
            inv_scale = self.scaler.inverse_transform(pred)[0, self.n_lag * (self.n_indicators + 1):]
            # inv_scale = inv_scale[0, :]
            print("Inverse scale  Original Pred: ", pred, "   After Scaling: ", inv_scale)
                        # invert differencing
            # -1 to get the t-1 price
            index = len(series) - n_test + i - 1
            last_ob = series.values[index]
            inv_diff = self.inverse_difference(last_ob, inv_scale)
            # store
            inverted_predictions.at[pred_index[i]] = inv_diff
        return inverted_predictions

    # evaluate the RMSE for each forecast time step
    def score(self, metric, predictions):
        # convert actual tests and predictions to an appropriate list or arrays
        # construct list of rows
        # first item is test data for the next days, hence not taken into account to measure the prediction
        test_values = self.test_raw_series.values[:-self.n_seq+1 if self.n_seq > 1 else len(self.test_raw_series)]
        actual = list()
        for i in range(len(test_values) - self.n_seq + 1):
            next_days_values = test_values[i: i + self.n_seq]
            actual.append(next_days_values)
        actual = np.array(actual)
        # display("actual", actual)

        predictions = np.array(predictions.tolist())
        # display("predicted", predictions)

        if metric == "rmse":
            rmses = list()
            for i in range(self.n_seq):
                # first one is the test data and the next n_seq are predictions
                rmse = math.sqrt(mean_squared_error(actual[:, i], predictions[:, i]))
                print('t+%d RMSE: %f' % ((i + 1), rmse))
                rmses.append(rmse)

            return rmses

        elif metric == "trend":
            # first case is special case since the last data input from the training data is used
            price_1_day_before = self.train_raw_series[-1]
            index = self.test_raw_series.index
            trends = list()
            for i in range(self.n_seq):
                print("\nCalculating trend score for ", i + 1)
                correct_counts = 0
                for j in range(len(predictions)):
                    actual = self.test_raw_series[index[j + i]]
                    if actual > price_1_day_before:
                        true_trend = "up"
                    elif actual < price_1_day_before:
                        true_trend = "down"
                    else:
                        true_trend = "neutral"

                    if predictions[j, i] > price_1_day_before:
                        predicted_trend = "up"
                    elif predictions[j, i] < price_1_day_before:
                        predicted_trend = "down"
                    else:
                        predicted_trend = "neutral"

                    if true_trend == predicted_trend:
                        correct_counts += 1
                    print("Price 1 day before: ", price_1_day_before)
                    print("Actual price: ", actual, " | Predicted price: ", predictions[j, i])
                    print("Actual trend: ", true_trend, " | Predicted trend: ", predicted_trend)
                    # next day
                    price_1_day_before = actual
                price_1_day_before = self.test_raw_series[index[i]]
                print("Correct counts: ", correct_counts, "  Size of test set:", len(self.test_raw_series))
                trends.append(correct_counts / len(self.test_raw_series))
            return trends
        else:
            print(metric, " is not an valid metric. Return NONE")
            return None

In [3]:
# Date strings are day/month/year (the training output starts on 2018-04-02).
start_train_date = "01/04/2018"
end_train_start_test_date = "10/04/2018"
end_test_date = "20/04/2018"

# Price-only model (no technical indicators): 3 lagged inputs, 1-step forecast.
multi_step_lstm_tech_indicator = MultiStepLSTMCompanyTechIndicatorsWithDifferencing(
    "MU",
    start_train_date,
    end_train_start_test_date,
    end_test_date,
    n_lag=3,
    n_seq=1,
    n_epochs=3000,
    n_batch=1,
    n_neurons=4,
    tech_indicators=[],
)

# train() is not defined in this notebook (presumably inherited from MultiStepLSTMCompany).
multi_step_lstm_tech_indicator.train()

'train raw series'

date
2018-04-02    50.06
2018-04-03    51.55
2018-04-04    53.39
2018-04-05    49.84
2018-04-06    48.46
2018-04-09    47.96
2018-04-10    50.48
Name: Share Price, dtype: float64

'test raw series'

date
2018-04-11    50.48
2018-04-12    52.59
2018-04-13    52.23
2018-04-16    51.65
2018-04-17    52.26
2018-04-18    54.01
2018-04-19    51.42
2018-04-20    50.62
Name: Share Price, dtype: float64

'price data series'

5319

date
1998-01-02    13.940
1998-01-05    13.940
1998-01-06    13.440
1998-01-07    13.250
1998-01-08    14.190
1998-01-09    13.750
1998-01-12    14.315
1998-01-13    16.000
1998-01-14    15.565
1998-01-15    15.780
1998-01-16    14.315
1998-01-20    15.030
1998-01-21    16.065
1998-01-22    16.345
1998-01-23    16.030
1998-01-26    15.565
1998-01-27    16.190
1998-01-28    17.315
1998-01-29    16.905
1998-01-30    17.280
1998-02-02    17.625
1998-02-03    18.905
1998-02-04    19.000
1998-02-05    17.440
1998-02-06    18.030
1998-02-09    17.940
1998-02-10    18.500
1998-02-11    18.690
1998-02-12    17.845
1998-02-13    17.530
               ...  
2019-01-10    35.910
2019-01-11    36.010
2019-01-14    34.670
2019-01-15    33.990
2019-01-16    33.580
2019-01-17    33.880
2019-01-18    35.760
2019-01-22    33.870
2019-01-23    34.240
2019-01-24    36.590
2019-01-25    38.960
2019-01-28    38.080
2019-01-29    37.390
2019-01-30    38.240
2019-01-31    38.220
2019-02-01    39.600
2019-02-

data type <class 'pandas.core.series.Series'>


'supervised_pd original'

Unnamed: 0_level_0,Share Price(t-3),Share Price(t-2),Share Price(t-1),Share Price(t)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-01-07,13.940,13.940,13.440,13.250
1998-01-08,13.940,13.440,13.250,14.190
1998-01-09,13.440,13.250,14.190,13.750
1998-01-12,13.250,14.190,13.750,14.315
1998-01-13,14.190,13.750,14.315,16.000
1998-01-14,13.750,14.315,16.000,15.565
1998-01-15,14.315,16.000,15.565,15.780
1998-01-16,16.000,15.565,15.780,14.315
1998-01-20,15.565,15.780,14.315,15.030
1998-01-21,15.780,14.315,15.030,16.065


'supervised_pd after differencing'

Unnamed: 0_level_0,Share Price(t-3),Share Price(t-2),Share Price(t-1),Share Price(t)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-01-08,0.000,-0.500,-0.190,0.940
1998-01-09,-0.500,-0.190,0.940,-0.440
1998-01-12,-0.190,0.940,-0.440,0.565
1998-01-13,0.940,-0.440,0.565,1.685
1998-01-14,-0.440,0.565,1.685,-0.435
1998-01-15,0.565,1.685,-0.435,0.215
1998-01-16,1.685,-0.435,0.215,-1.465
1998-01-20,-0.435,0.215,-1.465,0.715
1998-01-21,0.215,-1.465,0.715,1.035
1998-01-22,-1.465,0.715,1.035,0.280


'filtered train values'

Unnamed: 0_level_0,Share Price(t-3),Share Price(t-2),Share Price(t-1),Share Price(t)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-04-02,-3.16,-0.91,0.65,-2.08
2018-04-03,-0.91,0.65,-2.08,1.49
2018-04-04,0.65,-2.08,1.49,1.84
2018-04-05,-2.08,1.49,1.84,-3.55
2018-04-06,1.49,1.84,-3.55,-1.38
2018-04-09,1.84,-3.55,-1.38,-0.5
2018-04-10,-3.55,-1.38,-0.5,2.52
2018-04-11,-1.38,-0.5,2.52,0.0
2018-04-12,-0.5,2.52,0.0,2.11
2018-04-13,2.52,0.0,2.11,-0.36


'scaled train supervised'

array([[-0.85528757, -0.02040816,  0.55844156, -0.51565074],
       [-0.02040816,  0.55844156, -0.45454545,  0.66062603],
       [ 0.55844156, -0.45454545,  0.87012987,  0.77594728],
       [-0.45454545,  0.87012987,  1.        , -1.        ],
       [ 0.87012987,  1.        , -1.        , -0.28500824],
       [ 1.        , -1.        , -0.19480519,  0.00494234],
       [-1.        , -0.19480519,  0.13172542,  1.        ]])

'scaled test supervised'

array([[-0.19480519,  0.13172542,  1.25231911,  0.16968699],
       [ 0.13172542,  1.25231911,  0.31725417,  0.86490939],
       [ 1.25231911,  0.31725417,  1.10018553,  0.05107084],
       [ 0.31725417,  1.10018553,  0.18367347, -0.0214168 ],
       [ 1.10018553,  0.18367347,  0.10204082,  0.37067545],
       [ 0.18367347,  0.10204082,  0.54359926,  0.74629325],
       [ 0.10204082,  0.54359926,  0.96660482, -0.68369028],
       [ 0.54359926,  0.96660482, -0.64378479, -0.09390445]])

Fitting the model


'train X data'

array([[[-0.85528757, -0.02040816,  0.55844156]],

       [[-0.02040816,  0.55844156, -0.45454545]],

       [[ 0.55844156, -0.45454545,  0.87012987]],

       [[-0.45454545,  0.87012987,  1.        ]],

       [[ 0.87012987,  1.        , -1.        ]],

       [[ 1.        , -1.        , -0.19480519]],

       [[-1.        , -0.19480519,  0.13172542]]])

'train y data'

array([[-0.51565074],
       [ 0.66062603],
       [ 0.77594728],
       [-1.        ],
       [-0.28500824],
       [ 0.00494234],
       [ 1.        ]])

Finished fitting the model, time taken to train: 35.7 s


In [4]:
# Forecast the test period, then score the forecasts twice.
predictions = multi_step_lstm_tech_indicator.predict()

# Fraction of forecasts whose up/down/neutral direction matches the actual move.
trend_score = multi_step_lstm_tech_indicator.score("trend", predictions)
print("LSTM trend Score: ", trend_score)

# Root-mean-squared error per forecast step.
lstm_score = multi_step_lstm_tech_indicator.score("rmse", predictions)
print("LSTM RMSE Score: US dollar", lstm_score)

index DatetimeIndex(['2018-04-11', '2018-04-12', '2018-04-13', '2018-04-16',
               '2018-04-17', '2018-04-18', '2018-04-19', '2018-04-20'],
              dtype='datetime64[ns]', name='date', freq=None)
X:  [-0.19480519  0.13172542  1.25231911] y:  [0.16968699]
Prediction:  [0.46558768]
2018-04-11 00:00:00
X:  [0.13172542 1.25231911 0.31725417] y:  [0.86490939]
Prediction:  [0.7063328]
2018-04-12 00:00:00
X:  [1.25231911 0.31725417 1.10018553] y:  [0.05107084]
Prediction:  [0.96913946]
2018-04-13 00:00:00
X:  [0.31725417 1.10018553 0.18367347] y:  [-0.0214168]
Prediction:  [1.6463481]
2018-04-16 00:00:00
X:  [1.10018553 0.18367347 0.10204082] y:  [0.37067545]
Prediction:  [1.8868376]
2018-04-17 00:00:00
X:  [0.18367347 0.10204082 0.54359926] y:  [0.74629325]
Prediction:  [1.9602857]
2018-04-18 00:00:00
X:  [0.10204082 0.54359926 0.96660482] y:  [-0.68369028]
Prediction:  [1.6106778]
2018-04-19 00:00:00
X:  [ 0.54359926  0.96660482 -0.64378479] y:  [-0.09390445]
Prediction:  [2.

2018-04-11     [51.37805859535932]
2018-04-12      [52.1087200564146]
2018-04-13      [55.0163382512331]
2018-04-16    [56.711666540503494]
2018-04-17      [56.8615521210432]
2018-04-18    [57.694466989040365]
2018-04-19     [58.38340723931788]
2018-04-20     [57.17487506985664]
dtype: object


Calculating trend score for  1
Price 1 day before:  50.48
Actual price:  50.48  | Predicted price:  51.37805859535932
Actual trend:  neutral  | Predicted trend:  up
Price 1 day before:  50.48
Actual price:  52.59  | Predicted price:  52.1087200564146
Actual trend:  up  | Predicted trend:  up
Price 1 day before:  52.59
Actual price:  52.23  | Predicted price:  55.0163382512331
Actual trend:  down  | Predicted trend:  up
Price 1 day before:  52.23
Actual price:  51.65  | Predicted price:  56.711666540503494
Actual trend:  down  | Predicted trend:  up
Price 1 day before:  51.65
Actual price:  52.26  | Predicted price:  56.8615521210432
Actual trend:  up  | Predicted trend:  up
Price 1 day before:  52.26
Actual price:  54.01  | Predicted price:  57.694466989040365
Actual trend:  up  | Predicted trend:  up
Price 1 day before:  54.01
Actual price:  51.42  | Predicted price:  58.38340723931788
Actual trend:  down  | Predicted trend:  up
Price 1 day before:  51.42
Actual price:  50.62  | Pred

In [5]:
# Judging by their names, these calls persist the trained network and the wrapper object.
# NOTE(review): neither method is defined in this notebook (presumably inherited from
# MultiStepLSTMCompany); the destination and file format are not visible here — confirm.
multi_step_lstm_tech_indicator.save_lstm_model()
multi_step_lstm_tech_indicator.save_object()