In [1]:
# Normalise features
# Find the optimal parameters C (inverse regulalisation strengths)
# Find the best features to predict stock movement (up or down)
# Convert probability into absolute values (stock % increase/decrease)
# SVC, RandomForest, MinMax, LongShortTermMemory

%matplotlib inline
%pylab inline

# Nice Formatting within Jupyter Notebook
%matplotlib inline
from IPython.display import display # Allows multiple displays from a single code-cell

import os
import sys
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib
import matplotlib.pyplot as plt
import datetime
import math

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from collections import OrderedDict
from sklearn.metrics import mean_squared_error
from alpha_vantage.timeseries import TimeSeries


Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.


In [2]:
class Company(object):
    def __init__(self, name):
        self.name = name
        # Create object to request data from Alpha Vantage
        self.time_series = TimeSeries(key='3OMS720IM6CRC3SV', output_format='pandas', indexing_type='date')
        data, metadata = self.time_series.get_daily_adjusted(symbol=name, outputsize='full')
        # Convert index of the DataFrame which is in the date string format into datetime
        data.index = pd.to_datetime(data.index)
        self.converted_dates = data.index # DateTimeIndex64
        self.share_prices_series = data["5. adjusted close"] # Series
    
    def convert_date_string_to_datetime(self, date_string):
        date_day, date_month, date_year = date_string.split("/")
        return datetime.datetime(int(date_year), int(date_month), int(date_day), 0, 0)
    
    # Get share prices within the range
    def get_share_prices(self, start_date_string=None, end_date_string=None, start_delay=None):
        # When no date parameters are passed
        if start_date_string == None and end_date_string == None:
            return non_nan_share_prices
        else:
            # Check whether there needs "days" delay in the returned share prices
            if start_delay != None:
                start_date = self.convert_date_string_to_datetime(start_date_string) + datetime.timedelta(days=start_delay)
            else:
                start_date = self.convert_date_string_to_datetime(start_date_string)
            end_date = self.convert_date_string_to_datetime(end_date_string)
            revelant_dates = self.converted_dates[(self.converted_dates>=start_date) & (self.converted_dates<=end_date)]
            relevant_share_prices = self.share_prices_series[
                (self.share_prices_series.index>=start_date) & (self.share_prices_series.index<=end_date)]
            return relevant_share_prices
        
    # plot function for children classes, if run by parent, error would happen
    def plot(self, predictions):
        # line plot of observed vs predicted
        formatter = matplotlib.dates.DateFormatter('%d/%m/%Y')            
        plt.plot(predictions.index, predictions.values, ':', marker= 'x', label="Predicted prices")
        plt.plot(self.train_raw_series.index.append(self.test_raw_series.index), 
                 np.append(self.train_raw_series.values, self.test_raw_series.values),
                 '-', marker= ".", label="Actual prices")
        ax = pyplot.gcf().axes[0] 
        ax.xaxis.set_major_formatter(formatter)
        ax.legend()
        plt.gcf().autofmt_xdate(rotation=25)
        plt.gcf().set_size_inches(15, 10)
        plt.xlabel("Time")
        plt.ylabel("Share Price ($)")
        plt.title("Stock price prediction for " + self.name)
        plt.show()
        
    # score function for children classes, if run by parent, error would happen
    def score(self, metric, predictions):
        if len(self.test_raw_series) != len(predictions):
            raise ValueError("Len of test data is not equal the length of predicted data")
            
        # predictions and self.test_raw_series are series with index representing its original index in the dataset
        # root mean squared error
        if metric == "rmse":
            rmse = sqrt(mean_squared_error(self.test_raw_series, predictions))
            return rmse
        # trend whether the prediction for the next day is up or down and its accuracy
        elif metric == "trend":
            # first case is special case since the last data input from the training data is used
            price_1_day_before = self.train_raw_series[self.train_raw_series.index.tolist()[-1]]
            correct_counts = 0
            index = self.test_raw_series.index
            for i in range(1, len(self.test_raw_series)):
                if self.test_raw_series[index[i]] > price_1_day_before:
                    true_trend = "up"
                elif self.test_raw_series[index[i]] < price_1_day_before:
                    true_trend = "down"
                else:
                    true_trend = "neutral"
                
                if predictions[index[i]] > price_1_day_before:
                    predicted_trend = "up"
                elif predictions[index[i]] < price_1_day_before:
                    predicted_trend = "down"
                else:
                    predicted_trend = "neutral"
                
                if true_trend == predicted_trend:
                    correct_counts += 1
                #print("Price 1 day before", price_1_day_before)
                #print("Actual price: ", self.test_raw_series[index[i]], " | Predicted price: ", predictions[index[i]])
                #print("Actual trend: ", true_trend, " | Predicted trend: ", predicted_trend)
                # next day
                price_1_day_before = self.test_raw_series[index[i]]
            
            return correct_counts/len(self.test_raw_series)
   

In [3]:
class CompanyBaseline(Company):
    def __init__(self, name, train_start_date_string, train_end_test_start_date_string, test_end_date_string):
        Company.__init__(self, name)
        self.train_raw_series = self.get_share_prices(train_start_date_string, train_end_test_start_date_string)
        self.test_raw_series = self.get_share_prices(train_end_test_start_date_string, test_end_date_string, start_delay=1)

    def train(self):
        pass

    def predict(self):
        #Persistence Model Forecast, basically, the same share price with 1 date timelag
        # e.g. the predicted share price at time t, is t-1
        predictions = self.test_raw_series.shift(1)
        # Special case of the first value is changed to the zero 
        predictions.at[predictions.index[0]] = self.train_raw_series.values[-1]
    
        return predictions
    


In [11]:
start_train_date = "01/01/2017"
end_train_start_test_date = "01/02/2017"
end_test_date = "01/02/2017"

In [12]:
company_baseline = CompanyBaseline("BLK", start_train_date, end_train_start_test_date, end_test_date)
company_baseline.train()
predictions = company_baseline.predict()
company_baseline.plot(predictions)
rmse_score = company_baseline.score(metric="rmse", predictions=predictions)
print("Baseline RMSE Score: %.3f US dollar" % rmse_score)
trend_score = company_baseline.score(metric="trend", predictions=predictions)
print("Baseline trend Score: %.1f %%" % (trend_score*100))

IndexError: index 0 is out of bounds for axis 0 with size 0

In [9]:
class CompanyLSTM(Company):
    def __init__(self, name, train_start_date_string, train_end_test_start_date_string, test_end_date_string):
        Company.__init__(self, name, )
        self.lstm_model = None
        self.scaler = None
        self.train_raw_series = self.get_share_prices(train_start_date_string, train_end_test_start_date_string)
        self.test_raw_series = self.get_share_prices(train_end_test_start_date_string, test_end_date_string, start_delay=1)
        self.train_scaled, self.test_scaled = self.preprocess_data(train_start_date_string, train_end_test_start_date_string, test_end_date_string)

    # adapted from https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/
    def timeseries_to_supervised(self, data, lag=1):
        df = pd.DataFrame(data)
        columns = [df.shift(i) for i in range(1, lag+1)]
        columns.append(df)
        df = pd.concat(columns, axis=1)
        df.fillna(0, inplace=True)
        #print("Timeseries to supervised")
        #display(df)
        return df
    
    # create a differenced series
    def difference(self, series, source, interval=1):
        diff = list()
        for i in range(1, len(series)):
            value = series[i] - series[i-1]
            diff.append(value)
        
        # Last item is special case because there is no next value thus the diff is 
        # 1 size shorter than the original test_raw. We fix this by adding an additional item
        if source == "test":
            diff.append(0)
        return pd.Series(diff)
    
    # invert differenced value
    def inverse_difference(self, history, yhat, interval=1):
        #print("interval", interval)
        #display(history)
        return yhat + history.values[-interval]

    # scale train and test data to [-1, 1]
    def scale(self, train, test):
        # fit scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(train)
        # transform train
        train = train.reshape(train.shape[0], train.shape[1])
        train_scaled = scaler.transform(train)
        # transform test
        test = test.reshape(test.shape[0], test.shape[1])
        test_scaled = scaler.transform(test)
        return scaler, train_scaled, test_scaled

    # inverse scaling for a forecasted value
    def invert_scale(self, X, value):
        new_row = [x for x in X] + [value]
        array = numpy.array(new_row)
        array = array.reshape(1, len(array))
        inverted = self.scaler.inverse_transform(array)
        return inverted[0, -1]

    def preprocess_data(self, train_start_date_string, train_end_test_start_date_string, test_end_date_string):
        # transform data to be stationary
        train_diff_values = self.difference(self.train_raw_series.values, "train", 1)
        test_diff_values = self.difference(self.test_raw_series.values, "test", 1)
        
        # transform data to be supervised learning
        train_supervised_pd = self.timeseries_to_supervised(train_diff_values, 1)
        train = train_supervised_pd.values

        test_supervised_pd = self.timeseries_to_supervised(test_diff_values, 1)
        test = test_supervised_pd.values
        
        # transform the scale of the data
        scaler, train_scaled, test_scaled = self.scale(train, test)
        self.scaler = scaler
        
        print("size of train_raw data: ", len(self.train_raw_series))
        display(self.train_raw_series)
        print("size of diff train data: ", len(train_diff_values))
        display(train_diff_values)
        """
        print("size of supervised train data: ", len(train))
        display(train)
        print("size of supervised train_scaled data: ", len(train_scaled))
        display(train_scaled)
        print("size of test_raw data: ", len(self.test_raw_series))
        display(self.test_raw_series)
        print("size of diff test data: ", len(test_diff_values))
        display(test_diff_values)
        print("size of supervised test data: ", len(test))
        display(test)
        print("size of supervised test_scaled data: ", len(test_scaled))
        display(test_scaled)
        """
        return train_scaled, test_scaled
    
    # fit the model
    def train(self):        
        print("Fitting the model")
        self.lstm_model = self.fit_lstm(self.train_scaled, 1, 3000, 4)
        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = self.train_scaled[:, 0].reshape(len(self.train_scaled), 1, 1)
        self.lstm_model.predict(train_reshaped, batch_size=1)
        print("Finished fitting the model")
        
    # fit an LSTM network to training data
    def fit_lstm(self, train, batch_size, nb_epoch, neurons):
        X, y = train[:, 0:-1], train[:, -1]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        model = Sequential()
        model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        for i in range(nb_epoch):
            model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
            model.reset_states()
        return model

    # make a one-step forecast
    def forecast_lstm(self, batch_size, X):
        X = X.reshape(1, 1, len(X))
        pred = self.lstm_model.predict(X, batch_size=batch_size)
        return pred[0,0]
    
    def predict(self):
        # walk-forward validation on the test data
        predictions = pd.Series()
        test_index = self.test_raw_series.index
        #predict the fist share price after the last share price in the training data
        #pred = self.forecast_lstm(1, self.train_scaled[i, 0:-1])
        for i in range(len(self.test_scaled)):
            # make one-step forecast
            X, y = self.test_scaled[i, 0:-1], self.test_scaled[i, -1]
            #print("X: ", X, "y: ", y)
            pred = self.forecast_lstm(1, X)
            # invert scaling
            pred = self.invert_scale(X, pred)
            # invert differencing
            pred = self.inverse_difference(self.test_raw_series, pred, len(self.test_scaled)-i)
            # store forecast
            predictions.at[test_index[i]] = pred
            
            expected = self.invert_scale(X, y)
            expected = self.inverse_difference(self.test_raw_series, expected, len(self.test_scaled)-i)
            
            exp = self.test_raw_series[test_index[i]]
            #print('Predicted=%f, Expected Raw = %f' % (pred, exp))
        display("len(test)", len(self.test_scaled))
        display("len(predictions)", len(predictions))

        return predictions

    

In [13]:
company_lstm = CompanyLSTM("JD", start_train_date, end_train_start_test_date, end_test_date)

company_lstm.train()
predictions = company_lstm.predict()
company_lstm.plot(predictions)
lstm_score = company_lstm.score(metric="rmse", predictions=predictions)
print("LSTM RMSE Score: %.3f US dollar" % lstm_score)
trend_score = company_lstm.score(metric="trend", predictions=predictions)
print("LSTM trend Score: %.1f %%" % (trend_score*100))

size of train_raw data:  21


date
2017-01-03    25.82
2017-01-04    25.85
2017-01-05    26.30
2017-01-06    26.27
2017-01-09    26.26
2017-01-10    26.90
2017-01-11    26.77
2017-01-12    26.61
2017-01-13    26.84
2017-01-17    27.21
2017-01-18    27.16
2017-01-19    27.75
2017-01-20    27.60
2017-01-23    28.18
2017-01-24    28.54
2017-01-25    28.59
2017-01-26    28.32
2017-01-27    28.36
2017-01-30    28.56
2017-01-31    28.40
2017-02-01    28.13
Name: 5. adjusted close, dtype: float64

size of diff train data:  20


0     0.03
1     0.45
2    -0.03
3    -0.01
4     0.64
5    -0.13
6    -0.16
7     0.23
8     0.37
9    -0.05
10    0.59
11   -0.15
12    0.58
13    0.36
14    0.05
15   -0.27
16    0.04
17    0.20
18   -0.16
19   -0.27
dtype: float64

Fitting the model
Finished fitting the model


IndexError: index -1 is out of bounds for axis 0 with size 0

In [14]:
company_lstm.forecast_lstm(1, company_lstm.train_raw_series.tail(1))

start_train_date = "01/01/2017"
end_train_start_test_date = "01/06/2017"
end_test_date = "01/01/2018"


AttributeError: 'Series' object has no attribute 'reshape'

In [None]:
company_lstm = CompanyLSTM("BLK", pd_processed[get_relevant_columns_for_company("BLK")])
train_scaled, test_scaled = company_lstm.preprocess_data(start_train_date, end_train_start_test_date, end_test_date)
company_lstm.train()
predictions = company_lstm.predict()
company_lstm.plot(predictions)
lstm_score = company_lstm.score(metric="rmse", predictions=predictions)
print("LSTM RMSE Score: %.3f US dollar" % lstm_score)
trend_score = company_lstm.score(metric="trend", predictions=predictions)
print("LSTM trend Score: %.1f %%" % (trend_score*100))

In [None]:
ts = TimeSeries(key='3OMS720IM6CRC3SV', output_format='pandas', indexing_type='date')
data, meta_data = ts.get_daily_adjusted(symbol='MSFT', outputsize='full')

In [None]:
display(type(data))
data.index = pd.to_datetime(data.index)

data['5. adjusted close'].plot()
plt.title('Adjusted Close Times Series for the MSFT stock (daily)')
plt.show()