# A Simple System Design for Forecasting Time Series Stock Data

# Imports

In [26]:
import math
from abc import ABC
from abc import abstractmethod
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import sklearn
import yfinance as yf
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import mean_squared_error

# Data Processor - Load the stock data

1. Retrieve the stock data with `data = yf.download(symbol, start="YYYY-MM-DD", end="YYYY-MM-DD")`
2. Specify what column we're seeking to explore to get the univariate time series (uts)
3. Convert the univariate time series (uts) to supervised machine learning (sml)

In [2]:
@dataclass
class YFinanceDataProcessor:
    """Fetches price history with yfinance and reshapes it for supervised learning.

    Attributes:
    ticker_symbol -- str (ticker passed to `yf.download`)
    start_date -- str (passed as `start` to `yf.download`)
    end_date -- str (passed as `end` to `yf.download`)
    data -- result of `yf.download`, stored by `__post_init__`
    """
    ticker_symbol: str
    start_date: str
    end_date: str

    def __post_init__(self):
        """Downloads the data for the configured ticker and date range right after initialization."""
        self.data = yf.download(self.ticker_symbol, start=self.start_date, end=self.end_date)

    def filter_by_col(self, col_name: str):
        """Returns a filtered univariate dataframe with all the rows.

        Parameter:
        col_name -- str (that specifies the column)

        Return:
        -- pd DataFrame (of the univariate ts data)
        """
        # The list selector keeps the result a one-column DataFrame rather than a Series.
        return self.data.loc[:, [col_name]]

    def convert_uts_sequence_to_sml_with_pd(self, uts_observations: pd.DataFrame, prior_observations: int, forecasting_step: int):
        """Turns a univariate time series (UTS) into a supervised machine learning (SML) table.

        Each output row holds `prior_observations` lagged values followed by
        `forecasting_step` target values. Rows that would contain NaN (the
        edges of the series, where a shift runs out of data) are dropped and
        the index is reset to 0..n-1.

        Parameters:
        uts_observations -- pd DataFrame (of UTS data to transform to SML data with size b rows/length x 1 dimension)
        prior_observations -- py int (number of lagged columns t-n ... t-1 used as inputs)
        forecasting_step -- py int (of how far out to forecast, 1 only the next timestamp, 2 the next two timestamps, ... n the next n timestamps)

        Return:
        -- pd DataFrame (of the sml data, with string column labels "t-n", ..., "t-1", "t", "t+1", ...)

        Raises:
        ValueError -- if forecasting_step is smaller than 1 (there would be no target column)
        """
        if forecasting_step < 1:
            raise ValueError("forecasting_step must be at least 1, got " + str(forecasting_step))

        observations_df = pd.DataFrame(uts_observations)

        # input sequence (t-n, ... t-1): shifting by k aligns each row with the value k steps earlier
        lags = range(prior_observations, 0, -1)
        cols = [observations_df.shift(lag) for lag in lags]
        col_names = ["t-" + str(lag) for lag in lags]

        # forecast sequence (t, t+1, ... t+n): negative shifts pull future values into the row
        for step in range(forecasting_step):
            cols.append(observations_df.shift(-step))
            col_names.append("t" if step == 0 else "t+" + str(step))

        # Assemble once, after every column is known. A flat list of labels is
        # assigned; wrapping it in another list would build a MultiIndex whose
        # labels are 1-tuples such as ("t",).
        uts_sml_df = pd.concat(cols, axis=1)
        uts_sml_df.columns = col_names

        # drop rows with NaN values
        return uts_sml_df.dropna().reset_index(drop=True)

    def train_test_split(self, uts_sml_df: pd.DataFrame, prior_observations: int = None, forecasting_step: int = None):
        """Splits an SML table into training inputs, training targets and the latest input window.

        Parameters:
        uts_sml_df -- pd DataFrame (as returned by convert_uts_sequence_to_sml_with_pd)
        prior_observations -- py int or None (number of leading input columns)
        forecasting_step -- py int or None (number of trailing target columns)

        When both counts are omitted, the number of input columns is taken from
        the column labels that start with "t-". When only one is given, the
        other is the remaining column count.

        Return:
        -- tuple (X_train, y_train, x_input) where X_train is the first
           `prior_observations` columns, y_train the last `forecasting_step`
           columns, and x_input the final row without its first
           `forecasting_step` columns

        Raises:
        ValueError -- if the counts do not add up to the number of columns
        """
        n_cols = uts_sml_df.shape[1]

        if prior_observations is None and forecasting_step is None:
            # Accept both plain string labels and 1-tuple labels from a MultiIndex.
            labels = [label[0] if isinstance(label, tuple) else label for label in uts_sml_df.columns]
            prior_observations = sum(str(label).startswith("t-") for label in labels)
        if prior_observations is None:
            prior_observations = n_cols - forecasting_step
        elif forecasting_step is None:
            forecasting_step = n_cols - prior_observations

        if forecasting_step < 1 or prior_observations < 0 or prior_observations + forecasting_step != n_cols:
            raise ValueError(
                "prior_observations (" + str(prior_observations) + ") and forecasting_step ("
                + str(forecasting_step) + ") do not match a frame with " + str(n_cols) + " columns"
            )

        # columns to use to make the prediction
        X_train = uts_sml_df.iloc[:, :prior_observations]

        # trailing target column(s)
        y_train = uts_sml_df.iloc[:, -forecasting_step:]

        # Make a prediction for this data: the last row's most recent window
        x_input = uts_sml_df.iloc[-1:, forecasting_step:]

        return X_train, y_train, x_input

In [3]:
# Ticker and date range for the download
symbol = "VOO"
start_date = "2022-01-01"
end_date = "2023-06-25"

# Constructing the processor triggers the download (see __post_init__)
data_processor = YFinanceDataProcessor(
    ticker_symbol=symbol,
    start_date=start_date,
    end_date=end_date,
)

# Univariate series: the "Open" column only
uts_observations = data_processor.filter_by_col("Open")
uts_observations

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2022-01-03,437.929993
2022-01-04,440.579987
2022-01-05,438.779999
2022-01-06,430.200012
2022-01-07,430.209991
...,...
2023-06-16,408.690002
2023-06-20,403.489990
2023-06-21,402.309998
2023-06-22,400.279999


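- Beware of the dates. We want to predict the next day, so we hold out the most recent observation and train only on the rows before it.
- `up_to_today`: historical data used for training (every row except the last)
- `tomorrow`: the true label for the next day (the last row, held out for evaluation)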

In [4]:
# Everything except the final row is used as history for training
up_to_today = uts_observations.iloc[:-1]
up_to_today

Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2022-01-03,437.929993
2022-01-04,440.579987
2022-01-05,438.779999
2022-01-06,430.200012
2022-01-07,430.209991
...,...
2023-06-15,401.000000
2023-06-16,408.690002
2023-06-20,403.489990
2023-06-21,402.309998


In [5]:
# The final row is held out as the true value to compare the forecast against
tomorrow = uts_observations.tail(1)
tomorrow

Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2023-06-23,399.329987


In [6]:
# Held-out true value as a (1, 1) NumPy array. It is derived from
# uts_observations rather than from the previous value of `tomorrow`, so
# re-running this cell gives the same result instead of failing on an array.
tomorrow = uts_observations.iloc[-1:].to_numpy()
tomorrow

array([[399.32998657]])

In [7]:
prior_observations = 3  # lagged inputs per sample (t-3, t-2, t-1)
forecasting_step = 1  # number of timestamps to forecast

sml_samples_df = data_processor.convert_uts_sequence_to_sml_with_pd(
    uts_observations=up_to_today,
    prior_observations=prior_observations,
    forecasting_step=forecasting_step,
)
sml_samples_df

Unnamed: 0,t-3,t-2,t-1,t
0,437.929993,440.579987,438.779999,430.200012
1,440.579987,438.779999,430.200012,430.209991
2,438.779999,430.200012,430.209991,425.380005
3,430.200012,430.209991,425.380005,427.679993
4,430.209991,425.380005,427.679993,433.559998
...,...,...,...,...
361,395.980011,400.019989,401.609985,401.000000
362,400.019989,401.609985,401.000000,408.690002
363,401.609985,401.000000,408.690002,403.489990
364,401.000000,408.690002,403.489990,402.309998


- Matrix X shape is [samples, features] $ \Rightarrow $ [366, 3] $ \Rightarrow $ vector x shape is [1, 3]
- vector y shape is [samples, 1] $ \Rightarrow $ [366, 1] $ \Rightarrow $ scalar y shape is [1, 1]

In [8]:
# Feature matrix: the leading lag columns. The width follows prior_observations
# so it stays correct when the window size is changed.
X = sml_samples_df.iloc[:, :prior_observations]
X

Unnamed: 0,t-3,t-2,t-1
0,437.929993,440.579987,438.779999
1,440.579987,438.779999,430.200012
2,438.779999,430.200012,430.209991
3,430.200012,430.209991,425.380005
4,430.209991,425.380005,427.679993
...,...,...,...
361,395.980011,400.019989,401.609985
362,400.019989,401.609985,401.000000
363,401.609985,401.000000,408.690002
364,401.000000,408.690002,403.489990


In [9]:
# Target vector: the last column of the SML frame, for every row
y = sml_samples_df.iloc[:, -1]
y

0      430.200012
1      430.209991
2      425.380005
3      427.679993
4      433.559998
          ...    
361    401.000000
362    408.690002
363    403.489990
364    402.309998
365    400.279999
Name: (t,), Length: 366, dtype: float64

# Machine Learning Models

In [10]:
@dataclass
class Model(ABC):
    """Abstract base class that every concrete forecasting model derives from.

    A subclass can only be instantiated once it overrides every method marked
    with @abstractmethod; otherwise Python raises a TypeError. Methods without
    the decorator are inherited unchanged by all subclasses.
    """

    @abstractmethod
    def __name__(self):
        """Abstract: subclasses provide the model's name. The base body returns None."""

    @abstractmethod
    def define_model_with_layers(self):
        """Abstract: subclasses define the model and its layers. The base body returns None."""

    @abstractmethod
    def fit_model(self):
        """Abstract: subclasses fit the model. The base body returns None."""

    @abstractmethod
    def predict(self):
        """Abstract: subclasses produce a prediction. The base body returns None."""

    def augment_data(self):
        """Shared, overridable method; the base implementation does nothing and returns None."""

In [21]:
class MLP(Model):
    """Multilayer perceptron: one 100-unit ReLU hidden layer followed by a dense output layer."""

    def __name__(self):
        """Returns the short name of this model."""
        return "MLP"

    def define_model_with_layers(self, prior_observations, forecasting_step):
        """Builds and compiles the network, then prints its summary.

        Parameters:
        prior_observations -- py int (number of lagged inputs; used as the input dimension)
        forecasting_step -- py int (number of units in the output layer)

        Return:
        -- tuple (the compiled keras model, the return value of `model.summary()`)
        """
        model = Sequential()
        model.add(Dense(100, activation='relu', input_dim=prior_observations, name="1-input_layer"))
        model.add(Dense(forecasting_step, name="2-forecasting_layer"))
        model.compile(optimizer='adam', loss='mse')

        # summary() prints the layer table. Its return value (None, per the
        # Keras documentation) is passed through so callers that unpack two
        # values keep working.
        return model, model.summary()

    def fit_model(self, model_to_fit, X_train, y_train, epochs=2000, verbose=0):
        """Trains the given model on the training data and returns it.

        Parameters:
        model_to_fit -- keras model (compiled)
        X_train -- training inputs
        y_train -- training targets
        epochs -- py int (number of training epochs; defaults to 2000)
        verbose -- py int (keras verbosity level; defaults to 0)

        Return:
        -- keras model (the same object, after fitting)
        """
        model_to_fit.fit(X_train, y_train, epochs=epochs, verbose=verbose)

        return model_to_fit

    def predict(self, fitted_model, x_input, prior_observations):
        """Forecasts from a single window of observations.

        Parameters:
        fitted_model -- keras model
        x_input -- pd series or other array-like (holding prior_observations values)
        prior_observations -- int

        Returns
        -- the output of `fitted_model.predict` for the window reshaped to (1, prior_observations)
        """
        # np.asarray accepts pandas objects as well as lists and arrays
        x_input = np.asarray(x_input).reshape((1, prior_observations))

        return fitted_model.predict(x_input)

In [12]:
# Create the MLP wrapper; no Keras model exists until define_model_with_layers is called.
mlp_model = MLP()

In [13]:
# Build and compile the network; model_defined receives whatever model.summary() returned
model, model_defined = mlp_model.define_model_with_layers(
    prior_observations=prior_observations,
    forecasting_step=forecasting_step,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 1-input_layer (Dense)       (None, 100)               400       
                                                                 
 2-forecasting_layer (Dense)  (None, 1)                101       
                                                                 
Total params: 501
Trainable params: 501
Non-trainable params: 0
_________________________________________________________________


2023-06-25 13:44:58.080523: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


- Map each vector x sample to each scalar y true sample
- ie:
    1. sample 0 for vector x and scalar y $ \Rightarrow $ vector x [437.929993, 440.579987, 438.779999] maps to scalar y [430.200012]
    2. sample 1 for vector x and scalar y $ \Rightarrow $ vector x [440.579987, 438.779999, 430.200012] maps to scalar y [430.209991]

In [14]:
# Train on every (x, y) pair of the supervised frame
model_fitted = mlp_model.fit_model(model_to_fit=model, X_train=X, y_train=y)

In [15]:
# Display the fitted model object (shows its repr only)
model_fitted

<keras.engine.sequential.Sequential at 0x14c6e1e20>

- Use the last 3 observations (days) as vector x to predict the next day's true scalar y
- ie:
    1. held-out sample for vector x and scalar y $ \Rightarrow $ vector x [403.489990, 402.309998, 400.279999] maps to scalar y [399.329987]

In [19]:
# The trailing prior_observations values of the final row are the most recent
# window (it ends at column "t"), i.e. the input for forecasting the next day.
last_sample = sml_samples_df.iloc[-1]
x_input = last_sample.iloc[-prior_observations:]
x_input

t-2    403.489990
t-1    402.309998
t      400.279999
Name: 365, dtype: float64

In [20]:
# Show the held-out true value for comparison with the forecast below
tomorrow

array([[399.32998657]])

In [22]:
# Forecast the next value from the most recent window
model_prediction = mlp_model.predict(
    fitted_model=model_fitted,
    x_input=x_input,
    prior_observations=prior_observations,
)



In [23]:
# Show the model's forecast
model_prediction

array([[399.91702]], dtype=float32)

In [28]:
# Root-mean-squared error between the held-out true value and the forecast.
# The square root makes this an RMSE (same units as the price), not an MSE.
rmse = math.sqrt(mean_squared_error(tomorrow, model_prediction))
mse = rmse  # original variable name, kept so existing references still resolve
rmse

0.5870361328125