# A Simple System Design for Forecasting Time Series Stock Data

# Imports

In [1]:
import math
import numpy as np
import pandas as pd
import yfinance as yf

from abc import ABC
from typing import List
from abc import abstractmethod
from dataclasses import dataclass

from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import mean_squared_error

2023-06-16 22:15:41.941811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Processor - Load the stock data

- Retrieve the stock data with `data = yf.download(symbol, start="YYYY-MM-DD", end="YYYY-MM-DD")`
- Specify which column we want to explore

In [2]:
@dataclass
class YFinanceDataProcessor:
    """Fetches data with ``yf.download`` and reshapes it for supervised learning.

    Attributes:
    ticker_symbol -- str (passed to ``yf.download`` as the symbol)
    start_date -- str (passed to ``yf.download`` as ``start``)
    end_date -- str (passed to ``yf.download`` as ``end``)
    data -- the object returned by ``yf.download`` (set in ``__post_init__``)
    """
    ticker_symbol: str
    start_date: str
    end_date: str

    # Window sizes remembered from the most recent call to
    # convert_uts_sequence_to_sml_with_pd. Deliberately left un-annotated so
    # @dataclass does not turn them into constructor fields.
    _prior_observations = None
    _forecasting_step = None

    def __post_init__(self):
        """Downloads the data for the configured ticker and date range."""
        self.data = yf.download(self.ticker_symbol, start=self.start_date, end=self.end_date)

    def filter_by_col(self, col_name: str) -> pd.DataFrame:
        """Returns a univariate dataframe holding every row of one column.

        Parameter:
        col_name -- str (that specifies the column)

        Return:
        -- pd DataFrame (of the univariate ts data)
        """
        # The list-style selector keeps the result a DataFrame instead of a Series.
        return self.data.loc[:, [col_name]]

    def convert_uts_sequence_to_sml_with_pd(self, uts_observations: pd.DataFrame, prior_observations: int, forecasting_step: int) -> pd.DataFrame:
        """Turns a univariate time series into a supervised-learning table.

        Each output row holds ``prior_observations`` lagged values
        (columns ``t-n`` ... ``t-1``) followed by ``forecasting_step`` target
        values (columns ``t``, ``t+1``, ...). Rows that would contain NaN
        because of the shifting are dropped. The input is not modified.

        Parameters:
        uts_observations -- pd DataFrame (of UTS data, one column)
        prior_observations -- py int (number of lagged columns, >= 0)
        forecasting_step -- py int (number of target columns, >= 1)

        Return:
        -- pd DataFrame (of the sml data, with flat string column labels)

        Raises:
        ValueError -- if prior_observations < 0 or forecasting_step < 1
        """
        if prior_observations < 0:
            raise ValueError("prior_observations must be >= 0")
        if forecasting_step < 1:
            # Without at least one target column there is nothing to build.
            raise ValueError("forecasting_step must be >= 1")

        df = pd.DataFrame(uts_observations)

        # input sequence (t-n, ... t-1)
        lags = range(prior_observations, 0, -1)
        shifted = [df.shift(lag) for lag in lags]
        col_names = [f"t-{lag}" for lag in lags]

        # forecast sequence (t, t+1, ... t+n)
        for step in range(forecasting_step):
            shifted.append(df.shift(-step))
            col_names.append("t" if step == 0 else f"t+{step}")

        # Assemble once, after every shifted column has been collected.
        uts_sml_df = pd.concat(shifted, axis=1)
        # Flat list of labels; a nested list would create a MultiIndex.
        uts_sml_df.columns = col_names
        uts_sml_df = uts_sml_df.dropna()

        # Remember the window so train_test_split can be called without
        # relying on notebook-level globals.
        self._prior_observations = prior_observations
        self._forecasting_step = forecasting_step
        return uts_sml_df

    def train_test_split(self, uts_sml_df: pd.DataFrame, prior_observations: int = None, forecasting_step: int = None):
        """Slices a supervised-learning table into inputs, targets and the next input.

        Despite the name, no rows are held out: every row is used for training.

        Parameters:
        uts_sml_df -- pd DataFrame (lag columns first, target columns last)
        prior_observations -- py int or None (number of leading lag columns;
            defaults to the value used in the last conversion on this object)
        forecasting_step -- py int or None (number of trailing target columns;
            defaults to the value used in the last conversion on this object)

        Return:
        -- tuple (X_train, y_train, x_input) of pd DataFrames, where x_input is
           the last row with its first ``forecasting_step`` columns removed

        Raises:
        ValueError -- if a window size is neither passed nor remembered
        """
        if prior_observations is None:
            prior_observations = self._prior_observations
        if forecasting_step is None:
            forecasting_step = self._forecasting_step
        if prior_observations is None or forecasting_step is None:
            raise ValueError(
                "prior_observations and forecasting_step must be passed, or "
                "convert_uts_sequence_to_sml_with_pd must be called first"
            )

        # columns used to make the prediction
        X_train = uts_sml_df.iloc[:, :prior_observations]

        # trailing target column(s)
        y_train = uts_sml_df.iloc[:, -forecasting_step:]

        # Most recent window: the last row, skipping its oldest columns.
        x_input = uts_sml_df.iloc[-1:, forecasting_step:]

        return X_train, y_train, x_input

In [3]:
# Ticker and date window for the download
symbol = "VOO"  # alternative: "BRK-A"
start_date, end_date = "2022-01-01", "2023-06-25"

data_processor = YFinanceDataProcessor(
    ticker_symbol=symbol,
    start_date=start_date,
    end_date=end_date,
)

# Keep only the "Open" column as the univariate series
uts_observations = data_processor.filter_by_col(col_name="Open")
uts_observations

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open
Date,Unnamed: 1_level_1
2022-01-03,437.929993
2022-01-04,440.579987
2022-01-05,438.779999
2022-01-06,430.200012
2022-01-07,430.209991
...,...
2023-06-12,395.980011
2023-06-13,400.019989
2023-06-14,401.609985
2023-06-15,401.000000


In [4]:
# Window sizes: three lagged values in, one value out
prior_observations = 3
forecasting_step = 1

sml_data = data_processor.convert_uts_sequence_to_sml_with_pd(
    uts_observations=uts_observations,
    prior_observations=prior_observations,
    forecasting_step=forecasting_step,
)
sml_data

Unnamed: 0_level_0,t-3,t-2,t-1,t
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-06,437.929993,440.579987,438.779999,430.200012
2022-01-07,440.579987,438.779999,430.200012,430.209991
2022-01-10,438.779999,430.200012,430.209991,425.380005
2022-01-11,430.200012,430.209991,425.380005,427.679993
2022-01-12,430.209991,425.380005,427.679993,433.559998
...,...,...,...,...
2023-06-12,393.670013,391.959991,395.049988,395.980011
2023-06-13,391.959991,395.049988,395.980011,400.019989
2023-06-14,395.049988,395.980011,400.019989,401.609985
2023-06-15,395.980011,400.019989,401.609985,401.000000


In [5]:
# Inputs, targets, and the most recent window to forecast from
train_x, train_y, x_input = data_processor.train_test_split(uts_sml_df=sml_data)

# Machine Learning Models

In [6]:
@dataclass
class Model(ABC):
    """Base class that every concrete forecasting model inherits from.

    Methods decorated with @abstractmethod must be overridden; a subclass that
    leaves any of them out cannot be instantiated (Python raises TypeError).
    Methods without the decorator are inherited as-is by every subclass.

    The abstract signatures below mirror the ones used by the MLP
    implementation so that subclasses share one calling convention.
    """

    @abstractmethod
    def __name__(self):
        """Return a short, human-readable name for the model."""

    @abstractmethod
    def define_model_with_layers(self, prior_observations, forecasting_step):
        """Build the untrained model.

        Parameters:
        prior_observations -- py int (number of input values per sample)
        forecasting_step -- py int (number of values to forecast per sample)
        """

    @abstractmethod
    def fit_model(self, model_to_fit, X_train, y_train):
        """Train ``model_to_fit`` on ``X_train`` / ``y_train`` and return it."""

    @abstractmethod
    def predict(self, fitted_model, x_input):
        """Return the forecast of ``fitted_model`` for ``x_input``."""

    def augment_data(self):
        """Optional hook; does nothing and returns None unless overridden."""
        return None
    
    
    

In [7]:
class MLP(Model):
    """Multilayer perceptron with one hidden Dense layer and one output Dense layer."""

    def __name__(self):
        """Return the model's short name."""
        return "MLP"

    def define_model_with_layers(self, prior_observations, forecasting_step, hidden_units=100):
        """Build and compile the network.

        Parameters:
        prior_observations -- py int (input dimension of the first layer)
        forecasting_step -- py int (number of units in the output layer)
        hidden_units -- py int (number of units in the hidden layer, default 100)

        Return:
        -- tuple (compiled Sequential model, result of ``model.summary()``)
        """
        model = Sequential()
        model.add(Dense(hidden_units, activation='relu', input_dim=prior_observations, name="1-input_layer"))
        model.add(Dense(forecasting_step, name="2-forecasting_layer"))
        model.compile(optimizer='adam', loss='mse')

        # summary() prints the layer table and returns None, so the second
        # element is always None; it is kept because callers unpack two values.
        return model, model.summary()

    def fit_model(self, model_to_fit, X_train, y_train, epochs=2000, verbose=0):
        """Train the model and return it.

        Parameters:
        model_to_fit -- compiled model (as returned by define_model_with_layers)
        X_train -- training inputs
        y_train -- training targets
        epochs -- py int (number of training epochs, default 2000)
        verbose -- py int (Keras verbosity flag, default 0)

        Return:
        -- the same model object, after ``fit`` has been called on it
        """
        model_to_fit.fit(X_train, y_train, epochs=epochs, verbose=verbose)
        return model_to_fit

    def predict(self, fitted_model, x_input):
        """Return the first row of ``fitted_model.predict(x_input)``."""
        yhat = fitted_model.predict(x_input)
        return yhat[0]

In [8]:
# Build the (untrained) network for the chosen window sizes
mlp_model = MLP()
model, model_defined = mlp_model.define_model_with_layers(
    prior_observations=prior_observations,
    forecasting_step=forecasting_step,
)
(model, model_defined)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 1-input_layer (Dense)       (None, 100)               400       
                                                                 
 2-forecasting_layer (Dense)  (None, 1)                101       
                                                                 
Total params: 501
Trainable params: 501
Non-trainable params: 0
_________________________________________________________________


2023-06-16 22:15:45.708656: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(<keras.engine.sequential.Sequential at 0x145489eb0>, None)

In [9]:
# Train on every row of the supervised table
model_fitted = mlp_model.fit_model(model_to_fit=model, X_train=train_x, y_train=train_y)

In [10]:
# Display the fitted model object returned by fit_model
model_fitted

<keras.engine.sequential.Sequential at 0x145489eb0>

In [11]:
# Forecast from the most recent window
model_prediction = mlp_model.predict(fitted_model=model_fitted, x_input=x_input)



In [12]:
# Display the forecast returned by MLP.predict
model_prediction

array([406.0638], dtype=float32)

In [13]:
# Value the forecast is compared against.
# NOTE(review): hard-coded; its source and date are not shown in this notebook - confirm.
ytrue = [393.54]

# Square root of the mean squared error, i.e. RMSE (not MSE). With a single
# sample this equals the absolute error of the forecast.
rmse = math.sqrt(mean_squared_error(ytrue, model_prediction))
rmse

12.523812255859355