<a href="https://colab.research.google.com/github/DavisRayM/msft-stock-prediction/blob/main/msft-prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4

Author: Davis Muro

For CPSC 5610

Microsoft Stock Prediction

In [456]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [457]:
np.random.seed(25)
tf.random.set_seed(25)

In [458]:
df = pd.read_csv('data/MSFT.csv')
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,11/7/2016,59.779999,60.52,59.779999,60.419998,55.902321,31664800
1,11/8/2016,60.549999,60.779999,60.150002,60.470001,55.948589,22935400
2,11/9/2016,60.0,60.59,59.200001,60.169998,55.671009,49632500
3,11/10/2016,60.48,60.490002,57.630001,58.700001,54.310928,57822400
4,11/11/2016,58.23,59.119999,58.009998,59.02,54.607002,38767800


## Preprocessing

In [459]:
# Convert `Date` to `DateTime`
df['Date'] = pd.to_datetime(df['Date'])

# Sort dataframe by `Date` (Ascending)
df.sort_values(by='Date', inplace=True, ascending=True)

# Drop Adj Close
df.drop(columns=['Adj Close'], inplace=True)

df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2016-11-07,59.779999,60.52,59.779999,60.419998,31664800
1,2016-11-08,60.549999,60.779999,60.150002,60.470001,22935400
2,2016-11-09,60.0,60.59,59.200001,60.169998,49632500
3,2016-11-10,60.48,60.490002,57.630001,58.700001,57822400
4,2016-11-11,58.23,59.119999,58.009998,59.02,38767800


In [460]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical columns using MinMaxScaler
scaler = MinMaxScaler()
df[['Close', 'High', 'Low', 'Open', 'Volume']] = scaler.fit_transform(df[['Close', 'High', 'Low', 'Open', 'Volume']])

df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2016-11-07,0.00553,0.005148,0.009021,0.008264,0.233481
1,2016-11-08,0.008277,0.006078,0.010356,0.008444,0.149396
2,2016-11-09,0.006315,0.005398,0.006928,0.007366,0.406553
3,2016-11-10,0.008028,0.005041,0.001263,0.002084,0.485441
4,2016-11-11,0.0,0.000143,0.002634,0.003234,0.3019


In [461]:
# Set `Date` as index; It's unique
if (df.duplicated(subset=['Date']).sum() > 0):
    assert False, "Duplicate dates found"

df.set_index('Date', inplace=True)
df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-11-07,0.00553,0.005148,0.009021,0.008264,0.233481
2016-11-08,0.008277,0.006078,0.010356,0.008444,0.149396
2016-11-09,0.006315,0.005398,0.006928,0.007366,0.406553
2016-11-10,0.008028,0.005041,0.001263,0.002084,0.485441
2016-11-11,0.0,0.000143,0.002634,0.003234,0.3019


In [462]:
df.shape

(1259, 5)

## Windowing

In [463]:
train_size = df.shape[0] * 80 // 100
test_size = df.shape[0] - train_size
train_size, test_size

(1007, 252)

In [464]:
# Use a sliding window of N days (e.g., 20 days) to predict the next dayʼs values.
train_data = tf.data.Dataset.from_tensor_slices(df.values[:train_size])
test_data = tf.data.Dataset.from_tensor_slices(df.values[:test_size])
n_steps = 20
window_length = n_steps + 1
train_data = train_data.window(window_length, shift =1, drop_remainder=True )
test_data = test_data.window(window_length, shift =1, drop_remainder=True )

In [465]:
for window in train_data.take(2):
    t = list(window.as_numpy_iterator())
    print(len(t))
    print(t)

21
[array([0.00553018, 0.00514818, 0.00902071, 0.00826387, 0.23348119]), array([0.00827743, 0.00607771, 0.01035579, 0.00844353, 0.14939629]), array([0.00631511, 0.00539844, 0.00692791, 0.00736562, 0.40655291]), array([0.00802769, 0.00504094, 0.00126291, 0.00208394, 0.48544114]), array([0.00000000e+00, 1.42994526e-04, 2.63404403e-03, 3.23369140e-03,
       3.01899987e-01]), array([0.00281861, 0.        , 0.        , 0.        , 0.32656466]), array([0.00035679, 0.0014658 , 0.00375262, 0.00269474, 0.27431574]), array([0.00253318, 0.00207357, 0.00552068, 0.00549728, 0.19175083]), array([0.00777794, 0.00668549, 0.00970629, 0.00905433, 0.23798818]), array([0.00909804, 0.00736476, 0.01089702, 0.00801236, 0.19515877]), array([0.00809904, 0.00675699, 0.01133001, 0.00984479, 0.11777511]), array([0.00981162, 0.00779377, 0.01273725, 0.01077896, 0.15200956]), array([0.00991865, 0.00722175, 0.01071661, 0.00819202, 0.13893071]), array([0.00738547, 0.00518393, 0.01028362, 0.0086591 , 0.00947826]), arr

In [466]:
for window in test_data.take(2):
    t = list(window.as_numpy_iterator())
    print(len(t))
    print(t)

21
[array([0.00553018, 0.00514818, 0.00902071, 0.00826387, 0.23348119]), array([0.00827743, 0.00607771, 0.01035579, 0.00844353, 0.14939629]), array([0.00631511, 0.00539844, 0.00692791, 0.00736562, 0.40655291]), array([0.00802769, 0.00504094, 0.00126291, 0.00208394, 0.48544114]), array([0.00000000e+00, 1.42994526e-04, 2.63404403e-03, 3.23369140e-03,
       3.01899987e-01]), array([0.00281861, 0.        , 0.        , 0.        , 0.32656466]), array([0.00035679, 0.0014658 , 0.00375262, 0.00269474, 0.27431574]), array([0.00253318, 0.00207357, 0.00552068, 0.00549728, 0.19175083]), array([0.00777794, 0.00668549, 0.00970629, 0.00905433, 0.23798818]), array([0.00909804, 0.00736476, 0.01089702, 0.00801236, 0.19515877]), array([0.00809904, 0.00675699, 0.01133001, 0.00984479, 0.11777511]), array([0.00981162, 0.00779377, 0.01273725, 0.01077896, 0.15200956]), array([0.00991865, 0.00722175, 0.01071661, 0.00819202, 0.13893071]), array([0.00738547, 0.00518393, 0.01028362, 0.0086591 , 0.00947826]), arr

In [467]:
def show_dataset(dataset, n):
    dataset = dataset.prefetch(1)
    for tensor in dataset.take(n):
        print(tensor)
train_data = train_data.flat_map(lambda window: window.batch(window_length))
test_data = test_data.flat_map(lambda window: window.batch(window_length))

show_dataset(train_data, 2)

tf.Tensor(
[[5.53018034e-03 5.14818192e-03 9.02071110e-03 8.26386525e-03
  2.33481190e-01]
 [8.27743299e-03 6.07771248e-03 1.03557872e-02 8.44352535e-03
  1.49396291e-01]
 [6.31511323e-03 5.39844111e-03 6.92791334e-03 7.36561863e-03
  4.06552908e-01]
 [8.02768631e-03 5.04093513e-03 1.26290677e-03 2.08393933e-03
  4.85441139e-01]
 [0.00000000e+00 1.42994526e-04 2.63404403e-03 3.23369140e-03
  3.01899987e-01]
 [2.81860986e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
  3.26564660e-01]
 [3.56793194e-04 1.46580383e-03 3.75261943e-03 2.69473984e-03
  2.74315740e-01]
 [2.53317745e-03 2.07356900e-03 5.52068241e-03 5.49728005e-03
  1.91750830e-01]
 [7.77793607e-03 6.68549195e-03 9.70629236e-03 9.05432586e-03
  2.37988181e-01]
 [9.09804092e-03 7.36475975e-03 1.08970190e-02 8.01235619e-03
  1.95158766e-01]
 [8.09904352e-03 6.75699458e-03 1.13300095e-02 9.84479006e-03
  1.17775113e-01]
 [9.81161660e-03 7.79377193e-03 1.27372513e-02 1.07789594e-02
  1.52009555e-01]
 [9.91864529e-03 7.22175092e-

In [468]:
batch_size = 32
train_data = train_data.shuffle(10000).batch(batch_size)
test_data = test_data.shuffle(10000).batch(batch_size)
show_dataset(train_data, 2)

tf.Tensor(
[[[0.12519623 0.12974152 0.12859925 0.13308424 0.14550385]
  [0.13179676 0.13453218 0.13646531 0.13689279 0.16198196]
  [0.13422291 0.13256588 0.13632098 0.13541966 0.15468639]
  ...
  [0.14649636 0.14625861 0.1491304  0.1463064  0.25733193]
  [0.14646067 0.14936898 0.15115103 0.15331274 0.20449543]
  [0.15352505 0.15294411 0.15721295 0.15647456 0.19126054]]

 [[0.08873269 0.08701868 0.09258859 0.09011209 0.13039546]
  [0.08880406 0.08594616 0.09006278 0.08723772 0.14114712]
  [0.0862352  0.08405132 0.09009887 0.0877048  0.08562608]
  ...
  [0.09815184 0.09624253 0.10063505 0.09783702 0.14099011]
  [0.09704581 0.09577777 0.09832575 0.09546566 0.11443653]
  [0.09643928 0.10013941 0.09958864 0.1032265  0.4480126 ]]

 [[0.19669616 0.19538093 0.19903298 0.19851252 0.11266706]
  [0.19787356 0.19527365 0.19694017 0.19409313 0.12821565]
  [0.19252176 0.19527365 0.19643501 0.19793762 0.14203234]
  ...
  [0.18913228 0.18740838 0.17503789 0.17260709 0.51967365]
  [0.16811758 0.1782202

In [469]:
train_data = train_data.map(lambda window: (window[:, :-1, :], window[:, -1, :]))
test_data = test_data.map(lambda window: (window[:, :-1, :], window[:, -1, :]))
show_dataset(train_data, 2)

(<tf.Tensor: shape=(32, 20, 5), dtype=float64, numpy=
array([[[0.61024689, 0.60859459, 0.56895432, 0.5719316 , 0.49100769],
        [0.55969031, 0.56944691, 0.53370136, 0.56097298, 0.50318109],
        [0.52900669, 0.53966606, 0.52291257, 0.51933028, 0.43826078],
        ...,
        [0.53917511, 0.53980909, 0.53954677, 0.53585799, 0.16178835],
        [0.53339514, 0.54663755, 0.53857251, 0.54688848, 0.25432855],
        [0.55394605, 0.5538236 , 0.55582016, 0.55454156, 0.19007383]],

       [[0.28407308, 0.28411569, 0.28725552, 0.2884809 , 0.16307716],
        [0.28535751, 0.28468772, 0.28851845, 0.28434893, 0.13239899],
        [0.28657057, 0.28747632, 0.29126072, 0.2898462 , 0.07267342],
        ...,
        [0.28407308, 0.2836509 , 0.28718334, 0.28528314, 0.18864439],
        [0.28382331, 0.28236386, 0.2861009 , 0.28456456, 0.15351606],
        [0.27686599, 0.27750167, 0.28281735, 0.28100748, 0.089637  ]],

       [[0.44352072, 0.44303026, 0.44807679, 0.44808135, 0.14638136],
      

In [470]:
# Shape your input as (samples, timesteps, features) .
train_data = train_data.prefetch(1)
for X_batch, Y_batch in train_data.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 20, 5) (32, 5)


In [471]:
test_data = test_data.prefetch(1)
for X_batch, Y_batch in test_data.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 20, 5) (32, 5)


## Modelling

In [472]:
from tensorflow.keras.layers import BatchNormalization, Layer
from tensorflow.keras import activations

class BatchNormSimpleRNN(Layer):
    def __init__(self, units, return_sequences=False, **kwargs):
        super().__init__(**kwargs)
        self.simple_rnn = SimpleRNN(units, activation=None, return_sequences=return_sequences)
        self.batch_norm = BatchNormalization()
        self.activation = activations.tanh  # or any other activation you want

    def call(self, inputs, training=None):
        x = self.simple_rnn(inputs)
        x = self.batch_norm(x, training=training)
        x = self.activation(x)
        return x

In [473]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from typing import Tuple, List

def create_model(kind: str = "simple", input_shape: Tuple[int, int, int] = (n_steps, 5), hidden_units: List = [128, 64]):
  model = tf.keras.models.Sequential()

  if kind == "simple":
    model.add(BatchNormSimpleRNN(hidden_units[0], return_sequences=True, input_shape=input_shape))

    for units in hidden_units[1:-1]:
      model.add(BatchNormSimpleRNN(units, return_sequences=True))

    if len(hidden_units) > 1:
      model.add(BatchNormSimpleRNN(hidden_units[-1], return_sequences=False))
  else:
    assert False, "Unsupported kind: " + kind

  model.add(Dense(input_shape[1]))
  model.compile(loss="mse", optimizer="adam", metrics=["mae"])
  return model

In [474]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_mae',
    patience=3,
    restore_best_weights=True
)

In [None]:
simple = create_model(kind="simple", hidden_units=[128, 64])
simple.summary()

  super().__init__(**kwargs)


In [None]:
simple.fit(train_data, epochs=20, validation_data=test_data, callbacks=[early_stopping])

In [None]:
result = pd.DataFrame(simple.history.history)
result.head()

In [None]:
result[['mae', 'val_mae']].rolling(window=3).mean().plot(title="Mean Absolute Error vs Epoch")

In [None]:
result[['loss','val_loss']].plot(title="Loss vs Epoch")