## Loading Data

In [53]:
from pandas_datareader import data as pdr

import yfinance as yf
yf.pdr_override()

df = pdr.get_data_yahoo("IBM", start="2019-01-01", end="2024-01-01")

[*********************100%%**********************]  1 of 1 completed


### Are these features enough? 
Maybe we want to consier deriving other indicators (moving averages, RSI, MACD)

In [54]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,107.084129,110.879539,106.778206,110.143402,85.522606,4434935
2019-01-03,109.493309,109.827919,107.734222,107.944550,83.815262,4546648
2019-01-04,109.856598,112.323135,109.407265,112.160614,87.088905,4683779
2019-01-07,112.332695,113.604210,111.539200,112.954109,87.705017,3923755
2019-01-08,114.397705,115.267685,113.747612,114.560226,88.952118,4982726
...,...,...,...,...,...,...
2023-12-22,161.100006,162.410004,161.000000,162.139999,160.675140,2439800
2023-12-26,162.229996,163.309998,162.050003,163.210007,161.735489,1772400
2023-12-27,163.139999,163.639999,162.679993,163.460007,161.983231,3234600
2023-12-28,163.960007,163.960007,163.399994,163.750000,162.270599,2071300


## Normalize the Data

In [55]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df) 

## Set up Sequences for RNN 
The way I think it makes sense right now is to have the features of the network be all available feature columns (Open, High, Low, Close, Adj Close, and Volume) and then as the label we are targeting be the next day's Abj Close. 

Once this is working the way that it needs to we can then set up a label for predicitng the previous day's. I think that it should be the same work flow. For a feature datapoint we are just trying to predict Adj Close -1 vs Adj Close +1

In [60]:
import numpy as np

def create_sequences(data, seq_length):
    xs = []
    ys = []

    for i in range(len(data)-seq_length-1):
        x = data[i:(i+seq_length)]
        y = data[i+seq_length][4] 
        xs.append(x)
        ys.append(y)

    return np.array(xs), np.array(ys)

seq_length = 5
X, y = create_sequences(scaled_features, seq_length)

In [73]:
print(X.shape,y.shape)

(1252, 5, 6) (1252,)


## Split the Data

In [62]:
split_fraction = 0.8
split = int(split_fraction * len(X))

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [66]:
print(f'X Training set: {X_train.shape}\ny Training set: {y_train.shape}')

X Training set: (1001, 5, 6)
y Training set: (1001,)


## Convert to Torch Tensors and set up Dataloader

In [69]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 64  

train_data = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

test_data = TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [68]:
train_data[1]

(tensor([[-1.9102, -2.0093, -1.9426, -2.0510, -2.0295, -0.1346],
         [-1.8767, -1.7769, -1.7908, -1.6642, -1.8162, -0.0893],
         [-1.6489, -1.6576, -1.5973, -1.5913, -1.7761, -0.3401],
         [-1.4588, -1.5026, -1.3968, -1.4440, -1.6948,  0.0093],
         [-1.3488, -1.4287, -1.3196, -1.3685, -1.6532, -0.3807]]),
 tensor(-1.6000))

## Setting up the RNN Class

In [13]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Pass the output of the last time step to the classifier
        out = self.fc(out[:, -1, :])
        return out

## Define instance of model/training parameters

In [74]:
import torch.optim as optim

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == "cuda":
    print("Running on GPU")
else: print("Running on CPU")

#Model Define
model = SimpleRNN(input_size=6, hidden_size=20, output_size=1).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
model.train()

Running on CPU


SimpleRNN(
  (rnn): RNN(6, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=1, bias=True)
)

## Training

In [75]:
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Calculate loss
        loss = criterion(outputs, labels.unsqueeze(-1))
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.5706
Epoch [2/100], Loss: 0.2600
Epoch [3/100], Loss: 0.1263
Epoch [4/100], Loss: 0.0891
Epoch [5/100], Loss: 0.0919
Epoch [6/100], Loss: 0.0867
Epoch [7/100], Loss: 0.0434
Epoch [8/100], Loss: 0.0218
Epoch [9/100], Loss: 0.0434
Epoch [10/100], Loss: 0.0243
Epoch [11/100], Loss: 0.0278
Epoch [12/100], Loss: 0.0394
Epoch [13/100], Loss: 0.0325
Epoch [14/100], Loss: 0.0227
Epoch [15/100], Loss: 0.0168
Epoch [16/100], Loss: 0.0468
Epoch [17/100], Loss: 0.0193
Epoch [18/100], Loss: 0.0237
Epoch [19/100], Loss: 0.0408
Epoch [20/100], Loss: 0.0193
Epoch [21/100], Loss: 0.0090
Epoch [22/100], Loss: 0.0155
Epoch [23/100], Loss: 0.0175
Epoch [24/100], Loss: 0.0070
Epoch [25/100], Loss: 0.0286
Epoch [26/100], Loss: 0.0136
Epoch [27/100], Loss: 0.0123
Epoch [28/100], Loss: 0.0168
Epoch [29/100], Loss: 0.0124
Epoch [30/100], Loss: 0.0093
Epoch [31/100], Loss: 0.0176
Epoch [32/100], Loss: 0.0228
Epoch [33/100], Loss: 0.0114
Epoch [34/100], Loss: 0.0184
Epoch [35/100], Loss: 0

## Evaluation

In [77]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

model.eval()

actuals = []
predictions = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)

        actuals.extend(labels.cpu().numpy())
        predictions.extend(outputs.cpu().numpy())

actuals = np.array(actuals)
predictions = np.array(predictions)

mae = mean_absolute_error(actuals, predictions)
mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error: {mae:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

Mean Absolute Error: 0.1522
Mean Squared Error: 0.0532
Root Mean Squared Error: 0.2307


### Metrics? 
Not sure what we want to use for evaluating the model.

In [82]:
actuals

array([1.1864595 , 1.1760012 , 1.2166018 , 1.2811924 , 1.1895367 ,
       1.3488573 , 1.3396318 , 1.416524  , 1.4448221 , 1.4626597 ,
       1.4835755 , 1.440514  , 1.1464753 , 1.1593944 , 1.1950716 ,
       1.2356724 , 1.2129112 , 1.1680045 , 0.77984935, 0.77615774,
       0.83213675, 0.797073  , 0.81921816, 0.8991866 , 0.93302053,
       0.8862685 , 0.8653547 , 0.8739658 , 0.8378497 , 0.9530494 ,
       1.0620219 , 0.97857946, 1.0028647 , 0.91568696, 0.91693276,
       0.7108184 , 0.664739  , 0.65352947, 0.6398304 , 0.63484925,
       0.56074727, 0.4916274 , 0.5377068 , 0.58191955, 0.61616707,
       0.49536374, 0.48290992, 0.3652191 , 0.32100734, 0.32910293,
       0.2711911 , 0.18588105, 0.27430454, 0.21141207, 0.35151997,
       0.3907501 , 0.23382914, 0.19148557, 0.3110441 , 0.56136966,
       0.5632383 , 0.5862793 , 0.55576617, 0.6722102 , 0.73261213,
       0.70396817, 0.73759425, 0.63547164, 0.6684739 , 0.63048905,
       0.5134216 , 0.4735686 , 0.48851395, 0.46858695, 0.46609

In [83]:
predictions

array([[ 1.1240916 ],
       [ 1.1663694 ],
       [ 1.129085  ],
       [ 1.1938637 ],
       [ 1.2558901 ],
       [ 1.1665697 ],
       [ 1.307838  ],
       [ 1.323579  ],
       [ 1.3587172 ],
       [ 1.3849747 ],
       [ 1.4227203 ],
       [ 1.4146271 ],
       [ 1.4175854 ],
       [ 1.165248  ],
       [ 1.1585833 ],
       [ 1.1709232 ],
       [ 1.2343991 ],
       [ 1.1923956 ],
       [ 1.1511716 ],
       [ 0.7219816 ],
       [ 0.78020006],
       [ 0.8470139 ],
       [ 0.8804187 ],
       [ 0.8327758 ],
       [ 0.9089691 ],
       [ 0.9451005 ],
       [ 0.87511075],
       [ 0.8691942 ],
       [ 0.87501425],
       [ 0.7949423 ],
       [ 0.90815806],
       [ 1.0413985 ],
       [ 0.9622765 ],
       [ 0.9707163 ],
       [ 0.8961257 ],
       [ 0.89811033],
       [ 0.7156938 ],
       [ 0.66235584],
       [ 0.62851906],
       [ 0.6173052 ],
       [ 0.63694733],
       [ 0.5706381 ],
       [ 0.5025795 ],
       [ 0.5190586 ],
       [ 0.5842765 ],
       [ 0