In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Visualization
from sklearn.preprocessing import MinMaxScaler
import torch # Library for implementing Deep Neural Network
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
from copy import deepcopy as dc
import math
# import ray
# from ray import tune
# from ray.tune.schedulers import ASHAScheduler

# Seed every random number generator the notebook relies on. Seeding only
# Python's `random` leaves NumPy and PyTorch unseeded, and PyTorch's generator
# is the one that drives weight initialisation and DataLoader shuffling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Setup device-agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
# Download the daily price history for Apple Inc. (ticker AAPL)
import yfinance as yf
from datetime import date, timedelta, datetime

# Retrieval window: from the start of 1990 up to the current date (YYYY-MM-DD)
start_date = '1990-01-01'
end_date = date.today().isoformat()

# yf.download is the call that fetches the price table
df = yf.download('AAPL', end=end_date, start=start_date)

# Keep the closing price only, as a single-column DataFrame
data = df[["Close"]]
data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
1990-01-02,0.332589
1990-01-03,0.334821
1990-01-04,0.335938
1990-01-05,0.337054
1990-01-08,0.339286
...,...
2024-06-12,213.070007
2024-06-13,214.240005
2024-06-14,212.490005
2024-06-17,216.669998


In [None]:
def prepare_dataframe_for_lstm(df, n_steps, column='Close'):
    """Build a table of lagged values of one column for sequence models.

    For every lag ``i`` in ``1..n_steps`` a column named ``'<column>(t-i)'`` is
    appended that holds ``df[column]`` shifted down by ``i`` rows. Rows that
    contain any missing value afterwards are dropped; this includes the first
    ``n_steps`` rows, for which the lags do not exist.

    Args:
        df: DataFrame containing ``column``. It is not modified.
        n_steps: Number of lag columns to create (non-negative integer).
        column: Name of the column to lag. Defaults to ``'Close'``.

    Returns:
        A new DataFrame holding the original columns followed by the lag
        columns, ordered from lag 1 to lag ``n_steps``.

    Raises:
        ValueError: If ``n_steps`` is negative.
    """
    if n_steps < 0:
        raise ValueError(f"n_steps must be non-negative, got {n_steps}")

    source = df[column]
    lagged = [source.shift(i).rename(f'{column}(t-{i})') for i in range(1, n_steps + 1)]

    # A single concat replaces n_steps separate column insertions, which keeps
    # the frame from being grown one column at a time for large windows.
    framed = pd.concat([df, *lagged], axis=1)

    return framed.dropna()

# Number of previous closing prices attached to every row as lag columns
lookback = 50
shifted_df = prepare_dataframe_for_lstm(df=data, n_steps=lookback)

shifted_df

Unnamed: 0_level_0,Close,Close(t-1),Close(t-2),Close(t-3),Close(t-4),Close(t-5),Close(t-6),Close(t-7),Close(t-8),Close(t-9),...,Close(t-41),Close(t-42),Close(t-43),Close(t-44),Close(t-45),Close(t-46),Close(t-47),Close(t-48),Close(t-49),Close(t-50)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-03-14,0.330357,0.329241,0.327009,0.329241,0.328125,0.315848,0.314732,0.308036,0.301339,0.305804,...,0.305804,0.308036,0.308036,0.321429,0.335938,0.339286,0.337054,0.335938,0.334821,0.332589
1990-03-15,0.328125,0.330357,0.329241,0.327009,0.329241,0.328125,0.315848,0.314732,0.308036,0.301339,...,0.311384,0.305804,0.308036,0.308036,0.321429,0.335938,0.339286,0.337054,0.335938,0.334821
1990-03-16,0.359375,0.328125,0.330357,0.329241,0.327009,0.329241,0.328125,0.315848,0.314732,0.308036,...,0.296875,0.311384,0.305804,0.308036,0.308036,0.321429,0.335938,0.339286,0.337054,0.335938
1990-03-19,0.378348,0.359375,0.328125,0.330357,0.329241,0.327009,0.329241,0.328125,0.315848,0.314732,...,0.289063,0.296875,0.311384,0.305804,0.308036,0.308036,0.321429,0.335938,0.339286,0.337054
1990-03-20,0.369420,0.378348,0.359375,0.328125,0.330357,0.329241,0.327009,0.329241,0.328125,0.315848,...,0.305804,0.289063,0.296875,0.311384,0.305804,0.308036,0.308036,0.321429,0.335938,0.339286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-12,213.070007,207.149994,193.119995,196.889999,194.479996,195.869995,194.350006,194.029999,192.250000,191.289993,...,172.690002,176.550003,175.039993,167.779999,169.669998,168.449997,169.580002,168.820007,169.649994,168.839996
2024-06-13,214.240005,213.070007,207.149994,193.119995,196.889999,194.479996,195.869995,194.350006,194.029999,192.250000,...,169.380005,172.690002,176.550003,175.039993,167.779999,169.669998,168.449997,169.580002,168.820007,169.649994
2024-06-14,212.490005,214.240005,213.070007,207.149994,193.119995,196.889999,194.479996,195.869995,194.350006,194.029999,...,168.000000,169.380005,172.690002,176.550003,175.039993,167.779999,169.669998,168.449997,169.580002,168.820007
2024-06-17,216.669998,212.490005,214.240005,213.070007,207.149994,193.119995,196.889999,194.479996,195.869995,194.350006,...,167.039993,168.000000,169.380005,172.690002,176.550003,175.039993,167.779999,169.669998,168.449997,169.580002


In [None]:
shifted_df_as_numpy = shifted_df.to_numpy()

# Chronological 80/20 split: the first 80% of rows train, the remainder tests.
train_split = int(len(shifted_df_as_numpy) * 0.8)

# Fit the scaler on the training rows only. Fitting on the full array lets the
# test period's min/max leak into the features the model is trained on.
# Test-period values may therefore fall outside [-1, 1] after scaling.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(shifted_df_as_numpy[:train_split])
shifted_df_as_np = scaler.transform(shifted_df_as_numpy)

# Column 0 is the value to predict (Close); the remaining columns are its lags,
# in the order they were created: Close(t-1) first, Close(t-lookback) last.
X = shifted_df_as_np[:, 1:]
y = shifted_df_as_np[:, 0]

X_train, X_test = X[:train_split], X[train_split:]
y_train, y_test = y[:train_split], y[train_split:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6905, 50), (1727, 50), (6905,), (1727,))

In [None]:
# Add a trailing feature axis of size 1: (samples, lookback) -> (samples, lookback, 1)
X_train, X_test = (windows.reshape((-1, lookback, 1)) for windows in (X_train, X_test))
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Targets become column vectors: (samples,) -> (samples, 1)
y_train, y_test = (targets.reshape((-1, 1)) for targets in (y_train, y_test))

X_train.shape, X_test.shape, y_train.shape, y_test.shape

X_train shape: (6905, 50, 1)
X_test shape: (1727, 50, 1)


((6905, 50, 1), (1727, 50, 1), (6905, 1), (1727, 1))

In [None]:
# Convert every split to a float32 tensor. torch.as_tensor performs the dtype
# conversion directly and, unlike torch.tensor, hands back an existing float32
# tensor unchanged, so re-running this cell neither warns nor re-copies.
X_train, X_test, y_train, y_test = (
    torch.as_tensor(array, dtype=torch.float32)
    for array in (X_train, X_test, y_train, y_test)
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([6905, 50, 1]),
 torch.Size([1727, 50, 1]),
 torch.Size([6905, 1]),
 torch.Size([1727, 1]))

In [None]:
class TimeSeriesDataset(Dataset):
    """Pairs each input window with its target so a DataLoader can batch them.

    Args:
        X: Indexable collection of input windows.
        y: Indexable collection of targets, one per entry of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # A length mismatch would otherwise only surface as an IndexError (or
        # silently dropped targets) part-way through an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of (window, target) pairs."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``i``-th ``(window, target)`` pair."""
        return self.X[i], self.y[i]

# Wrap each split in a Dataset so it can be handed to a DataLoader
train_dataset = TimeSeriesDataset(X=X_train, y=y_train)
test_dataset = TimeSeriesDataset(X=X_test, y=y_test)

train_dataset

<__main__.TimeSeriesDataset at 0x7916c2f32bc0>

In [None]:
class InputEmbedding(nn.Module):
    """Projects scalar observations into the model dimension.

    A single ``nn.Linear(1, dim_model)`` is applied along the last axis and the
    result is multiplied by ``sqrt(dim_model)``.
    """

    def __init__(self, dim_model: int):
        super().__init__()
        self.linear = nn.Linear(1, dim_model)

    def forward(self, x):
        """Embed ``x``, whose last axis must have size 1, e.g. (batch, seq_len, 1)."""
        scale = math.sqrt(self.linear.out_features)
        projected = self.linear(x)
        return projected * scale

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position codes to a sequence, then applies dropout.

    The table is precomputed for ``seq_len`` positions and stored as the buffer
    ``pe`` with shape ``(1, seq_len, dim_model)``: even feature columns hold
    sines, odd feature columns hold cosines.

    Args:
        dim_model: Size of the feature axis. May be even or odd.
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, dim_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pos = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0) / dim_model))
        angles = pos * div_term  # one column per even feature index

        pe = torch.zeros(seq_len, dim_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd dim_model there is one fewer odd column than even columns,
        # so the cosine block is trimmed to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : dim_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the codes for the first ``x.shape[1]`` positions to ``x``.

        ``x`` is indexed as (batch, positions, features); its position count
        must not exceed the ``seq_len`` the table was built with.
        """
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``h`` heads.

    Args:
        dim_model: Size of the feature axis; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``dim_model`` is not divisible by ``h``.

    Mask convention: entries equal to 0 (or ``False``) are blocked, every other
    entry is allowed. Additive masks (0 = allowed, -inf = blocked) do NOT follow
    this convention and must be converted by the caller.
    """

    def __init__(self, dim_model: int, h: int, dropout: float):
        super().__init__()
        # Otherwise the per-head reshape in forward() fails or drops features.
        if dim_model % h != 0:
            raise ValueError(f"dim_model ({dim_model}) must be divisible by h ({h})")
        self.h = h
        self.d_k = dim_model // h
        self.w_q = nn.Linear(dim_model, dim_model, bias=False)
        self.w_k = nn.Linear(dim_model, dim_model, bias=False)
        self.w_v = nn.Linear(dim_model, dim_model, bias=False)
        self.w_o = nn.Linear(dim_model, dim_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def attention(self, query, key, value, mask=None):
        """Return ``(weighted values, attention weights)`` for per-head inputs.

        ``query``/``key``/``value`` are (batch, heads, length, d_k). ``mask`` may be
        2-D ``(batch, key_len)``, 3-D ``(batch, query_len, key_len)`` or any 4-D
        shape that broadcasts against the (batch, heads, query_len, key_len)
        score tensor.
        """
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Lift the mask to 4-D exactly once; masks that are already 4-D are
            # used as given rather than being unsqueezed into 6-D.
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)
            # The most negative finite value behaves like -inf for partially
            # masked rows but keeps a fully masked row from producing NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        p_attn = torch.softmax(scores, dim=-1)
        p_attn = self.dropout(p_attn)
        return torch.matmul(p_attn, value), p_attn

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v``; inputs and output are (batch, length, dim_model).

        The attention weights of the latest call are kept on ``self.attn``.
        """
        batch_size = q.size(0)
        # Project, then split the feature axis into (heads, d_k) and move heads forward.
        q = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        x, self.attn = self.attention(q, k, v, mask)
        # Merge the heads back into a single feature axis.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.w_o(x)

class LayerNormalization(nn.Module):
    """Normalises the last axis, then applies a learned scale and shift.

    The statistics are the mean and ``Tensor.std`` of the last axis; ``eps`` is
    added to the standard deviation before dividing.
    """

    def __init__(self, features: int, eps=1e-6):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(features))   # learned scale
        self.bias = nn.Parameter(torch.zeros(features))   # learned shift
        self.eps = eps

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Maps ``dim_model -> dim_ff -> dim_model`` along the last axis.
    """

    def __init__(self, dim_model: int, dim_ff: int, dropout: float):
        super().__init__()
        self.linear_1 = nn.Linear(dim_model, dim_ff)
        self.linear_2 = nn.Linear(dim_ff, dim_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand, apply ReLU and dropout, then project back to ``dim_model``."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class ResidualConnection(nn.Module):
    """Pre-norm skip connection: ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, features: int, dropout: float):
        super().__init__()
        self.norm = LayerNormalization(features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Run ``sublayer`` on the normalised input and add the result back to ``x``.

        ``sublayer`` is any callable taking one tensor and returning a tensor
        that can be added to ``x``.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, features: int, multi_head_attention: MultiHeadAttention, feed_forward_block: FeedForwardBlock, dropout: float):
        super().__init__()
        self.multi_head_attention = multi_head_attention
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, mask):
        """Apply self-attention (with ``mask``) and the feed-forward block to ``x``."""

        def self_attend(normed):
            # Query, key and value are all the same normalised input.
            return self.multi_head_attention(normed, normed, normed, mask)

        attention_residual, feed_forward_residual = self.residual_connections
        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """Runs the input through every encoder layer in order, then a final normalisation."""

    def __init__(self, layers: nn.ModuleList, features: int):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Pass ``x`` and ``mask`` through each layer and normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward.

    Each of the three sub-layers is wrapped in its own residual connection.
    Self-attention is applied without a mask; only the cross-attention
    receives ``src_mask``.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttention, cross_attention_block: MultiHeadAttention, feed_forward_block: FeedForwardBlock, dropout: float):
        super().__init__()
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, encoder_output, src_mask):
        """Refine ``x`` using itself and ``encoder_output``."""

        def attend_to_self(normed):
            return self.self_attention_block(normed, normed, normed)

        def attend_to_encoder(normed):
            # Queries come from the decoder, keys and values from the encoder.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, feed_forward_residual = self.residual_connections
        x = self_residual(x, attend_to_self)
        x = cross_residual(x, attend_to_encoder)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Runs the target through every decoder layer in order, then a final normalisation."""

    def __init__(self, layers: nn.ModuleList, features: int):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask):
        """Pass ``x`` through each layer together with ``encoder_output`` and ``src_mask``."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear read-out from the model dimension to the output size.

    Args:
        d_model: Size of the incoming feature axis.
        output_size: Size of the produced feature axis.
    """

    def __init__(self, d_model, output_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, output_size)

    # The return annotation used to be ``None`` although a tensor is returned.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear projection along the last axis of ``x``."""
        return self.proj(x)

class Transformer(nn.Module):
    """Encoder/decoder model mapping a window of scalars to a projected output.

    The source window is embedded, position-encoded and encoded. The target is
    embedded (without positional encoding; ``tgt_pos`` is stored but not used
    by ``forward``) and decoded against the encoder output, and the decoder
    output is passed through the projection layer.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbedding, tgt_embed: InputEmbedding, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def forward(self, src, src_mask=None, tgt=None):
        """Run the full model.

        Args:
            src: Source window with a trailing size-1 feature axis, e.g.
                (batch, seq_len, 1).
            src_mask: Optional mask forwarded unchanged to the encoder
                self-attention and the decoder cross-attention. ``None``
                (the default) disables masking.
            tgt: Optional decoder input holding one scalar per decoder
                position, e.g. shape (batch,) for a single position. When
                omitted, a single all-zero start position is used, so the
                model can be called as ``model(src)``.

        Returns:
            The projection of the decoder output, with one entry per decoder
            position: (batch, positions, output_size).
        """
        src = self.src_embed(src)
        src = self.src_pos(src)
        encoder_output = self.encoder(src, src_mask)

        if tgt is None:
            # Constant start position: the decoder then depends on the source only.
            tgt = torch.zeros(src.size(0), 1, 1, dtype=src.dtype, device=src.device)
        else:
            # Give the scalars a trailing feature axis: (batch,) -> (batch, 1, 1),
            # (batch, T) -> (batch, T, 1). Identical to the previous double
            # unsqueeze for 1-D input.
            tgt = tgt.reshape(tgt.size(0), -1, 1)

        tgt = self.tgt_embed(tgt)
        output = self.decoder(tgt, encoder_output, src_mask)

        return self.projection_layer(output)

def build_transformer(seq_len: int, dim_model: int=512, N: int=6, h: int=8, dropout: float=0.1, dim_ff: int=2048, output_size: int=1) -> Transformer:
    """Assemble a Transformer from its parts and Xavier-initialise its weights.

    Args:
        seq_len: Number of positions the positional encodings are built for.
        dim_model: Feature size used throughout the model.
        N: Number of encoder layers and of decoder layers.
        h: Number of attention heads per attention block.
        dropout: Dropout probability used by every sub-module.
        dim_ff: Hidden size of the feed-forward blocks.
        output_size: Size of the projection layer's output axis.

    Returns:
        The assembled model, with every parameter of more than one dimension
        re-initialised by ``nn.init.xavier_uniform_``.
    """
    src_embed = InputEmbedding(dim_model)
    tgt_embed = InputEmbedding(dim_model)

    # Each layer gets its own attention and feed-forward modules (no sharing).
    encoder_blocks = [
        EncoderBlock(
            dim_model,
            MultiHeadAttention(dim_model, h, dropout),
            FeedForwardBlock(dim_model, dim_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]
    encoder = Encoder(nn.ModuleList(encoder_blocks), dim_model)

    decoder_blocks = [
        DecoderBlock(
            dim_model,
            MultiHeadAttention(dim_model, h, dropout),   # self-attention
            MultiHeadAttention(dim_model, h, dropout),   # cross-attention
            FeedForwardBlock(dim_model, dim_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]
    decoder = Decoder(nn.ModuleList(decoder_blocks), dim_model)

    src_pos = PositionalEncoding(dim_model, seq_len, dropout)
    tgt_pos = PositionalEncoding(dim_model, seq_len, dropout)

    projection_layer = ProjectionLayer(dim_model, output_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only matrices are re-initialised; 1-D parameters keep their constructor values.
    weight_matrices = (p for p in transformer.parameters() if p.dim() > 1)
    for weight in weight_matrices:
        nn.init.xavier_uniform_(weight)

    return transformer


In [None]:
def initialize_weights(m):
    """Re-initialise ``m`` in place when it is an ``nn.Linear``; ignore anything else.

    The weight is drawn with Xavier-uniform initialisation and the bias, when
    the layer has one, is filled with 0.01.
    """
    if not isinstance(m, nn.Linear):
        return

    torch.nn.init.xavier_uniform_(m.weight)
    if m.bias is not None:
        m.bias.data.fill_(0.01)

In [None]:
# Size the positional encodings from `lookback` rather than a second literal 50,
# so changing the window length cannot leave the model out of sync with the data.
model = build_transformer(seq_len=lookback).to(device)

batch_size = 16

# Only the training windows are shuffled; test batches keep their original order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

learning_rate = 0.000001
loss_function = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

test_loader

<torch.utils.data.dataloader.DataLoader at 0x7916569f8e50>

In [None]:
def generate_square_subsequent_mask(sz):
    """Return an additive ``(sz, sz)`` float mask for causal attention.

    Entries strictly above the main diagonal are ``-inf``; entries on and below
    the diagonal are ``0.0``.
    """
    blocked_everywhere = torch.full((sz, sz), float('-inf'))
    # triu(diagonal=1) keeps the strict upper triangle and zeroes everything else.
    return torch.triu(blocked_everywhere, diagonal=1)

# Example usage in the transformer model
def train_one_epoch():
    """Train the global ``model`` for one pass over ``train_loader``.

    Reads the notebook globals ``model``, ``train_loader``, ``device``,
    ``optimizer``, ``loss_function`` and ``epoch`` (used only for the header
    line). Prints the mean loss of every 100 batches.
    """
    model.train()
    print(f'Epoch: {epoch + 1}')
    running_loss = 0.0

    for batch_index, (x_batch, y_batch) in enumerate(train_loader):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        target = y_batch[:, 0]

        # The inputs are used as loaded. Re-normalising each batch with its own
        # statistics happened only in training (validation never did it) and
        # divides by zero whenever a time step is constant within the batch.

        # The decoder is seeded with a constant zero instead of the target
        # itself; feeding `target` as decoder input hands the model the answer.
        decoder_seed = torch.zeros_like(target)

        optimizer.zero_grad()

        # No source mask: every position of the window is already in the past,
        # so nothing needs hiding. The additive 0/-inf mask that used to be
        # passed is read by the attention as "0 means blocked", which masked
        # every allowed position and turned the loss into NaN.
        output = model(x_batch, None, decoder_seed)

        # The model returns one value per sample with extra singleton axes;
        # align it with the target so MSELoss does not broadcast the two
        # against each other.
        output = output.reshape(target.shape)

        loss = loss_function(output, target)

        if not torch.isfinite(loss):
            print("NaN loss detected")
            continue

        running_loss += loss.item()

        loss.backward()

        # clip_grad_norm_ returns the total gradient norm, so a single check
        # replaces scanning every parameter's gradient on every batch.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        if not torch.isfinite(grad_norm):
            # Stepping with non-finite gradients would corrupt the weights.
            print("Non-finite gradient norm detected; skipping optimizer step")
            continue

        optimizer.step()

        if batch_index % 100 == 99:  # report every 100 batches
            avg_loss_across_batches = running_loss / 100
            print('Batch {0}, Loss: {1:.3f}'.format(batch_index + 1, avg_loss_across_batches))
            running_loss = 0.0
    print()

def validate_one_epoch():
    """Evaluate the global ``model`` on ``test_loader`` and print the mean batch loss.

    Reads the notebook globals ``model``, ``test_loader``, ``device`` and
    ``loss_function``. No gradients are computed and no weights are updated.
    """
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            target = y_batch[:, 0]

            # Constant zero decoder input: passing the target itself would let
            # the model read the value it is being scored on.
            decoder_seed = torch.zeros_like(target)

            # No source mask: the additive 0/-inf mask used previously is read
            # by the attention as "0 means blocked" and produces NaN outputs.
            output = model(x_batch, None, decoder_seed)

            # Align the output with the target so MSELoss does not broadcast
            # the two against each other.
            output = output.reshape(target.shape)

            running_loss += loss_function(output, target).item()

    # Guard against an empty loader to avoid dividing by zero.
    if len(test_loader) > 0:
        avg_loss_across_batches = running_loss / len(test_loader)
        print('Val Loss: {0:.3f}'.format(avg_loss_across_batches))
    else:
        print("Warning: No batches processed in validation due to short target sequences.")
    print('***************************************************')
    print()

In [None]:
num_epochs = 50
for epoch in range(num_epochs):
    # `epoch` is read as a global by train_one_epoch for its header line.
    train_one_epoch()
    validate_one_epoch()

    # After each epoch, report every parameter tensor that contains a NaN
    nan_parameter_names = [
        name for name, param in model.named_parameters() if torch.isnan(param).any()
    ]
    for name in nan_parameter_names:
        print(f"NaN found in {name}")

Epoch: 1
NaN loss detected
NaN loss detected
NaN loss detected


  return F.mse_loss(input, target, reduction=self.reduction)


NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss detected
NaN loss d

KeyboardInterrupt: 

In [None]:
def _unscale_close(scaled_values):
    """Map scaled Close values back to prices using the fitted ``scaler``.

    The scaler was fitted on ``lookback + 1`` columns (Close plus its lags), so
    the values are placed in column 0 of a zero-filled array of that width and
    only that column is read back after the inverse transform.
    """
    dummies = np.zeros((len(scaled_values), lookback + 1))
    dummies[:, 0] = scaled_values
    return dc(scaler.inverse_transform(dummies)[:, 0])


# Predict in evaluation mode (dropout off), without gradients and in batches.
model.eval()
prediction_batches = []
with torch.no_grad():
    for x_batch in X_test.split(batch_size):
        x_batch = x_batch.to(device)
        # The model needs a mask argument and a decoder input: no mask, and a
        # constant zero per sample as the decoder input.
        decoder_seed = torch.zeros(x_batch.size(0), device=device)
        prediction_batches.append(model(x_batch, None, decoder_seed).reshape(-1).cpu())

test_predictions = _unscale_close(torch.cat(prediction_batches).numpy())
new_y_test = _unscale_close(y_test.detach().cpu().numpy().flatten())

plt.plot(new_y_test, label='Actual Close')
plt.plot(test_predictions, label='Predicted Close')
plt.xlabel('Day')
plt.ylabel('Close')
plt.legend()
plt.show()