# Library Imports & Project Setup

In [10]:
import os
import numpy as np
import pandas as pd
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [2]:
class Colours:
    HEADER = '\033[95m'     # Miscellaneous
    BLUE = '\033[94m'       # Hyperparameters
    GREEN = '\033[92m'      # Accuracies
    RED = '\033[91m'        # Loss
    ENDC = '\033[0m'        # End Colours
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [3]:
# Set up GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {Colours.HEADER}{device}{Colours.ENDC}")

Using device: [95mcuda[0m


In [4]:
# Set up Optuna verbosity to see detailed output for every trial
optuna.logging.set_verbosity(optuna.logging.INFO)

# Data Loading & Preprocessing

In [5]:
# Load the training and testing datasets
train_data = pd.read_csv('Google_Stock_Price_Train.csv')
test_data = pd.read_csv('Google_Stock_Price_Test.csv')

In [6]:
# Convert columns to numeric by removing commas explicitly
for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
    # Convert to string, remove commas, and convert to float
    train_data[col] = train_data[col].astype(str).str.replace(',', '').astype(float)
    test_data[col] = test_data[col].astype(str).str.replace(',', '').astype(float)

# Drop the 'Date' column for training, as it's not needed
train_data = train_data.drop(['Date'], axis=1)
test_data = test_data.drop(['Date'], axis=1)

# Normalize the features to bring them in the range [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)

# Convert scaled_train_data to DataFrame for easier handling
scaled_train_df = pd.DataFrame(scaled_train_data, columns=train_data.columns)

# Split the training data into training and validation sets
train_df, val_df = train_test_split(scaled_train_df, test_size=0.2, shuffle=False)

print("\nTraining Data Sample after Scaling:")
print(train_df.head())

print("\nValidation Data Sample after Scaling:")
print(val_df.head())


Training Data Sample after Scaling:
       Open      High       Low     Close    Volume
0  0.085814  0.096401  0.090449  0.237573  0.295258
1  0.097012  0.098344  0.098235  0.241514  0.229936
2  0.094334  0.092517  0.094086  0.228781  0.263612
3  0.091562  0.088819  0.088006  0.216419  0.216179
4  0.079842  0.076718  0.061070  0.178548  0.467797

Validation Data Sample after Scaling:
          Open      High       Low     Close    Volume
1006  0.862936  0.864381  0.860055  0.345410  0.130753
1007  0.869354  0.879209  0.874034  0.346430  0.077805
1008  0.838753  0.870207  0.855622  0.347863  0.077657
1009  0.839330  0.853997  0.836945  0.324118  0.118374
1010  0.841450  0.844156  0.825466  0.307691  0.097837


# Sequence Creation

In [9]:
# Function to create sequences using PyTorch tensors
def create_sequences_torch(data, sequence_length, device):
    X = []
    y = []

    for i in range(len(data) - sequence_length):
        X.append(data.iloc[i:i + sequence_length].values)   # Features over the sequence length
        y.append(data['Close'].iloc[i + sequence_length])   # Target value (Close price)

    # Convert lists to NumPy arrays and then to PyTorch tensors
    X = torch.tensor(np.array(X), dtype=torch.float32).to(device)
    y = torch.tensor(np.array(y), dtype=torch.float32).to(device)

    return X, y

# Define the sequence length for the training and validation sets
sequence_length = 30    # Using 30 days to predict the next day's price

# Create sequences for training and validation using the defined sequence length
X_train, y_train = create_sequences_torch(train_df, sequence_length, device)
X_val, y_val = create_sequences_torch(val_df, sequence_length, device)

# Use a smaller sequence length for the testing set (10 days)
test_sequence_length = 10
X_test, y_test = create_sequences_torch(pd.DataFrame(scaled_test_data, columns=train_data.columns), test_sequence_length, device)

# Display the shape of the created sequences
print(f"Training set: X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Validation set: X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"Testing set: X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Training set: X_train shape: torch.Size([976, 30, 5]), y_train shape: torch.Size([976])
Validation set: X_val shape: torch.Size([222, 30, 5]), y_val shape: torch.Size([222])
Testing set: X_test shape: torch.Size([10, 10, 5]), y_test shape: torch.Size([10])


# Recurrent Neural Network (RNN) Architecture Implementations

## Vanilla RNN

## Long Short-Term Memory (LSTM)

## Gated Recurrent Unit (GRU)

# Hyperparameter Tuning

## Vanilla RNN Tuning

## LSTM Tuning

## GRU Tuning