In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [3]:
DATA_DIR = Path(os.getcwd()).joinpath('data')
train_targets = pd.read_csv(DATA_DIR.joinpath('train.csv'))
train_targets

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
...,...,...,...
428927,126,32751,0.003461
428928,126,32753,0.003113
428929,126,32758,0.004070
428930,126,32763,0.003357


In [4]:
stock_ids = train_targets.stock_id.unique()

In [5]:
feature_means = np.load(DATA_DIR.joinpath('seq_features').joinpath('feature_means.npy'))
feature_stds = np.load(DATA_DIR.joinpath('seq_features').joinpath('feature_stds.npy'))

feature_means, feature_stds

(array([2.99500000e+02, 1.00641095e+04, 1.90405617e+02, 6.63335529e+00,
        8.36687157e+02, 1.00000680e+00, 1.00000553e+00, 2.72614582e-03,
        2.17380546e-03, 2.88276171e-03, 2.29133757e-03, 2.79193802e-03,
        2.22625789e-03, 2.95122026e-03, 2.34570845e-03, 2.35567682e-03]),
 array([1.73204878e+02, 1.31414606e+04, 3.79309559e+02, 7.66123567e+00,
        6.11827585e+03, 3.68989322e-03, 3.70745216e-03, 2.67012408e-03,
        2.11603016e-03, 2.87719295e-03, 2.27091025e-03, 2.34281279e-03,
        1.85399888e-03, 2.53002282e-03, 1.99385344e-03, 2.26015967e-03]))

In [7]:
from tqdm.notebook import tqdm

In [8]:
data = None
device = torch.device('cpu')
for stock_id in tqdm(stock_ids):
    stock_df = pd.read_parquet(DATA_DIR.joinpath('seq_features').joinpath(f'stock_{stock_id}_seq.parquet'))

    stock_array = (
        stock_df
        .sort_values(['time_id', 'seconds_in_bucket'])
        .drop(columns=['time_id', 'stock_id'])
        .to_numpy()
    )
    stock_array_scaled = ((stock_array - feature_means) / feature_stds).astype(np.float32)

    stock_tensor = torch.tensor(stock_array_scaled.reshape((-1, 600, 16))).to(device)
    if data is None:
        data = stock_tensor
    else:
        data = torch.cat([data, stock_tensor], dim=0).to(device)

  0%|          | 0/112 [00:00<?, ?it/s]

In [9]:
# Calculate memory usage in bytes
memory_bytes = data.element_size() * data.nelement()

# Convert to megabytes (MB) and gigabytes (GB)
memory_mb = memory_bytes / (1024 ** 2)
memory_gb = memory_bytes / (1024 ** 3)

print(f"Memory usage of tensor: {memory_bytes} bytes")
print(f"Memory usage of tensor: {memory_mb:.2f} MB")
print(f"Memory usage of tensor: {memory_gb:.2f} GB")

Memory usage of tensor: 16470988800 bytes
Memory usage of tensor: 15707.96 MB
Memory usage of tensor: 15.34 GB


In [10]:
from torch.utils.data import DataLoader, SubsetRandomSampler, Dataset
import numpy as np
import torch.nn as nn


In [11]:
class StockDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data  # Features tensor
        self.targets = targets  # Targets tensor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]  # Return a tuple (features, target)

In [12]:
dataset = StockDataset(data, torch.tensor(train_targets['target']))

In [13]:
seed = 42
np.random.seed(seed)

num_samples = data.shape[0]
indices = np.random.permutation(num_samples)
split_idx = int(num_samples * 0.8)
train_indices, val_indices = indices[:split_idx], indices[split_idx:]

# Create DataLoaders using SubsetRandomSampler
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

batch_size = 64
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

In [14]:
class LSTMModel(nn.Module):
    def __init__(self, num_features=16, hidden_size=64, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # x shape: (batch_size, sequence_length, num_features)
        out, _ = self.lstm(x)
        # Use the last hidden state
        out = out[:, -1, :]
        out = self.fc(out)
        return out.squeeze()

In [15]:
device = torch.device('cpu')

In [16]:
def rmspe(y_true, y_pred):
    # Adding a small epsilon to y_true to prevent division by zero
    epsilon = 1e-8
    return torch.sqrt(torch.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [17]:
torch.cuda.empty_cache()

In [None]:
device = torch.device('cpu')
num_epochs = 4

model = LSTMModel(num_features=16).to(device)
criterion = rmspe
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, targets in train_loader:
        inputs = torch.nan_to_num(inputs, nan=0.0).to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * inputs.size(0)
    
    # Validation Phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = torch.nan_to_num(inputs, nan=0.0).to(device)
            targets = targets.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
    
    # Compute average losses
    train_loss = train_loss / len(train_loader.dataset)
    val_loss = val_loss / len(val_loader.dataset)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')


Epoch 1/4, Training Loss: 0.8024, Validation Loss: 0.1979
