In [1]:
import os
from pathlib import Path

from torch.utils.data import DataLoader, SubsetRandomSampler, Dataset
import numpy as np
import torch.nn as nn

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

In [2]:
class LSTMModel(nn.Module):
    def __init__(self, num_features=16, hidden_size=64, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # x shape: (batch_size, sequence_length, num_features)
        out, _ = self.lstm(x)
        # Use the last hidden state
        out = out[:, -1, :]
        out = self.fc(out)
        return out.squeeze()

In [3]:
DATA_DIR = Path(os.getcwd()).joinpath('data')
seq_features = pd.read_parquet(DATA_DIR.joinpath('train_seq_features.parquet'))

In [16]:
train_targets = pd.read_csv(DATA_DIR.joinpath('train.csv'))
train_targets

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
...,...,...,...
428927,126,32751,0.003461
428928,126,32753,0.003113
428929,126,32758,0.004070
428930,126,32763,0.003357


In [None]:
from sklearn.preprocessing import StandardScaler

features_target = torch.tensor(train_targets[train_targets['stock_id'].isin([0, 1, 2, 3, 4])]['target'].to_numpy())
features_array = (
    seq_features
    .sort_values(['time_id', 'seconds_in_bucket'])
    .drop(columns=['time_id', 'stock_id'])
    .to_numpy()
)

scaler = StandardScaler()
features_array = scaler.fit_transform(features_array)
features_tensor = torch.tensor(features_array.reshape((-1, 600, 16)))

In [25]:
features_tensor.shape

torch.Size([19150, 600, 16])

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_valid = train_test_split(features_tensor, test_size=0.2)

In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(features_tensor, features_target, test_size=0.2)

In [51]:
def rmspe(y_true, y_pred):
    # Adding a small epsilon to y_true to prevent division by zero
    epsilon = 1e-8
    return torch.sqrt(torch.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

batch_size = 64

class StockDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.to(torch.float32)
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(StockDataset(X_train, y_train), batch_size=batch_size)
val_loader = DataLoader(StockDataset(X_valid, y_valid), batch_size=batch_size)

In [53]:
device = torch.device('cpu')
num_epochs = 3

model = LSTMModel(num_features=16).to(device)
criterion = rmspe
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, targets in train_loader:
        inputs = torch.nan_to_num(inputs, nan=0.0).to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * inputs.size(0)
    
    # Validation Phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = torch.nan_to_num(inputs, nan=0.0).to(device)
            targets = targets.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
    
    # Compute average losses
    train_loss = train_loss / len(train_loader.dataset)
    val_loss = val_loss / len(val_loader.dataset)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')


Epoch 1/3, Training Loss: 1.0032, Validation Loss: 1.0009
Epoch 2/3, Training Loss: 1.0008, Validation Loss: 1.0008
Epoch 3/3, Training Loss: 1.0007, Validation Loss: 1.0007
