In [206]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [207]:
import pandas as pd
import numpy as np
import os
import csv

from torchsummary import summary
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset
from torch import nn, optim, tensor, Tensor
from sklearn.preprocessing import RobustScaler, StandardScaler

In [208]:
class CandelsDataset(Dataset):
    def __init__(self, root_dir: str, years: list, window_size: int, max_samples: int, device):
        self.device = device
        self.window_size = window_size
        self.tables_list = self.__get_correct_tables(root_dir, years)
        self.samples = self.__generate_samples(self.tables_list, max_samples)

    def __get_correct_tables(self, root_dir, years):
        """
        get paths of .csv with len > window_size 
        """
        tables_list = []
        for root, dirs, files in os.walk(root_dir):
            if dirs:
                tables_dirs = dirs
                break
        for table_dir in tables_dirs:
            if int(table_dir[-4:]) in years:
                full_dir = os.path.join(root_dir, table_dir)
                for root, dirs, files in os.walk(full_dir):
                    for file in files:
                        if file.lower().endswith('.csv'):
                            full_path = os.path.join(full_dir, file)
                            with open(full_path) as f:
                                if sum(1 for line in f) > self.window_size:
                                    tables_list.append(full_path)
        return tables_list

    def __generate_samples(self, tables_list, max_samples):
        samples = torch.empty((0, self.window_size + 1, 8)) #(n_blocks, window_size, n_features)
        for table in tqdm(tables_list, desc="tables done"):
            df = pd.read_csv(
                table,
                sep =";",
                names=["figi", "utc", "open", "close", "high", "low", "volume"],
                index_col=False
                )
            df['utc'] = pd.to_datetime(df['utc'], utc=True)

            #fill missing candles
            df = df.set_index('utc').resample('min').asfreq()
            df['volume'] = df['volume'].fillna(0)
            for col in ['figi', 'open', 'high', 'low', 'close']:
                df[col] = df[col].ffill()

            #add time data
            df['hour'] = df.index.hour
            df['day_of_week'] = df.index.day_of_week
            df['minute'] = df.index.minute
            df.drop(labels=['figi'], axis=1, inplace=True)

            df = df[(df['day_of_week'] < 5)] #drop not tradeble days

            data = df.values
            if data.shape[0] == 0:
                continue
            scaler = RobustScaler()
            data = scaler.fit_transform(data)
            windows = np.lib.stride_tricks.sliding_window_view(
                data, (self.window_size + 1, data.shape[1])
            )  #(n_blocks, window_size, n_features)
            tensor = torch.tensor(windows, dtype=torch.float32).squeeze(dim=1)
            samples = torch.vstack([samples, tensor])
            if samples.shape[0] > max_samples:
                break
        return samples


    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx].to(device)
        return sample[:-1, :5], sample[:-1, 5:], sample[-1:, :5].squeeze(dim=0) #OHLCV, time features, target

In [209]:
class TimeEncoder(nn.Module):
    def __init__(self, candles_features, time_features, d_model):
        super().__init__()
        self.input_proj = nn.Linear(candles_features, d_model)
        self.time2vec = nn.Sequential(
            nn.Linear(time_features, 128),
            nn.GELU(),
            nn.Linear(128, d_model)
        )
        self.learnable_pe = nn.Parameter(torch.randn(1, 5000, d_model))
        
    def forward(self, x, time_features):
        t_emb = self.time2vec(time_features)
        x = self.input_proj(x)
        x = x + self.learnable_pe[:, :x.size(1), :] + t_emb
        return x

In [210]:
class Transformer(nn.Module):
    def __init__(self, d_model=128, nhead=4):
        super().__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers=4
        )

    def forward(self, x):
        x = self.encoder(x)
        return x

In [211]:
class CandleTransformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_enc = TimeEncoder(candles_features=5, time_features=3, d_model=1024)
        self.transformer = Transformer(d_model=1024, nhead=8)
        self.out = nn.Linear(1024, 5)

    def forward(self, prices, time_feats):
        """
        # prices: [B, 180, 5] (OHLCV)
        # indicators: [B, 180, 4]
        # time_feats: [B, 180, 3]
        """
        time_encoded = self.time_enc(prices, time_feats)
        trans_out = self.transformer(time_encoded)
        trans_out = trans_out.mean(dim=1)
        out = self.out(trans_out)
        return out

In [212]:
def direction_accuracy(pred_close, true_close):
    direction_pred = torch.sign(pred_close[:, -1] - pred_close[:, 0])
    direction_true = torch.sign(true_close[:, -1] - true_close[:, 0])
    return (direction_pred == direction_true).float().mean()

In [213]:
batch_size = 64
num_workers = 1
window_size = 180
root_dir = "market_data/unzip_data"
max_epoch = 20

In [214]:
train_dataset = CandelsDataset(
    root_dir=root_dir, 
    years=[2023], 
    window_size=window_size,
    max_samples=1000, 
    device=device
    )

val_dataset = CandelsDataset(
    root_dir=root_dir, 
    years=[2024], 
    window_size=window_size,
    max_samples=200, 
    device=device
    )

tables done:   0%|          | 3/1444 [00:00<00:18, 76.85it/s]
tables done:   0%|          | 0/1448 [00:00<?, ?it/s]


In [215]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=True)

In [216]:
model = CandleTransformer().to(device=device)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [205]:
for epoch in range(max_epoch):
        model.train()
        train_loss = 0
        train_metrics = {"dir_accuracy": []}
        test_metrics = {"dir_accuracy": []}
        
        for candles, time, targets in tqdm(train_loader, desc=f"Train Epoch {epoch + 1}"):
            optimizer.zero_grad()
            outputs = model(candles, time)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * candles.size(0)
        
        model.eval()
        val_loss = 0
        all_targets = []
        all_preds = []
        with torch.no_grad():
            for candles, time, targets in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}"):
                outputs = model(candles, time)
                val_loss += criterion(outputs, targets).item() * candles.size(0)
                all_targets.extend(targets.cpu().numpy())
                all_preds.extend(outputs.cpu().numpy())
        dir_accuracy = direction_accuracy(outputs, targets)
        train_metrics["dir_accuracy"].append(dir_accuracy)
        
        print(f"Epoch {epoch+1}/{max_epoch}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"Val Loss: {val_loss/len(val_loader):.4f}\n")
        print(f"Val direction Accuracy: {dir_accuracy:.4f}\n")
        print(train_metrics)

Train Epoch 1:   0%|          | 0/24 [00:00<?, ?it/s]

torch.Size([64, 5]) torch.Size([64, 5])


Train Epoch 1:   4%|▍         | 1/24 [00:08<03:26,  8.97s/it]

In [29]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)

In [31]:
for candles, time, targets in tqdm(train_loader, desc=f"Train Epoch {epoch + 1}"):
    optimizer.zero_grad()
    outputs = model(candles, time)
    loss = criterion(outputs, targets)
    print(outputs)
    print(targets)
    break

Train Epoch 20:   0%|          | 0/77530 [00:00<?, ?it/s]

tensor([[-0.0019,  0.0037, -0.0104, -0.0165,  0.9901]],
       grad_fn=<AddmmBackward0>)
tensor([[[ 0.1463,  0.1463,  0.0800,  0.2000, -0.2192]]])



