In [None]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import math
import random

from torchsummary import summary
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset
from torch.optim.lr_scheduler import LambdaLR
from torch import nn, optim, tensor, Tensor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Single source of truth for the seed so every RNG used by the notebook
# (Python's `random`, PyTorch and NumPy's legacy global RNG) stays in sync.
SEED = 52

random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
class CandlesDataset(Dataset):
    """Sliding-window dataset of 5-minute candles read from ';'-separated CSV files.

    Every sample is a block of ``window_size + 1`` consecutive 5-minute candles
    with 8 columns: open, close, high, low, volume, hour, day_of_week, minute.
    The first ``window_size`` rows are the model input, the last row is used to
    derive the binary target (see ``__getitem__``).

    Args:
        root_dir: directory whose immediate sub-directories end with a
            4-digit year (e.g. ``something_2020``) and contain the CSV files.
        years: years (ints) whose sub-directories should be read.
        window_size: number of input candles per sample.
        max_samples: soft cap; reading stops after the first table that pushes
            the number of collected windows above this value (the result is
            not truncated, so it may exceed the cap).
        device: stored on the instance; not otherwise used by this class.
    """

    def __init__(self, root_dir: str, years: list, window_size: int, max_samples: int, device):
        self.device = device
        self.window_size = window_size
        self.tables_list = self.__get_correct_tables(root_dir, years)
        self.samples = self.__generate_samples(self.tables_list, max_samples)
        self.scaled_samples = self.__scale_samples()

    def __get_correct_tables(self, root_dir, years):
        """
        Get the paths of the .csv files with more than window_size lines.

        Only sub-directories of ``root_dir`` whose name ends with a year from
        ``years`` are searched (recursively). The returned list is shuffled
        with the global ``random`` RNG.
        """
        # Only the immediate sub-directories of root_dir are candidate year folders.
        _, year_dirs, _ = next(os.walk(root_dir), (root_dir, [], []))

        tables_list = []
        # Sorted traversal makes the seeded shuffle below reproducible regardless
        # of the order in which the filesystem lists entries.
        for table_dir in sorted(year_dirs):
            try:
                year = int(table_dir[-4:])
            except ValueError:
                # Folder name does not end with a year: not a data folder.
                continue
            if year not in years:
                continue
            full_dir = os.path.join(root_dir, table_dir)
            for current_dir, sub_dirs, files in os.walk(full_dir):
                sub_dirs.sort()  # deterministic descent order
                for file in sorted(files):
                    if not file.lower().endswith('.csv'):
                        continue
                    # Join with the directory currently being walked so files in
                    # nested folders resolve to a path that actually exists.
                    full_path = os.path.join(current_dir, file)
                    with open(full_path) as f:
                        if sum(1 for _ in f) > self.window_size:
                            tables_list.append(full_path)
        random.shuffle(tables_list)
        return tables_list

    def __generate_samples(self, tables_list, max_samples):
        """
        Build the raw (unscaled) windows tensor from the given CSV files.

        Returns a float32 tensor of shape (n_blocks, window_size + 1, 8).
        """
        window_len = self.window_size + 1  # input candles + 1 target candle
        chunks = []
        total_windows = 0
        for table in tqdm(tables_list, desc="tables done"):
            df = pd.read_csv(
                table,
                sep=";",
                names=["figi", "utc", "open", "close", "high", "low", "volume"],
                index_col=False
            )
            df['utc'] = pd.to_datetime(df['utc'], utc=True)

            # fill missing candles: zero volume, prices carried forward
            df = df.set_index('utc').resample('min').asfreq()
            df['volume'] = df['volume'].fillna(0)
            for col in ['figi', 'open', 'close', 'high', 'low']:
                df[col] = df[col].ffill()

            df = df.drop(columns=['figi'])

            df = df.resample('5min').agg({  # form 5 minutes candles
                'open': 'first',
                'close': 'last',
                'high': 'max',
                'low': 'min',
                'volume': 'sum'
            })

            # add time data
            df['hour'] = df.index.hour
            df['day_of_week'] = df.index.day_of_week
            df['minute'] = df.index.minute
            df = df[(df['day_of_week'] < 5)]  # drop non-tradable days (Sat/Sun)

            data = df.values
            # The line-count filter is applied to the raw file; after 5-minute
            # resampling and weekend removal a table can be too short for even
            # one window, which would make sliding_window_view raise.
            if data.shape[0] < window_len:
                continue
            windows = np.lib.stride_tricks.sliding_window_view(
                data, (window_len, data.shape[1])
            )  # (n_blocks, 1, window_size + 1, n_features)
            chunk = torch.tensor(windows, dtype=torch.float32).squeeze(dim=1)
            chunks.append(chunk)
            total_windows += chunk.shape[0]
            if total_windows > max_samples:
                break

        if not chunks:
            return torch.empty((0, window_len, 8))
        # One concatenation at the end avoids re-copying all previous samples
        # on every iteration.
        return torch.cat(chunks, dim=0)

    def __scale_samples(self):
        """
        Standardise the 5 candle columns per sample and rescale the time columns.

        Mean/std are computed over the first ``window_size - 1`` rows of each
        sample. Samples where any candle column has std <= 1e-7 over those rows
        are removed from ``self.samples``. The per-sample statistics are stored
        in ``self.mean`` / ``self.std`` with shape (n, 1, 5).
        """
        # NOTE(review): statistics skip the last input candle (row window_size - 1)
        # as well as the target row - confirm this is intended.
        stat_rows = self.window_size - 1
        epsilon = 1e-7

        stds = self.samples[:, :stat_rows, :-3].std(dim=1, keepdim=True)
        valid_mask = (stds.squeeze(dim=1) > 1e-7).all(dim=1)
        self.samples = self.samples[valid_mask]

        std = stds[valid_mask]
        mean = self.samples[:, :stat_rows, :-3].mean(dim=1, keepdim=True)

        candles = (self.samples[:, :, :-3] - mean) / (std + epsilon)
        # hour / 23, day_of_week / 4, minute / 59
        time_part = self.samples[:, :, -3:] / torch.tensor([23.0, 4.0, 59.0])
        normalized_data = torch.cat([candles, time_part], dim=-1)

        self.std = std
        self.mean = mean
        return normalized_data

    def __len__(self):
        return len(self.scaled_samples)

    def __getitem__(self, idx):
        """
        Return (feature, target, std, mean) for sample ``idx``.

        feature: scaled window without its last row, shape (window_size, 8).
        target: 1.0 when close / open of the last (unscaled) candle is above
            1.001, else 0.0.
        std, mean: the statistics used to scale this sample, shape (1, 5).
        """
        feature = self.scaled_samples[idx][:-1, :]
        last_candle = self.samples[idx][-1, :5]  # open, close, high, low, volume
        # close > open by more than the commission
        target = ((last_candle[1] / last_candle[0]) > 1.001).float()
        return feature, target, self.std[idx], self.mean[idx]

In [None]:
import json

# Read all run settings from the JSON file next to the notebook.
with open('model_config.json', 'r') as config_file:
    config = json.load(config_file)

# Model architecture.
heads, encoder_layers, d_model = (
    config[key] for key in ('heads', 'encoder_layers', 'd_model')
)

# Windowing, batching and data location.
window_size, batch_size = config['window_size'], config['batch_size']
root_dir = config['data_dir']
max_samples_train, max_samples_val = config['max_samples_train'], config['max_samples_val']
years_train, years_val = config['years_train'], config['years_val']

# Training length and checkpointing.
max_epoch = config['max_epoch']
model_dir, is_preload = config['model_path'], config['is_preload']

# Settings that are not part of the JSON file.
num_workers = 4
model_path = os.path.join(model_dir, 'best.tar')  # checkpoint used when is_preload is set

In [None]:
# Arguments shared by the train and validation datasets; only the years and
# the sample cap differ between the two splits.
shared_dataset_kwargs = dict(root_dir=root_dir, window_size=window_size, device=device)

train_dataset = CandlesDataset(
    years=years_train,
    max_samples=max_samples_train,
    **shared_dataset_kwargs,
)

val_dataset = CandlesDataset(
    years=years_val,
    max_samples=max_samples_val,
    **shared_dataset_kwargs,
)

In [None]:
# Both loaders share batch size and worker count; only training is shuffled.
loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

In [None]:
class TimeEncoder(nn.Module):
    """Embed candle and time features into ``d_model`` dimensions.

    The input's last dimension is split into the first ``candles_features``
    columns (linearly projected) and the remaining columns (passed through a
    small MLP). Both embeddings are summed with a learnable positional table.

    Args:
        candles_features: number of leading candle columns in the input.
        time_features: number of trailing time columns in the input.
        d_model: embedding size.
        max_len: number of positions in the learnable positional table, i.e.
            the longest sequence that ``forward`` accepts.
    """

    def __init__(self, candles_features: int, time_features: int, d_model: int, max_len: int = 5000):
        super().__init__()
        # Remember the split point so forward() agrees with the layer sizes.
        self.candles_features = candles_features
        self.input_proj = nn.Linear(candles_features, d_model)
        self.time2vec = nn.Sequential(
            nn.Linear(time_features, 32),
            nn.GELU(),
            nn.Linear(32, d_model)
        )
        self.learnable_pe = nn.Parameter(torch.randn(1, max_len, d_model))

    def forward(self, x):
        """Map x of shape (batch, seq, candles_features + time_features) to (batch, seq, d_model)."""
        seq_len = x.size(1)
        max_len = self.learnable_pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {max_len}"
            )
        candles = x[:, :, :self.candles_features]
        time_features = x[:, :, self.candles_features:]
        t_emb = self.time2vec(time_features)
        candles_emb = self.input_proj(candles)
        return candles_emb + self.learnable_pe[:, :seq_len, :] + t_emb

In [None]:
class Transformer(nn.Module):
    """Stack of batch-first Transformer encoder layers followed by mean pooling over dim 1."""

    def __init__(self, d_model: int, nhead: int, encoder_layers: int):
        super().__init__()
        # Prototype layer; nn.TransformerEncoder replicates it `encoder_layers` times.
        layer = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=encoder_layers)

    def forward(self, x):
        """Encode x and average the result over dim 1."""
        encoded = self.encoder(x)
        pooled = encoded.mean(dim=1)
        return pooled

In [None]:
class CandleTransformer(nn.Module):
    """Binary classifier: TimeEncoder embedding -> Transformer pooling -> MLP head.

    ``forward`` returns a pair ``(logits, probabilities)`` where the second
    element is the sigmoid of the first.
    """

    def __init__(self, heads: int, encoder_layers: int, d_model: int):
        super().__init__()
        # 5 candle columns + 3 time columns per time step.
        self.time_enc = TimeEncoder(candles_features=5, time_features=3, d_model=d_model)
        self.transformer = Transformer(d_model=d_model, nhead=heads, encoder_layers=encoder_layers)
        head_layers = [
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.out = nn.Sequential(*head_layers)

    def forward(self, x):
        embedded = self.time_enc(x)
        pooled = self.transformer(embedded)
        logits = self.out(pooled)
        probabilities = torch.sigmoid(logits)
        return logits, probabilities

In [None]:
# Positive-class weight for the loss: (#negative samples) / (#positive samples)
# over the training set. Element 1 of every dataset item is the 0/1 target.
targets = [int(sample[1]) for sample in tqdm(train_dataset)]
n_positive = sum(targets)
n_negative = len(targets) - n_positive
if n_positive == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError.
    raise ValueError(
        "train_dataset contains no positive targets; cannot compute the positive class weight"
    )
weight = torch.tensor(n_negative / n_positive).float()

In [None]:
# Model, loss, optimizer and learning-rate schedule.
model = CandleTransformer(
    heads=heads,
    encoder_layers=encoder_layers,
    d_model=d_model,
    ).to(device=device)

# pos_weight must be a 1-element tensor on the same device as the logits;
# as_tensor accepts `weight` whether it is a Python float or a 0-dim tensor.
weights = torch.as_tensor(weight, dtype=torch.float32).reshape(1).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=100,
    T_mult=2,
    eta_min=1e-6
)
# NOTE: clip_grad_norm_ is intentionally not called here. It rescales the
# gradients that exist at call time, so invoking it once before any backward
# pass has no effect; it has to run between loss.backward() and
# optimizer.step() on every training step.


In [None]:
# Training / validation loop with per-epoch checkpoints.
train_losses = []
val_losses = []
all_f1 = []
all_accuracy = []
current_epoch = 0

if is_preload:
    # map_location lets a checkpoint written on GPU be resumed on CPU (and vice versa).
    checkpoint = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Older checkpoints do not carry the scheduler state.
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    current_epoch = checkpoint['epoch']
    train_losses = checkpoint['train_losses']
    val_losses = checkpoint['val_losses']
    all_f1 = checkpoint['all_f1']
    all_accuracy = checkpoint['all_accuracy']
    print('Preload model')

os.makedirs(model_dir, exist_ok=True)

for epoch in range(current_epoch, max_epoch):
    # ---- training ----
    model.train()
    running_loss = 0

    for features, targets, _, _ in tqdm(train_loader, desc=f"Train Epoch {epoch + 1}"):
        features, targets = features.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs, outputs_proba = model(features)
        loss = criterion(outputs.squeeze(dim=1), targets)
        loss.backward()
        # Clip after backward() and before step(): this is the only place
        # where clipping actually affects the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        running_loss += loss.item() * features.size(0)
    train_loss = running_loss / len(train_dataset)

    # ---- validation ----
    model.eval()
    val_running_loss = 0
    all_targets = []
    all_preds = []
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for features, targets, _, _ in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}"):
            features, targets = features.to(device), targets.to(device)
            outputs, outputs_proba = model(features)
            val_running_loss += criterion(outputs.squeeze(dim=1), targets).item() * features.size(0)
            all_preds.append(outputs_proba.cpu())
            all_targets.append(targets.cpu())
    all_preds = (torch.cat(all_preds).squeeze(dim=1) >= 0.5).int().numpy()
    all_targets = torch.cat(all_targets).int().numpy()
    val_loss = val_running_loss / len(val_dataset)

    # sklearn metrics take (y_true, y_pred); plain floats keep the history
    # lists loadable with torch.load(..., weights_only=True).
    f1 = float(f1_score(all_targets, all_preds))
    accuracy = float(accuracy_score(all_targets, all_preds))

    # Decide "best so far" against the previous epochs before this epoch's
    # loss is appended to the history.
    is_best = len(val_losses) == 0 or val_loss < min(val_losses)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    all_f1.append(f1)
    all_accuracy.append(accuracy)

    # ---- checkpointing ----
    # 'epoch' is the index of the next epoch to run, and the histories already
    # include the epoch that just finished, so resuming neither repeats a
    # completed epoch nor loses its metrics.
    checkpoint_state = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses,
        'all_f1': all_f1,
        'all_accuracy': all_accuracy
    }
    torch.save(checkpoint_state, os.path.join(model_dir, f'baseline_{epoch}.tar'))
    if is_best:
        torch.save(checkpoint_state, os.path.join(model_dir, 'best.tar'))

    current_lr = optimizer.param_groups[0]['lr']
    print(f'LR: {current_lr:.4e}')
    print(f"Epoch {epoch+1}/{max_epoch}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    print(f"Val f1: {f1:.4f}")
    print(f"Val accuracy: {accuracy:.4f}\n")

In [None]:
# Validation loss per epoch, one marker per epoch.
fig, ax = plt.subplots()
ax.plot(val_losses, '-o')
ax.set_xlabel('Epoch')
ax.set_ylabel('BCELoss')
plt.show()

In [None]:
# Lowest validation loss recorded in `val_losses`.
best_val_loss = min(val_losses)
print('Min validation loss: ', best_val_loss)

In [None]:
# Print target vs. predicted class for a few randomly drawn validation samples.
max_counter = 10
dataloader = DataLoader(val_dataset, batch_size=1, num_workers=num_workers, shuffle=True)

model.eval()  # inference only
counter = 0
with torch.no_grad():
    for features, targets, std, mean in dataloader:
        features, targets = features.to(device), targets.to(device)
        # The model returns (logits, probabilities); the 0.5 threshold is only
        # meaningful on the probabilities, not on the raw logits.
        _, outputs_proba = model(features)
        print("Target: ", targets.cpu())
        print("Output: ", (outputs_proba >= 0.5).float().cpu())
        print()
        counter += 1
        if counter == max_counter:
            break