In [1]:
!nvidia-smi

Mon Jul 21 05:37:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   69C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install gdown



In [4]:
!if [ ! -e "stocks" ]; then \
  gdown --id '1AzIra9nVKVnXEg4RDi6Dtqtr9nZergzH' --output "stocks.zip" ;\
  unzip -q "stocks.zip" ;\
fi

!rm stocks.zip

Downloading...
From (original): https://drive.google.com/uc?id=1AzIra9nVKVnXEg4RDi6Dtqtr9nZergzH
From (redirected): https://drive.google.com/uc?id=1AzIra9nVKVnXEg4RDi6Dtqtr9nZergzH&confirm=t&uuid=5a7ed48f-770d-450a-84e6-3cd236f6ed42
To: /content/stocks.zip
100% 52.4M/52.4M [00:00<00:00, 70.1MB/s]


In [5]:
# 5 x 5

# !if [ ! -e "stock_dataset" ]; then \
#   gdown --id '1XtdsevhmWB_jAGR89QxupIXdlDDdPlt_' --output "stock_dataset.zip" ;\
#   unzip -q "stock_dataset.zip" ;\
# fi

# !rm stock_dataset.zip

In [6]:
# 20 x 6

# !if [ ! -e "stock_dataset" ]; then \
#   gdown --id '175wsxX4Wad3otXWHPnFuVlnvweeffP9N' --output "stock_dataset.zip" ;\
#   unzip -q "stock_dataset.zip" ;\
# fi

# !rm stock_dataset.zip

In [7]:
# 20 x 7

!if [ ! -e "stock_dataset" ]; then \
  gdown --id '1oEBl2rVCSKP0Qg13YhWHAfG5ZAdkYZnr' --output "stock_dataset.zip" ;\
  unzip -q "stock_dataset.zip" ;\
fi

!rm stock_dataset.zip

Downloading...
From (original): https://drive.google.com/uc?id=1oEBl2rVCSKP0Qg13YhWHAfG5ZAdkYZnr
From (redirected): https://drive.google.com/uc?id=1oEBl2rVCSKP0Qg13YhWHAfG5ZAdkYZnr&confirm=t&uuid=ff64373b-393e-494d-9367-5cdd9f4f5282
To: /content/stock_dataset.zip
100% 891M/891M [00:11<00:00, 76.5MB/s]


In [8]:
!if [ ! -e "stock_num.txt" ]; then \
  gdown --id '1uVUmGb5TG1XNPKfiipZZ8CZZvHkiz33_' --output "stock_num.txt" ;\
fi

Downloading...
From: https://drive.google.com/uc?id=1uVUmGb5TG1XNPKfiipZZ8CZZvHkiz33_
To: /content/stock_num.txt
100% 12.0k/12.0k [00:00<00:00, 29.0MB/s]


In [9]:
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset, Subset, SubsetRandomSampler

from transformers import get_scheduler
from accelerate import Accelerator, DistributedDataParallelKwargs, InitProcessGroupKwargs

from datetime import timedelta

# KFold
# KFOLD Reference: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-pytorch.md
from sklearn.model_selection import KFold

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

In [10]:
stock_num = list()
with open('stock_num.txt') as f:
    lines = f.readlines()
    for line in lines:
        stock_num.append(line.strip())

In [11]:
stock_num_mod = []
for i in range(len(stock_num)):
    path = 'stocks/' + stock_num[i] + '.csv'
    try:
        stock = pd.read_csv(path)
        if len(stock) >= 500:
            stock_num_mod.append(stock_num[i])
    except:
        print(path, " is empty and has been skipped.")

print(len(stock_num_mod))

1954


In [12]:
def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds

In [13]:
import h5py

In [14]:
class My_Model(nn.Module):
    def __init__(self, feature_dim=5, d_model=128, nhead=8, num_layers=4, dropout=0.1):
        super(My_Model, self).__init__()
        self.embedding = nn.Linear(feature_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # Binary classification
        )

    def forward(self, x):
        # x: [batch_size, seq_len, feature_dim]
        x = self.embedding(x)               # [batch_size, seq_len, d_model]
        x = self.positional_encoding(x)     # Add positional encoding
        x = self.transformer_encoder(x)     # [batch_size, seq_len, d_model]
        x = x.mean(dim=1)                   # Mean pooling over time
        logits = self.classifier(x)         # [batch_size, 1]
        return torch.sigmoid(logits)        # Output in [0, 1] range

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)  # even index
        pe[:, 1::2] = torch.cos(position * div_term)  # odd index
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [15]:
class StockDataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(np.asarray(y))
        self.x = torch.FloatTensor(np.asarray(x))

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

In [16]:
class HDF5Dataset(Dataset):
    def __init__(self, filepath):
        self.file = h5py.File(filepath, 'r')
        self.data = self.file['data']

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # You can convert to torch.Tensor if needed
        return torch.tensor(self.data[idx], dtype=torch.float32)

In [17]:
X_train = h5py.File('stock_dataset/train_dataset.h5', 'r')['data']
Y_train = h5py.File('stock_dataset/train_groud_truth.h5', 'r')['data']
X_dev = h5py.File('stock_dataset/val_dataset.h5', 'r')['data']
Y_dev = h5py.File('stock_dataset/val_groud_truth.h5', 'r')['data']
X_test = h5py.File('stock_dataset/test_dataset.h5', 'r')['data']

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 42,      # Your seed number, you can pick your lucky number. :)
    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 2000,     # Number of epochs.
    'batch_size': 64,
    'warmup_steps': 2000,
    'lr_schedule': 'cosine',
    'learning_rate': 2e-4,
    'weight_decay': 1e-6,
    'betas': (0.9, 0.99),
    'early_stop': 200,    # If model has not improved for this many consecutive epochs, stop training.
    'clip_grad_norm': 0.1,
    'k_folds': 5,
    # 'save_path': './models/model.ckpt'  # Your model will be saved here.
    'save_path': './models/'
}

In [19]:
# Set seed for reproducibility
same_seed(config['seed'])
#train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {X_train.shape}
valid_data size: {X_dev.shape}
test_data size: {X_test.shape}""")

# Select features
#x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {X_train.shape[1:]}')

train_dataset, valid_dataset, test_dataset = StockDataset(X_train, Y_train), \
                                            StockDataset(X_dev, Y_dev), \
                                            StockDataset(X_test)

# For KFold
dataset = ConcatDataset([train_dataset, valid_dataset])

# Pytorch data loader loads pytorch dataset into batches.
# Without KFold
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (781600, 20, 7)
valid_data size: (195400, 20, 7)
test_data size: (1954, 20, 7)
number of features: (20, 7)


In [20]:
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)

init_process_kwargs = InitProcessGroupKwargs(timeout=timedelta(minutes=90))

accelerator = Accelerator(
        mixed_precision = 'no',
        kwargs_handlers=[ddp_kwargs, init_process_kwargs]
)

In [21]:
def trainer(fold, train_loader, valid_loader, model, config, optimizer, lr_scheduler, device):

    #criterion = nn.MSELoss(size_average=True)
    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    #sigmoid = nn.Sigmoid()
    # Define your optimization algorithm.
    # Check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # L2 regularization (optimizer(weight decay...) or implement by your self).
    #optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
    #optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9, weight_decay=config['weight_decay'])


    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []
        correct = 0
        val_correct = 0
        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            x, y = x.to(device), y.to(device)   # Move your data to device.
            pred = model(x)
            #print(f'pred size : {pred.shape}')
            #print(f'y size : {y.shape}')
            loss = criterion(pred, y)
            accelerator.backward(loss)                     # Compute gradient(backpropagation).
            accelerator.clip_grad_norm_(model.parameters(), config['clip_grad_norm'])
            optimizer.step()                    # Update parameters.
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.wait_for_everyone()
            step += 1
            loss_record.append(loss.detach().item())

            # For MSELoss
            #correct += (pred == y.T).float().sum()

            # For BCEWithLogitsLoss
            #print("y = ", y)
            #print("pred = ", pred)
            pred = pred > 0.5
            correct += (pred == y).float().sum()
            #print("correct = ", correct)

            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)
        #print('correct = ', correct)
        accuracy = 100 * correct / X_train.shape[0]

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

                # For MSELoss
                #val_correct += (pred == y.T).float().sum()

                # For BCEWithLogitsLoss
                #print("y = ", y)
                #print("pred = ", pred)
                pred = pred > 0.5
                val_correct += (pred == y).float().sum()

            loss_record.append(loss.item())

        mean_valid_loss = sum(loss_record)/len(loss_record)
        val_accuracy = 100 * val_correct / X_dev.shape[0]

        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train Acc: {accuracy:.4f}, Valid Acc: {val_accuracy:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'] + fold + '_model.ckpt') # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            print('\nBest Loss = ' + str(best_loss))
            return

In [None]:
model = My_Model(feature_dim=X_train.shape[-1]).to(device) # put your model and data on the same computation device.
optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], betas = config['betas'], weight_decay=config['weight_decay'])
lr_scheduler = get_scheduler(
        config['lr_schedule'],
        optimizer=optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=config['n_epochs']*len(train_loader),
)
model, optimizer, train_loader, valid_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, valid_loader, lr_scheduler)
trainer('none', train_loader, valid_loader, model, config, optimizer, lr_scheduler, device)

Epoch [1/2000]: 100%|██████████| 12213/12213 [03:32<00:00, 57.53it/s, loss=0.693]


Epoch [1/2000]: Train loss: 0.6935, Valid loss: 0.6931
Epoch [1/2000]: Train Acc: 55.8756, Valid Acc: 55.6919
Saving model with loss 0.693...


Epoch [2/2000]: 100%|██████████| 12213/12213 [03:21<00:00, 60.69it/s, loss=0.693]


Epoch [2/2000]: Train loss: 0.6931, Valid loss: 0.6931
Epoch [2/2000]: Train Acc: 55.8753, Valid Acc: 55.6919


Epoch [3/2000]:  50%|████▉     | 6072/12213 [01:43<03:15, 31.38it/s, loss=0.693]

In [None]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'raise'])
        for i, p in enumerate(preds):
            writer.writerow([stock_num_mod[i], p])

model = My_Model(feature_dim=X_train.shape[-1]).to(device)
model.load_state_dict(torch.load(config['save_path'] + 'none_model.ckpt'))
preds = predict(test_loader, model, device)
preds = preds > 0.5
save_pred(preds, 'none_pred.csv')

In [None]:
# KFold testing
model = My_Model(feature_dim=X_train.shape[-1]).to(device) # put your model and data on the same computation device.
kfold = KFold(n_splits=config['k_folds'], shuffle=True)

for fold, (train_ids, valid_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_subsampler = SubsetRandomSampler(train_ids)
    valid_subsampler = SubsetRandomSampler(valid_ids)
    train_loader = DataLoader(dataset, batch_size=config['batch_size'], sampler=train_subsampler,
                              num_workers=0, pin_memory=True)
    valid_loader = DataLoader(dataset, batch_size=config['batch_size'], sampler=valid_subsampler,
                              num_workers=0, pin_memory=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], betas = config['betas'], weight_decay=config['weight_decay'])
    lr_scheduler = get_scheduler(
                config['lr_schedule'],
                optimizer=optimizer,
                num_warmup_steps=config['warmup_steps'],
                num_training_steps=config['n_epochs']*len(train_loader),
    )

    model, optimizer, train_loader, valid_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, valid_loader, lr_scheduler)

    trainer('Fold_' + str(fold),train_loader, valid_loader, model, config, optimizer, lr_scheduler, device)

    model.load_state_dict(torch.load(config['save_path'] + 'Fold_' + str(fold) + '_model.ckpt'))
    preds = predict(test_loader, model, device)
    preds = preds > 0.5
    save_pred(preds, 'Fold_' + str(fold) + '_pred.csv')