In [10]:
import numpy as np
from model.ASU import ASU, LiteTCN
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os

# stocks_data = np.load('/Users/pujanmaharjan/uni adelaide/research project/DeepTrader/src/data/DJIA/stocks_data.npy')

# num_records (time_ids) / batch, window_length, stocks, features
# print('stock_data dimension ', stocks_data.shape)

class CustomStockDataset(Dataset):
    def __init__(self, stocks_data):
        self.stocks_data = stocks_data

    def __len__(self):
        return len(self.stocks_data)

    def __getitem__(self, idx):
        # print('stocks data shape ', self.stocks_data.shape)
        row_data = self.stocks_data[idx]
        # print('row_data shape ', row_data.shape)
        # print('row_data', row_data)
        features = row_data[0][:,1:]
        # print('features ', features)
        targets = row_data[0][:,0]
        # print('targets ', targets)
        return features, targets

# '/Users/pujanmaharjan/uni adelaide/research project/DeepTrader/src/data/DJIA/stocks_data.npy
def run_tcn(stocks_data_np_file_path, stock_ids, tcn_output_file_path, number_of_output_features = 2):
    stocks_data = np.load(stocks_data_np_file_path)
    stock_dataset = CustomStockDataset(stocks_data)
    stock_dataloader = DataLoader(stock_dataset, batch_size=3, shuffle=False)

    # num_records (time_ids), window_length (batch), stocks, features
    num_stocks = stocks_data.shape[2]
    num_features = stocks_data.shape[3]
    stocks_data_torch = torch.from_numpy(stocks_data).float()

    print('stocks_data_torch ', stocks_data_torch.shape)
    # print(' 957 time_ids, window length = 4, stocks = 2, features = 8, shape is 9 because of 1 target value')

    # number of features - target
    in_features_count = stocks_data.shape[3] - 1
    tcn = LiteTCN(in_features=in_features_count,
                hidden_size=10,
                num_layers=3,
                output_size=num_stocks * number_of_output_features).float()

    all_output = []
    for x, y in stock_dataloader:
        # print('shape of x ', x.shape)
        # print('batch / time_ids, stocks, features')
        if len(x) == 1:
            # print('x has single row')
            continue
        tcn_output = tcn.forward(x.float())
        # print('tcn output ', tcn_output.shape)
        tcn_output_np = tcn_output.detach().numpy()
        for i in range(tcn_output_np.shape[0]):
            tcn_op_val = tcn_output_np[i]
            all_output.append(tcn_op_val)


    all_output_list = []
    # stock_ids = [0,1,2]
    counter = 0
    for a in all_output:
        counter += 1
        row_value = {'time_id': counter}
        for stock_id in stock_ids:
            for i in range(0, len(a), number_of_output_features):
                for j in range(number_of_output_features):
                    stock = 'stock_' + str(stock_id) + '_x_' + str(j)
                    row_data = a[ i + j]
                    if type(row_data) is np.ndarray:
                        row_data= row_data[0]
                    row_value[stock] = row_data

        all_output_list.append(row_value)

    all_output_df = pd.DataFrame(all_output_list)
    all_output_df.to_csv(tcn_output_file_path, index=False)
    return all_output_df

def get_target_data(stock_ids):
    train_path = '/Users/pujanmaharjan/uni adelaide/uofa_research_project/datasets/optiver-realized-volatility-prediction/train.csv'
    train_data = pd.read_csv(train_path)
    return train_data[train_data['stock_id'].isin(stock_ids)]

def get_stocks_data_targets(stock_ids):
    train_data_stocks = get_target_data(stock_ids)
    stock_data_new = []
    time_ids = train_data_stocks['time_id'].unique()
    for t in time_ids:
        stock_data_row = {'time_id': t}
        for stock_id in stock_ids:
            stock_data_column_key = 'stock_' + str(stock_id)
            stock_data_row[stock_data_column_key] = train_data_stocks[(train_data_stocks['stock_id'] == stock_id) & (train_data_stocks['time_id'] == t)]['target'].values[0]

        stock_data_new.append(stock_data_row)

    return pd.DataFrame(stock_data_new)

def prepare_stock_data_tcn(tcn_data_path, stock_ids, stock_data_targets):
    tcn_target = pd.read_csv(tcn_data_path)
    for stock_id in stock_ids:
        target_data_column_key = 'stock_' + str(stock_id)
        tcn_column_key = target_data_column_key + '_y'
        tcn_target[tcn_column_key] = stock_data_targets[target_data_column_key]

    return tcn_target



In [3]:
data_dir = '/Users/pujanmaharjan/uni adelaide/research project/realized-volatility/data/'


In [18]:
def prepare_tcn_data_with_targets(stock_ids, file_prefix, stock_data_np_file_path):
    stock_tcn_path = os.path.join(data_dir, file_prefix + '_stock_data_tcn.csv')
    stock_tcn = run_tcn(stock_data_np_file_path, stock_ids, stock_tcn_path)
    stock_data_tcn_targets = get_stocks_data_targets(stock_ids)
    stock_tcn_with_targets = prepare_stock_data_tcn(stock_tcn_path, stock_ids, stock_data_tcn_targets)
    print(file_prefix, 'stock_tcn_with_targets ', stock_tcn_with_targets.shape)
    stock_tcn_target_path = os.path.join(data_dir, file_prefix + '_stock_data_tcn_targets.csv')
    stock_tcn_with_targets.to_csv(stock_tcn_target_path, index=False)
    return stock_tcn_with_targets

In [20]:
stock_ids_similar_rv = [0,7,11,16]
similar_rv_tcn_targets = prepare_tcn_data_with_targets(stock_ids_similar_rv, 'similar', 
    os.path.join(data_dir, 'stock_multi_dimensional_similar_rv.npy'))
similar_rv_tcn_targets.head()

stocks_data_torch  torch.Size([3829, 1, 4, 15])
similar stock_tcn_with_targets  (3828, 13)


Unnamed: 0,time_id,stock_0_x_0,stock_0_x_1,stock_7_x_0,stock_7_x_1,stock_11_x_0,stock_11_x_1,stock_16_x_0,stock_16_x_1,stock_0_y,stock_7_y,stock_11_y,stock_16_y
0,1,0.270186,0.21178,0.270186,0.21178,0.270186,0.21178,0.270186,0.21178,0.004136,0.003624,0.005918,0.005485
1,2,0.45295,0.489015,0.45295,0.489015,0.45295,0.489015,0.45295,0.489015,0.001445,0.002458,0.001634,0.003172
2,3,0.4538,0.354311,0.4538,0.354311,0.4538,0.354311,0.4538,0.354311,0.002168,0.002178,0.003923,0.002653
3,4,0.284175,0.540102,0.284175,0.540102,0.284175,0.540102,0.284175,0.540102,0.002195,0.002149,0.003581,0.001897
4,5,0.570389,0.444956,0.570389,0.444956,0.570389,0.444956,0.570389,0.444956,0.001747,0.002203,0.00315,0.002184


In [21]:
stock_ids_dissimilar_rv = [0,3,9,10]
dissimilar_rv_tcn_targets = prepare_tcn_data_with_targets(stock_ids_dissimilar_rv, 'dissimilar', 
    os.path.join(data_dir, 'stock_multi_dimensional_dissimilar_rv.npy'))
dissimilar_rv_tcn_targets.head()

stocks_data_torch  torch.Size([3829, 1, 4, 15])
dissimilar stock_tcn_with_targets  (3828, 13)


Unnamed: 0,time_id,stock_0_x_0,stock_0_x_1,stock_3_x_0,stock_3_x_1,stock_9_x_0,stock_9_x_1,stock_10_x_0,stock_10_x_1,stock_0_y,stock_3_y,stock_9_y,stock_10_y
0,1,0.52052,0.71612,0.52052,0.71612,0.52052,0.71612,0.52052,0.71612,0.004136,0.0053,0.007291,0.005707
1,2,0.544387,0.679593,0.544387,0.679593,0.544387,0.679593,0.544387,0.679593,0.001445,0.002774,0.002529,0.002352
2,3,0.450084,0.647569,0.450084,0.647569,0.450084,0.647569,0.450084,0.647569,0.002168,0.002986,0.003299,0.002363
3,4,0.470801,0.704983,0.470801,0.704983,0.470801,0.704983,0.470801,0.704983,0.002195,0.004437,0.003696,0.002341
4,5,0.50203,0.627292,0.50203,0.627292,0.50203,0.627292,0.50203,0.627292,0.001747,0.003408,0.003689,0.002007
