In [1]:
import os
import torch
import torch.utils.data as data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from models import AlphaNetV3
from dataset import StockDataset
from utils import AverageMeter
from data import TrainValData, TimeSeriesData
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [60]:
def preprocess_y(df_y):
    df_y_mean = df_y.mean(axis=1)
    df_y_std = df_y.std(axis=1)
    max_5_sigma = df_y_mean + 5 * df_y_std
    min_5_sigma = df_y_mean - 5 * df_y_std
    
    for row in range(df_y.shape[0]):
        df_y.iloc[row][df_y.iloc[row]>max_5_sigma.values[row]] = np.nan
        df_y.iloc[row][df_y.iloc[row]<min_5_sigma.values[row]] = np.nan

    df_y_q_33 = df_y.quantile(q=0.33, axis=1)
    df_y_q_66 = df_y.quantile(q=0.67, axis=1)

    for row in range(df_y.shape[0]):
    # row = 700
        rank_0 = (df_y.iloc[row] <= df_y_q_33.values[row])
        rank_1 = ((df_y.iloc[row]>df_y_q_33.values[row]) & (df_y.iloc[row]<=df_y_q_66.values[row]))
        rank_2 = (df_y.iloc[row]>df_y_q_66.values[row])

        df_y.iloc[row][rank_0] = 0
        df_y.iloc[row][rank_1] = 1
        df_y.iloc[row][rank_2] = 2


def to_one_hot(y, num_cls=3):
    one_hot_label = np.empty((len(y), num_cls))
    one_hot_label[:] = np.NaN
    y = y.astype(np.int64)
    for row in range(len(one_hot_label)):
        if y[row] >= 0 and y[row] < num_cls:
            label = np.zeros(num_cls)
            label[y[row]] = 1
            one_hot_label[row] = label
    return one_hot_label

In [61]:
data_path = '/work/bd/summer2022/insample/datacache'
dailydata = os.path.join(data_path, 'dailydata')
labeldata = os.path.join(data_path, 'labeldata')

df_close = pd.read_parquet(os.path.join(dailydata, 'close.parquet'))
df_high = pd.read_parquet(os.path.join(dailydata, 'high.parquet'))
df_low = pd.read_parquet(os.path.join(dailydata, 'low.parquet'))
df_open = pd.read_parquet(os.path.join(dailydata, 'open.parquet'))
df_tvrvalue = pd.read_parquet(os.path.join(dailydata, 'tvrvalue.parquet'))
df_tvrvolume = pd.read_parquet(os.path.join(dailydata, 'tvrvolume.parquet'))

df_y = pd.read_parquet(os.path.join(labeldata, 'Y_0.parquet'))
preprocess_y(df_y=df_y)

features = [df_open, df_high, df_low, df_tvrvalue, df_tvrvolume, df_close]
stock_data_list = []

stocks = df_open.columns.values
for stock in stocks:
    one_stock_features = []
    for feature in features:
        one_stock_features.append(feature[stock].values[:-2].reshape(-1, 1))
    stock_np_features = np.concatenate(one_stock_features, axis=1)
    dates = feature.index.values[:-2]
    # labels = np.zeros(len(df_y[stock].values[1:]), 3)
    # labels[:, ]
    # labels = df_y[stock].values[1:]
    labels = to_one_hot(df_y[stock].values[1:])
    # print(stock_np_features.shape, dates.shape, labels.shape)
    stock_data_list.append(TimeSeriesData(dates=dates, data=stock_np_features, labels=labels))

train_val_data = TrainValData(time_series_list=stock_data_list, train_length=800, validate_length=150, history_length=10, train_val_gap=10, sample_step=1)
train, val, dates_info = train_val_data.get(20180102, order='by_date')

val_dataset = StockDataset(stock_data=val[0], stock_label=val[1])
val_loader = data.DataLoader(val_dataset, batch_size=4096, shuffle=False)

In [62]:
train_dataset = StockDataset(stock_data=train[0], stock_label=train[1])
train_loader = data.DataLoader(train_dataset, batch_size=4096, shuffle=False)

In [63]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = AlphaNetV3(feat_dim=6, hidden_dim=30, num_layers=2, dropout=0.0, num_classes=3)
ckpt = torch.load('results/baseline_pce_tm_4.5_lr0.001/best_model.ckpt', map_location='cpu')
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss(reduction='mean')

In [64]:
model.eval()

losses = AverageMeter()
acces = AverageMeter()
p_outputs = []
p_ys = []
preds = []
labels = []
with torch.no_grad():
    for i, (feat, label) in enumerate(val_loader):
        feat = feat.to(device).to(torch.float32)
        label = label.to(device).to(torch.float32)

        output = model(feat)
        loss = criterion(output, label)
        prediction = output.argmax(dim=1)
        labelindex = label.argmax(dim=1)
        acc = accuracy_score(labelindex.cpu().numpy(), prediction.cpu().numpy())
        acces.update(acc, feat.size(0))

        p_outputs.append(output.cpu().numpy())
        p_ys.append(label.cpu().numpy())
        
        preds.append(prediction.cpu().numpy())
        labels.append(labelindex.cpu().numpy())

        losses.update(loss.item(), feat.size(0))
ret = {'val_loss': losses.avg}
print(ret)

{'val_loss': 1.0853035265463058}


In [65]:
# calculate correlation
# p_output = np.concatenate(p_outputs, axis=0)
# p_y = np.concatenate(p_ys, axis=0)
# print(len(p_y))
# corr = 0
# for i in range(len(p_y)):
#     corr += np.corrcoef(p_output[i, :], p_y[i, :])[0, 1]
# correlation = corr/len(p_y)
# print(f"correlation: {correlation}")


In [66]:
total_preds = np.concatenate(preds, axis=0)
total_labels = np.concatenate(labels, axis=0)

In [67]:
idx = labelindex == 1
output[idx]

tensor([[ 0.2924,  0.1035,  0.2921],
        [ 0.2060, -0.2860,  0.2026],
        [-0.0027,  0.0092, -0.0009],
        ...,
        [ 0.2408, -0.6489,  0.2204],
        [ 0.3515, -0.1712,  0.3376],
        [ 0.1205, -0.1768,  0.1192]], device='cuda:0')

In [68]:
f1 = f1_score(total_labels, total_preds, average='macro')
recall = recall_score(total_labels, total_preds, average='macro')
precision = precision_score(total_labels, total_preds, average='macro')
print(f"f1_score: {f1}, recall: {recall}, precision: {precision}, accuracy: {acces.avg}")

f1_score: 0.3804113594467117, recall: 0.39301411655230734, precision: 0.41423833991569414, accuracy: 0.392152034503856


In [69]:
pred_classes_num = []
label_classes_num = []
acc_classes = []
f1_classes = []
recall_classes = []
precision_classes = []

for i in range(3):
    preds_i = (total_preds==i)
    labels_i = (total_labels==i)
    pred_classes_num.append((preds_i).sum())
    label_classes_num.append((labels_i).sum())
    acc_classes.append(accuracy_score(labels_i, preds_i))
    f1_classes.append(f1_score(labels_i, preds_i))
    recall_classes.append(recall_score(labels_i, preds_i))
    precision_classes.append(precision_score(labels_i, preds_i))

    
print(pred_classes_num)
print(label_classes_num)
print(acc_classes)
print(f1_classes)
print(recall_classes)
print(precision_classes)

[348521, 114367, 199067]
[219132, 224969, 217854]
[0.5490312785612315, 0.6730019412195694, 0.5622708492269112]
[0.4741135869976905, 0.3621130678737299, 0.30500742346871473]
[0.6140864866838253, 0.273099849312572, 0.29185601366052494]
[0.38610585875743497, 0.5372091599849607, 0.31940000100468685]


In [70]:
cm = confusion_matrix(total_labels, total_preds)
cm

array([[134566,  23965,  60601],
       [ 88646,  61439,  74884],
       [125309,  28963,  63582]])

In [71]:
label = 1
tensor([[0.3568, 0.3054, 0.3379],
        [0.3461, 0.2822, 0.3717],
        [0.2830, 0.4342, 0.2828],
        [0.3920, 0.2397, 0.3684],
        [0.3633, 0.2465, 0.3902],
        [0.4089, 0.2402, 0.3509]], device='cuda:0')

NameError: name 'tensor' is not defined