In [1]:
import torch
from torch import nn, optim
from sklearn.preprocessing import StandardScaler

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import numpy as np

# Reproducibility: seed every random number generator used in this notebook
# (Python's `random`, NumPy, PyTorch) with the same value.
import random

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# cuDNN is switched off entirely; the benchmark/deterministic flags are set as well.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
def wape(y, y_hat):
    """Weighted absolute percentage error of `y_hat` against `y`, in percent.

    Computed as ``100 * sum(|y - y_hat|) / sum(|y|)``.
    """
    total_abs_error = np.sum(np.abs(y - y_hat))
    total_abs_target = np.sum(np.abs(y))
    return total_abs_error / total_abs_target * 100

In [3]:
class TimeSeriesDataSet(torch.utils.data.Dataset):
    """Sliding-window dataset built from a 2-D array of consecutive observations.

    Every sample pairs a window of `window_size` consecutive rows (transposed, so
    the window axis comes last) with the row that immediately follows the window
    (reshaped into a single column). Both parts are float32 tensors.
    """

    def __init__(self, init_data, window_size= 30):
        self.initial_data = init_data
        self.window_size = window_size

        # Filled eagerly by data_to_X_y(); each entry is a [window, target] pair.
        self.transformed_data = []

        self.data_to_X_y()

    def data_to_X_y(self):
        """Cut `initial_data` into (window, next row) pairs and store them."""
        width = self.window_size
        n_windows = len(self.initial_data) - width

        # A non-positive count (series not longer than the window) yields no samples.
        for start in range(n_windows):
            stop = start + width
            window = np.array(self.initial_data[start:stop, :], dtype= np.float32).T
            target = np.array(self.initial_data[stop], dtype= np.float32).reshape(-1, 1)
            self.transformed_data.append([torch.from_numpy(window), torch.from_numpy(target)])

    def __len__(self):
        return len(self.transformed_data)

    def __getitem__(self, item):
        pair = self.transformed_data[item]
        return pair[0], pair[1]

class LN_VAR(nn.Module):
    """Learned weighting of lagged values followed by a small fully connected stack.

    The input is multiplied element-wise by a trainable (features x lags) weight
    matrix and summed over the lag axis, giving one number per feature; that vector
    is then passed through `fc`.

    Args:
        first_part: pair ``(number of lags, number of input features)``.
        second_part: layer sizes of the fully connected stack; every consecutive
            pair becomes one ``nn.Linear``.
        use_dropout: when True, ``Dropout(0.2)`` follows every hidden activation.
            When False no dropout layer is created at all.
        device: device the whole module is moved to after construction.
    """

    def __init__(self, first_part, second_part, use_dropout= False, device= "cuda"):
        super().__init__()
        self.lags_val = first_part[0]
        self.in_features = first_part[1]

        # Must stay a registered nn.Parameter: calling `.to(device)` on the Parameter
        # itself returns a plain tensor whenever a copy is made (e.g. CPU -> CUDA),
        # which silently removes it from `parameters()` and `state_dict()`.
        # The module as a whole is moved to `device` at the end instead.
        self.feature_to_number = nn.Parameter(torch.randn(self.in_features, self.lags_val), requires_grad= True)
        self.act = nn.LeakyReLU()

        layers = []
        for i in range(1, len(second_part)):
            layers.append(nn.Linear(second_part[i - 1], second_part[i]))
            layers.append(self.act)
            if use_dropout:
                # Only add dropout when requested; previously it was appended
                # unconditionally, so `use_dropout=False` still applied dropout.
                layers.append(nn.Dropout(0.2))

        # Drop whatever trails the last Linear (activation, plus dropout if present)
        # so the output layer is a bare linear map.
        self.fc = nn.Sequential(*layers[:(-2 if use_dropout else -1)])

        self.to(device)

    def forward(self, X):
        """Map ``(..., features, lags)`` input to ``(..., out_features, 1)`` output."""
        # Reduce over the last (lag) axis so that both a single sample
        # (features, lags) and a batch (batch, features, lags) are handled.
        X = torch.sum(X * self.feature_to_number, dim= -1)

        return torch.unsqueeze(self.fc(X), dim= -1)

In [4]:
def train(model, criterion, optimizer, epochs, train_data, device= "cuda", checkpoint_path= "best_model.pt"):
    """Train `model` and checkpoint the weights with the lowest mean validation loss.

    Samples are visited one at a time, in index order, with one optimizer step per
    sample. After every epoch the mean validation loss is computed; whenever it
    improves, the model's state dict is written to `checkpoint_path`.

    Args:
        model: module to optimise; expected to live on `device` already.
        criterion: loss callable taking ``(predictions, targets)``.
        optimizer: optimizer whose ``step`` accepts a closure.
        epochs: number of passes over the training samples.
        train_data: dict with indexable, sized ``"train"`` and ``"val"`` entries
            that yield ``(input, target)`` tensor pairs.
        device: device the tensors are moved to before the forward pass.
        checkpoint_path: file the best state dict is saved to.

    Returns:
        The best (lowest) mean validation loss seen.

    Raises:
        ValueError: if the validation set is empty, since no best model could be
            selected (this used to surface as a ZeroDivisionError after a full
            training epoch).
    """
    train_loader = train_data["train"]
    val_loader = train_data["val"]

    if len(val_loader) == 0:
        raise ValueError("Validation set is empty: cannot compute a validation loss to select the best model.")

    best_eval_loss = torch.inf

    epochs_bar = tqdm(range(epochs), desc= "Epoch")
    for epoch in epochs_bar:
        model.train()
        for ind in range(len(train_loader)):
            X_train_bt, y_train_bt = train_loader[ind]
            # Move the sample once, outside the closure: optimizers that evaluate
            # the closure several times per step would otherwise repeat the copy.
            X_train_bt = X_train_bt.to(device)
            y_train_bt = y_train_bt.to(device)

            def closure():
                optimizer.zero_grad()

                train_preds = model(X_train_bt)
                train_loss = criterion(train_preds, y_train_bt)

                train_loss.backward()
                return train_loss

            optimizer.step(closure)

        avg_val_loss = 0

        model.eval()
        with torch.inference_mode():
            for ind in range(len(val_loader)):
                X_val_bt, y_val_bt = val_loader[ind]
                val_preds = model(X_val_bt.to(device))
                val_loss = criterion(val_preds, y_val_bt.to(device))
                # accumulate the per-sample loss
                avg_val_loss += val_loss.item()

        # mean over all validation samples
        avg_val_loss /= len(val_loader)

        if avg_val_loss < best_eval_loss:
            best_eval_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)

        epochs_bar.set_description(f"Epoch: {epoch + 1}/{epochs} Val loss: {avg_val_loss:.5f}. Best loss: {best_eval_loss:.5f}")

    return best_eval_loss

def eval(model, test_loader, metric, device= "cuda"):
    """Score `model` on every sample of `test_loader` with `metric`.

    The model is moved to `device`, switched to evaluation mode and run sample by
    sample under inference mode. Targets and predictions are flattened to 1-D
    arrays and handed to ``metric(targets, predictions)``, whose result is returned.

    Note: this function shadows the builtin `eval` within the notebook.
    """
    model = model.to(device)
    model.eval()

    predictions, targets = [], []
    n_samples = len(test_loader)

    with torch.inference_mode():
        # The bar is created disabled, so nothing is rendered.
        test_pbar = tqdm(range(n_samples), desc= "Batch", total= n_samples, disable= True)
        for ind in test_pbar:
            test_pbar.set_description(f"Test: {ind + 1}/{n_samples}")
            X_test_bt, y_test_bt = test_loader[ind]

            test_preds = model(X_test_bt.to(device))

            predictions.extend(test_preds.cpu().detach().numpy())
            targets.extend(y_test_bt.cpu().detach().numpy())

    flat_targets = np.array(targets).reshape(-1)
    flat_predictions = np.array(predictions).reshape(-1)
    return metric(flat_targets, flat_predictions)

In [5]:
def get_wapes(path):
    """Train and evaluate one LN_VAR model per data file found in `path`.

    For every file (processed in sorted name order, ``.DS_Store`` ignored):
      1. read it as CSV, drop rows containing NaN and the "Date"/"Volume" columns;
      2. split chronologically (no shuffling) into train / validation / test:
         the last 10% is test, the last 20% of the remainder is validation;
      3. standardise all three parts with statistics fitted on the training
         part only;
      4. train for 150 epochs and reload the best checkpoint;
      5. compute WAPE on the test windows (on the standardised values).

    Args:
        path: directory containing the CSV files.

    Returns:
        dict mapping the file name without its last 14 characters to the test WAPE.
    """
    if os.path.exists("best_model.pt"):
        os.remove("best_model.pt")

    result_dict = dict()

    # `os.listdir` order is unspecified: sort so that runs are repeatable.
    # ".DS_Store" is skipped when present instead of being removed unconditionally,
    # which raised ValueError on directories that do not contain it.
    entries = sorted(entry for entry in os.listdir(path) if entry != ".DS_Store")
    if not entries:
        return result_dict

    # Decide the device once, before any model is built, so that the model, the
    # training loop and the evaluation all agree (also on CPU-only machines).
    device = "cuda" if torch.cuda.is_available() else "cpu"

    pbar = tqdm(entries, total= len(entries), desc= "Company")
    # Iterate over the bar itself, otherwise it never advances.
    for file in pbar:
        # presumably strips a fixed 14-character file-name suffix — TODO confirm
        company = file[:-14]
        pbar.set_description(f"{company}")
        df = pd.read_csv(os.path.join(path, file)).dropna()
        data = df.drop(["Date", "Volume"], axis= 1)
        series = data.to_numpy()

        X_train, X_test = train_test_split(series, test_size= 0.1, shuffle= False)
        X_train, X_val = train_test_split(X_train, test_size= 0.2, shuffle= False)

        # Fit the scaler on the training part only; fitting on the full series
        # leaks validation/test statistics into training.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

        window_size = 10
        feature_number = X_train.shape[1]

        train_dataset = TimeSeriesDataSet(X_train, window_size)
        val_dataset = TimeSeriesDataSet(X_val, window_size)
        test_dataset = TimeSeriesDataSet(X_test, window_size)

        first = [window_size, feature_number]
        second = [feature_number, 4, feature_number]

        net_struct = [first, second]

        model = LN_VAR(*net_struct, device= device)
        criterion = nn.MSELoss()
        optimizer = optim.AdamW(model.parameters(), lr= 3e-4, amsgrad= True)#LBFGS(model.parameters(), history_size= 25)#
        metrics = wape
        epochs = 150

        train_params = {
            "model": model,
            "criterion": criterion,
            "optimizer": optimizer,
            "epochs": epochs,
            "train_data": {
                "train": train_dataset,
                "val": val_dataset
            },
            "device": device
        }
        train(**train_params)

        # Evaluate the best checkpoint rather than the weights of the last epoch.
        model = LN_VAR(*net_struct, device= device)
        model.load_state_dict(torch.load("best_model.pt", map_location= device))

        result_wape = eval(model, test_dataset, metrics, device)
        result_dict[company] = result_wape

        print(f"{company}: Test: {result_wape:.5f}")
        os.remove("best_model.pt")

    return result_dict

In [6]:
# Keyword arguments passed to get_wapes(), one dict per experiment.
# NOTE(review): both dicts are identical, so the "returns" experiment currently
# reads exactly the same data as the "prices" one — confirm what was intended.
us_prices = dict(path= "../Data/American Companies/")
us_returns = dict(path= "../Data/American Companies/")

In [None]:
# Train and evaluate one model per file under us_prices["path"]; the result maps
# each company key to its test WAPE.
result_us_prices = get_wapes(**us_prices)

Company:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch:   0%|          | 0/150 [00:00<?, ?it/s]

In [None]:
# NOTE(review): us_returns holds the same path as us_prices and get_wapes() takes
# no other argument, so this repeats the previous run on the same data — confirm
# whether a returns-specific input was intended.
result_us_returns = get_wapes(**us_returns)

In [None]:

# Build the summary table keyed by company. Both metric columns are looked up by
# company name instead of relying on the two result dicts sharing the same
# iteration order; a company missing from either dict raises KeyError rather
# than silently misaligning rows.
companies = list(result_us_prices)

us_result_df = pd.DataFrame({
    "Company": companies,
    "WAPE (price)": [result_us_prices[name] for name in companies],
    "WAPE (return)": [result_us_returns[name] for name in companies]
})

us_result_df.to_csv("ln_var_us.csv", index= False)

In [None]:
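# NOTE: disabled cell. get_wapes() in this notebook accepts only `path`; the
# `use_denoising` / `prices_not_returns` keys below would raise TypeError if re-enabled as is.
# ch_prices = {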
#     "path": "../Data/Chinese Companies/",
#     "use_denoising": False,
#     "prices_not_returns": True
# }
# ch_returns = {
#     "path": "../Data/Chinese Companies/",
#     "use_denoising": False,
#     "prices_not_returns": False
# }
#
# result_ch_prices = get_wapes(**ch_prices)
# result_ch_returns = get_wapes(**ch_returns)
#
# ch_result_df = pd.DataFrame({
#     "Company": result_ch_prices.keys(),
#     "WAPE (price)": result_ch_prices.values(),
#     "WAPE (return)": result_ch_returns.values()
# })
#
# ch_result_df.to_csv("wn_ch.csv", index= False)

# MSSA + WN

In [None]:
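# NOTE: disabled cell. get_wapes() in this notebook accepts only `path`; the
# `use_denoising` / `prices_not_returns` keys below would raise TypeError if re-enabled as is.
# us_prices = {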
#     "path": "../Data/American Companies/",
#     "use_denoising": True,
#     "prices_not_returns": True
# }
# us_returns = {
#     "path": "../Data/American Companies/",
#     "use_denoising": True,
#     "prices_not_returns": False
# }
#
# result_us_prices = get_wapes(**us_prices)
# result_us_returns = get_wapes(**us_returns)
#
# us_result_df = pd.DataFrame({
#     "Company": result_us_prices.keys(),
#     "WAPE (price)": result_us_prices.values(),
#     "WAPE (return)": result_us_returns.values()
# })
#
# us_result_df.to_csv("mssa_wn_us.csv", index=False)

In [None]:
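# NOTE: disabled cell. get_wapes() in this notebook accepts only `path`; the
# `use_denoising` / `prices_not_returns` keys below would raise TypeError if re-enabled as is.
# ch_prices = {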
#     "path": "../Data/Chinese Companies/",
#     "use_denoising": True,
#     "prices_not_returns": True
# }
# ch_returns = {
#     "path": "../Data/Chinese Companies/",
#     "use_denoising": True,
#     "prices_not_returns": False
# }
#
# result_ch_prices = get_wapes(**ch_prices)
# result_ch_returns = get_wapes(**ch_returns)
#
# ch_result_df = pd.DataFrame({
#     "Company": result_ch_prices.keys(),
#     "WAPE (price)": result_ch_prices.values(),
#     "WAPE (return)": result_ch_returns.values()
# })
#
# ch_result_df.to_csv("mssa_wn_ch.csv", index=False)