In [1]:
import torch
from torch import nn, optim
from Functions.SSA import SSA
from Functions.lions_optimizer import Lion

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import numpy as np

# воспроизводимость
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
np.random.seed(42)
import random
random.seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [2]:
def wape(y, y_hat):
    return np.sum(np.abs(y - y_hat)) / np.sum(np.abs(y)) * 100

def mape(y, y_hat, eps= 1e-7):
    return np.mean(np.abs((y - y_hat) / (y + eps))) * 100

def series_to_X_y(series, window_size= 30):
    X = []
    y = []
    for i in range(len(series) - window_size):
        row = list(map(lambda a: a, series[i:i + window_size]))
        X.append(np.array(row))
        col = [series[i + window_size]]
        y.append(np.array(col))

    return np.array(X), np.array(y)

In [3]:
class MinMaxScaler():
    def __init__(self):
        self.min = 0
        self.max = 1

    def fit(self, data):
        self.min = data.min()
        self.max = data.max()

    def transform(self, data):
        return (data - self.min) / (self.max - self.min)

    def inverse_transform(self, data):
        return (self.max - self.min) * data + self.min

    def __str__(self):
        return f"Min: {self.min}\nMax: {self.max}"

class StandardScaler():
    def __init__(self):
        self.mean = 0
        self.std = 1

    def fit(self, data):
        self.mean = data.mean()
        self.std = data.std()

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return self.std * data + self.mean

    def __str__(self):
        return f"Mean: {self.mean}\nStd: {self.std}"

class MexicanHat(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return (1 - X ** 2) * torch.exp(-1 / 2 * X ** 2)

class WN(nn.Module):
    def __init__(self, input_features, number_of_wavelons= 1):
        super(WN, self).__init__()
        # сдвиг
        self.translation = torch.nn.Parameter(torch.rand([input_features, number_of_wavelons], requires_grad= True))
        # сжатие
        self.dilation = torch.nn.Parameter(torch.rand([input_features, number_of_wavelons], requires_grad= True))

        # веса линейной части
        self.lm_weights = torch.nn.Parameter(torch.rand(1 + input_features, requires_grad= True))
        # веса wavelet части
        self.wn_weights = torch.nn.Parameter(torch.rand(number_of_wavelons, requires_grad= True))

        self.number_of_wavelons = number_of_wavelons
        self.input_features = input_features

        self.activation_function = MexicanHat()

    def forward(self, X, device= "cpu"):
        output = torch.zeros(X.shape[0]).to(device)

        for i in range(len(X)):
            vector = X[i]
            J = torch.concat([torch.unsqueeze(vector, dim= 1) for _ in range(self.number_of_wavelons)], dim= 1).to(device)
            Z = (J - self.translation.to(device)) / self.dilation.to(device)

            Z_new = self.activation_function(Z)

            Z_new_trans = torch.t(Z_new)
            Z_final = torch.prod(Z_new_trans, dim= 1)

            wavelet_net = torch.dot(Z_final, self.wn_weights.to(device))
            linear_net = torch.dot(torch.concat([torch.tensor([1], dtype= torch.float32).to(device), vector.to(device)]), self.lm_weights.to(device))
            output[i] = wavelet_net + linear_net

        return torch.unsqueeze(output, dim= 1)

class TimeSeriesDataSet(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]

In [4]:
def get_ewma(data, window):

    alpha = 2 /(window + 1.0)
    alpha_rev = 1-alpha
    n = data.shape[0]

    pows = alpha_rev**(np.arange(n+1))

    scale_arr = 1/pows[:-1]
    offset = data[0]*pows[1:]
    pw0 = alpha*alpha_rev**(n-1)

    mult = data*pw0*scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums*scale_arr[::-1]
    return out

def get_all_ewma(y, alpha= 2, n= 5):
    v = []
    beta = 1 - alpha / (n + 1)

    for i in range(1, len(y) + 1):
        betas = (beta * np.ones(i)) ** np.arange(i)
        addition = np.dot(betas, (y[:i])[::-1])
        new_value = (1 - beta) * addition
        correction = (1 - beta ** (i)) ** (-1)
        v.append(new_value * correction)

    return np.array(v)

In [5]:
def train(model, criterion, optimizer, epochs, train_data, device= "cpu"):

    model = model.to(device)
    best_model = None
    best_eval_loss = torch.inf

    train_loader = train_data["train"]
    val_loader = train_data["val"]

    epochs_bar = tqdm(range(epochs), desc= "Epoch")
    for epoch in epochs_bar:

        avg_train_loss = 0
        avg_train_metric = 0
        train_len = 0
        standard_len = None

        model.train()
        for ind, (X_train_bt, y_train_bt) in enumerate(train_loader):
            optimizer.zero_grad()

            train_preds = model(X_train_bt.to(device), device= device)
            train_loss = criterion(train_preds, y_train_bt.to(device))

            # ingroup
            avg_train_loss += train_loss.item()

            if standard_len is None:
                standard_len = X_train_bt.shape[0]

            train_len += (X_train_bt.shape[0] / standard_len)

            model.train()
            train_loss.backward()
            optimizer.step()

        # over group
        avg_train_loss /= train_len
        avg_train_metric /= train_len

        avg_val_loss = 0
        val_len = 0
        standard_len = None

        model.eval()
        with torch.inference_mode():
            for ind, (X_val_bt, y_val_bt) in enumerate(val_loader):
                val_preds = model(X_val_bt.to(device), device= device)
                val_loss = criterion(val_preds, y_val_bt.to(device))
                # ingroup
                avg_val_loss += val_loss.item()

                if standard_len is None:
                    standard_len = X_val_bt.shape[0]

                val_len += (X_val_bt.shape[0] / standard_len)

        # over group
        avg_val_loss /= val_len

        ans = avg_val_loss < best_eval_loss
        if ans:
            best_eval_loss = avg_val_loss
            best_model = model.state_dict()
            torch.save(best_model, "best_model.pt")

        epochs_bar.set_description(f"Epoch: {epoch + 1}/{epochs} Val loss: {avg_val_loss:.5f}. Best loss: {best_eval_loss:.5f}")

def eval(model, test_loader, metric, device= "cpu"):
    model = model.to(device)

    answer = []
    y = []

    model.eval()
    with torch.inference_mode():
        test_pbar = tqdm(enumerate(test_loader), desc= "Batch", total= len(test_loader), disable= True)
        for ind, (X_test_bt, y_test_bt) in test_pbar:
            test_pbar.set_description(f"Test: {ind + 1}/{len(test_loader)}")

            test_preds = model(X_test_bt.to(device), device= device)

            answer += list(test_preds.cpu().detach().numpy())
            y += list(y_test_bt.cpu().detach().numpy())

    return metric(np.array(y).reshape(-1), np.array(answer).reshape(-1))

In [6]:
def get_wapes(path, country, prices_not_returns, windows_size, neurons, metrics, epochs, val_size, test_size, device, use_denoising, use_ewma):

    checker = 0

    result_dict = dict()

    entries = os.listdir(path)
    entries.remove(".DS_Store")

    pbar_company = tqdm(entries)
    for file in pbar_company:
        pbar_company.set_description(f"{country}: Epoch")
        df = pd.read_csv(path + file if path[-1] == r"/" else path + r"/" + file).dropna()
        series = np.array(df.Open) if prices_not_returns else np.array(df.Open.pct_change())[1:]
        series_initial = series[:]

        if use_denoising:
            _, series = SSA.multiple_stage_denoising(series, max_iter= 50)

        X, _ = series_to_X_y(series, windows_size)
        _, y = series_to_X_y(series_initial, windows_size)

        if use_ewma:
            ewmas = np.asmatrix([get_all_ewma(series_initial, 2, i) for i in range(2, 30)])
            X = np.concatenate([X, ewmas[:, windows_size - 1:-1].T], axis= 1)
            if checker == 0:
                neurons["input_dim"] += (X.shape[-1] - windows_size)
                checker = 1

        X = torch.from_numpy(X).to(device).to(torch.float32)
        y = torch.from_numpy(y).to(device).to(torch.float32)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, shuffle= False)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= val_size, shuffle= False)

        # скалирование
        scaler = MinMaxScaler()

        scaler.fit(torch.concat((X_train[0].reshape(-1, 1), y_train, y_val), dim= 0))

        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        y_train = scaler.transform(y_train)
        y_val = scaler.transform(y_val)
        y_test = scaler.transform(y_test)
        #

        X_train = X_train.to(device)
        X_val = X_val.to(device)
        X_test = X_test.to(device)

        y_train = y_train.to(device)
        y_val = y_val.to(device)
        y_test = y_test.to(device)

        if checker == 0:
            neurons = [windows_size] + neurons
            checker = 1

        model = WN(*neurons).to(device)
        criterion = nn.MSELoss()
        optimizer = Lion(model.parameters(), lr= 3e-4 / 2)

        #### Обучение
        train_dataset = TimeSeriesDataSet(X_train, y_train)
        val_dataset = TimeSeriesDataSet(X_val, y_val)
        test_dataset = TimeSeriesDataSet(X_test, y_test)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
                batch_size= 256,
            shuffle= False
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
                batch_size= 256,
            shuffle= False
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
                batch_size= 256,
            shuffle= False,
        )

        train_params = {
            "model": model,
            "criterion": criterion,
            "optimizer": optimizer,
            "epochs": epochs,
            "train_data": {
                "train": train_loader,
                "val": val_loader
            },
            "device": device
        }
        train(**train_params)

        #### Вычисление
        model = WN(*neurons)
        model.load_state_dict(torch.load("best_model.pt"))

        metrics_val = eval(model, test_loader, metrics, device= device)
        ####

        result_dict[file[:-14]] = metrics_val

        print(f"{file[:-14]}: Test error: {result_dict[file[:-14]]:.2f}%")
        os.remove("best_model.pt")


    return result_dict


In [7]:
params_us_prices = {
    "path": "../Data/American Companies",
    "country": "US",
    "prices_not_returns": True,

    "windows_size": 5,
    "neurons": [5],

    "metrics": wape,

    "epochs": 500,
    "val_size": 0.2,
    "test_size": 0.1,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_denoising": False,
    "use_ewma": False
}

params_us_returns = {
    "path": "../Data/American Companies",
    "country": "US",
    "prices_not_returns": False,

    "windows_size": 100,
    "output_size": 1,
    "neurons": {
        "hid_dim": 350,
        "num_layers": 2,
        "type": "lstm",
        "dropout": 0.0
    },

    "metrics": wape,

    "epochs": 10_000,
    "val_size": 0.2,
    "test_size": 0.1,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_denoising": True,
    "use_ewma": False
}

params_ch_prices = {
    "path": "../Data/Chinese Companies",
    "country": "CH",
    "prices_not_returns": True,

    "windows_size": 100,
    "output_size": 1,
    "neurons": {
        "hid_dim": 350,
        "num_layers": 2,
        "type": "gru",
        "dropout": 0.0
    },

    "metrics": wape,

    "epochs": 10_000,
    "val_size": 0.2,
    "test_size": 0.1,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_denoising": True,
    "use_ewma": False
}

params_ch_returns = {
    "path": "../Data/Chinese Companies",
    "country": "CH",
    "prices_not_returns": False,

    "windows_size": 100,
    "output_size": 1,
    "neurons": {
        "hid_dim": 350,
        "num_layers": 2,
        "type": "gru",
        "dropout": 0.0
    },

    "metrics": wape,

    "epochs": 10_000,
    "val_size": 0.2,
    "test_size": 0.1,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_denoising": True,
    "use_ewma": False
}

In [8]:
result_dict_prices_us = get_wapes(**params_us_prices)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch:   0%|          | 0/500 [00:00<?, ?it/s]

AMZN: Test error: 62.36%


<class 'TypeError'>: WN.__init__() takes from 2 to 3 positional arguments but 4 were given

In [None]:
# result_dict_returns_us = get_wapes(**params_us_returns)

In [None]:
# result_dict_prices_ch = get_wapes(**params_ch_prices)

In [None]:
# result_dict_returns_ch = get_wapes(**params_ch_returns)

In [None]:
# us_result = pd.DataFrame({
#     "Company": result_dict_prices_us.keys(),
#     "WAPE (price)": result_dict_prices_us.values(),
#     "WAPE (return)": result_dict_returns_us.values()
# })
#
# ch_result = pd.DataFrame({
#     "Company": result_dict_prices_ch.keys(),
#     "WAPE (price)": result_dict_prices_ch.values(),
#     "WAPE (return)": result_dict_returns_ch.values()
# })

In [None]:
# us_result

In [None]:
# round(us_result.describe(), 2)

In [None]:
# ch_result

In [None]:
# round(ch_result.describe(), 2)

In [None]:
# us_result.to_csv("wn_us_price.csv", index= False)
# ch_result.to_csv("wn_ch_price.csv", index= False)