In [1]:
import torch
from torch import nn, optim
from Functions.SSA import SSA
from Functions.lions_optimizer import Lion


import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import numpy as np

# воспроизводимость
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
np.random.seed(42)
import random
random.seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [2]:
def wape(y, y_hat):
    return np.sum(np.abs(y - y_hat)) / np.sum(np.abs(y)) * 100

def series_to_X_y(series, window_size):
    X = []
    y = []
    for i in range(len(series) - window_size):
        row = list(map(lambda a: a, series[i:i + window_size]))
        X.append(np.array(row))
        col = [series[i + window_size]]
        y.append(np.array(col))

    return np.array(X), np.array(y)

In [3]:
class MinMaxScaler():
    def __init__(self):
        self.min = 0
        self.max = 1

    def fit(self, data):
        self.min = data.min()
        self.max = data.max()

    def transform(self, data):
        return (data - self.min) / (self.max - self.min)

    def inverse_transform(self, data):
        return (self.max - self.min) * data + self.min

    def __str__(self):
        return f"Min: {self.min}\nMax: {self.max}"

class StandardScaler():
    def __init__(self):
        self.mean = 0
        self.std = 1

    def fit(self, data):
        self.mean = data.mean()
        self.std = data.std()

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return self.std * data + self.mean

    def __str__(self):
        return f"Mean: {self.mean}\nStd: {self.std}"

class MexicanHat(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return (1 - X ** 2) * torch.exp(-1 / 2 * X ** 2)

class TimeSeriesDataSet(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]

In [4]:
def train(model, criterion, optimizer, epochs, train_data, device= "cuda"):

    best_eval_loss = torch.inf

    train_loader = train_data["train"]
    val_loader = train_data["val"]

    epochs_bar = tqdm(range(epochs), desc= "Epoch")
    for epoch in epochs_bar:
        model.train()
        for ind, (X_train_bt, y_train_bt) in enumerate(train_loader):
            def closure():
                optimizer.zero_grad()

                train_preds = model(X_train_bt.to(device), device= device)
                train_loss = criterion(train_preds, y_train_bt.to(device))

                model.train()
                train_loss.backward()
                return train_loss

            optimizer.step(closure)

        avg_val_loss = 0
        val_len = 0
        standard_len = None

        model.eval()
        with torch.inference_mode():
            for ind, (X_val_bt, y_val_bt) in enumerate(val_loader):
                val_preds = model(X_val_bt.to(device), device= device)
                val_loss = criterion(val_preds, y_val_bt.to(device))
                # ingroup
                avg_val_loss += val_loss.item()

                if standard_len is None:
                    standard_len = X_val_bt.shape[0]

                val_len += (X_val_bt.shape[0] / standard_len)

        # over group
        avg_val_loss /= val_len

        if avg_val_loss < best_eval_loss:
            best_eval_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pt")

        epochs_bar.set_description(f"Epoch: {epoch + 1}/{epochs} Val loss: {avg_val_loss:.5f}. Best loss: {best_eval_loss:.5f}")

def eval(model, test_loader, metric, device= "cuda"):
    model = model.to(device)

    answer = []
    y = []

    model.eval()
    with torch.inference_mode():
        test_pbar = tqdm(enumerate(test_loader), desc= "Batch", total= len(test_loader), disable= True)
        for ind, (X_test_bt, y_test_bt) in test_pbar:
            test_pbar.set_description(f"Test: {ind + 1}/{len(test_loader)}")

            test_preds = model(X_test_bt.to(device), device= device)

            answer += list(test_preds.cpu().detach().numpy())
            y += list(y_test_bt.cpu().detach().numpy())

    return metric(np.array(y).reshape(-1), np.array(answer).reshape(-1))

In [5]:
class WN(nn.Module):
    def __init__(self, in_features, out_features, number_of_wavelons= 1):
        super(WN, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.number_of_wavelons = number_of_wavelons
        # инициализация сдвига/растяжения/материнского wavelet'а
        self.act = MexicanHat()
        self.Tr = torch.nn.Parameter(torch.rand([number_of_wavelons, in_features], requires_grad= True)).to("cuda")
        self.Dil = torch.nn.Parameter(torch.rand([number_of_wavelons, in_features], requires_grad= True)).to("cuda")
        # для линейной модели
        self.fc_linear = nn.Linear(self.in_features,self.out_features,bias= True).to("cuda")
        # для wavelon'ов
        self.fc_wn = nn.Linear(self.number_of_wavelons,self.out_features,bias= False).to("cuda")

    def forward(self, x_array, device= "cuda"):
        answer = torch.zeros(x_array.shape[0]).to(device)
        for ind, x in enumerate(x_array):
            X = x.repeat(self.number_of_wavelons, 1)

            Z = X * self.Dil + self.Tr
            Z_hat = self.act(Z)
            z_hat = torch.prod(Z_hat, dim= 1)
            answer[ind] = self.fc_wn(z_hat) + self.fc_linear(x)

        return answer.reshape(-1, 1)

In [6]:
def get_wapes(path, use_denoising, prices_not_returns):
    if "best_model.pt" in os.listdir("."):
        os.remove("best_model.pt")

    result_dict = dict()

    entries = os.listdir(path)
    entries.remove(".DS_Store")

    pbar = tqdm(entries, total= len(entries), desc= "Company")
    for file in entries:
        pbar.set_description(f"{file[:-14]}")
        df = pd.read_csv(path + file if path[-1] == r"/" else path + r"/" + file).dropna()
        series = np.array(df.Open, dtype= np.float32) if prices_not_returns else np.array(df.Open.pct_change(), dtype= np.float32)[1:]
        series_initial = series[:]

        scaler = StandardScaler()
        scaler.fit(series)
        series = scaler.transform(series)

        if use_denoising:
            _, series = SSA.multiple_stage_denoising(series, max_iter= 50)

        X, _ = series_to_X_y(series, 10)
        _, y = series_to_X_y(series_initial, 10)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, shuffle= False)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.2, shuffle= False)

        net_struct = [X_train.shape[1], 1, 5]

        model = WN(*net_struct)
        criterion = nn.MSELoss()
        optimizer = optim.LBFGS(model.parameters(), history_size= 20)#AdamW(model.parameters(), lr= 3e-4, amsgrad= True)#
        metrics = wape
        device = "cuda" if torch.cuda.is_available() else "cpu"
        epochs = 30

        train_dataset = TimeSeriesDataSet(X_train, y_train)
        val_dataset = TimeSeriesDataSet(X_val, y_val)
        test_dataset = TimeSeriesDataSet(X_test, y_test)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size= 256, shuffle= False)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size= 256, shuffle= False)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size= 256, shuffle= False)

        train_params = {
            "model": model,
            "criterion": criterion,
            "optimizer": optimizer,
            "epochs": epochs,
            "train_data": {
                "train": train_loader,
                "val": val_loader
            },
            "device": device
        }
        train(**train_params)

        model = WN(*net_struct)
        model.load_state_dict(torch.load("best_model.pt"))

        result_wape = eval(model, test_loader, metrics, device)
        result_dict[file[:-14]] = result_wape

        print(f"{file[:-14]}: Test: {result_wape:.5f}")
        os.remove("best_model.pt")

    return result_dict

In [None]:
us_prices = {
    "path": "../Data/American Companies/",
    "use_denoising": False,
    "prices_not_returns": True
}
us_returns = {
    "path": "../Data/American Companies/",
    "use_denoising": False,
    "prices_not_returns": False
}

result_us_prices = get_wapes(**us_prices)
result_us_returns = get_wapes(**us_returns)

us_result_df = pd.DataFrame({
    "Company": result_us_prices.keys(),
    "WAPE (price)": result_us_prices.values(),
    "WAPE (return)": result_us_returns.values()
})

us_result_df.to_csv("wn_us.csv", index= False)

Company:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

AMZN: Test: 1.95137


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

MSFT: Test: 48.86101


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

GM: Test: 2.32108


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

UBER: Test: 3.16840


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Coca Cola: Test: 1.67915


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

EBAY: Test: 1.94171


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

DIS: Test: 1.72742


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

FORD: Test: 2.11005


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

NFLX: Test: 1.94619


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

NKE: Test: 1.48087


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

GE: Test: 2.41970


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

WMT: Test: 3.89498


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

INTC: Test: 1.95193


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

GOOG: Test: 1.42292


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

AAPL: Test: 1.80235


Company:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

AMZN: Test: 111.91950


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

MSFT: Test: 101.16503


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

GM: Test: 125.85751


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

UBER: Test: 106.98364


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Coca Cola: Test: 143.47063


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

EBAY: Test: 623.63052


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
ch_prices = {
    "path": "../Data/Chinese Companies/",
    "use_denoising": False,
    "prices_not_returns": True
}
ch_returns = {
    "path": "../Data/Chinese Companies/",
    "use_denoising": False,
    "prices_not_returns": False
}

result_ch_prices = get_wapes(**ch_prices)
result_ch_returns = get_wapes(**ch_returns)

ch_result_df = pd.DataFrame({
    "Company": result_ch_prices.keys(),
    "WAPE (price)": result_ch_prices.values(),
    "WAPE (return)": result_ch_returns.values()
})

ch_result_df.to_csv("wn_ch.csv", index= False)

# MSSA + WN

In [None]:
us_prices = {
    "path": "../Data/American Companies/",
    "use_denoising": True,
    "prices_not_returns": True
}
us_returns = {
    "path": "../Data/American Companies/",
    "use_denoising": True,
    "prices_not_returns": False
}

result_us_prices = get_wapes(**us_prices)
result_us_returns = get_wapes(**us_returns)

us_result_df = pd.DataFrame({
    "Company": result_us_prices.keys(),
    "WAPE (price)": result_us_prices.values(),
    "WAPE (return)": result_us_returns.values()
})

us_result_df.to_csv("mssa_wn_us.csv", index=False)

In [None]:
ch_prices = {
    "path": "../Data/Chinese Companies/",
    "use_denoising": True,
    "prices_not_returns": True
}
ch_returns = {
    "path": "../Data/Chinese Companies/",
    "use_denoising": True,
    "prices_not_returns": False
}

result_ch_prices = get_wapes(**ch_prices)
result_ch_returns = get_wapes(**ch_returns)

ch_result_df = pd.DataFrame({
    "Company": result_ch_prices.keys(),
    "WAPE (price)": result_ch_prices.values(),
    "WAPE (return)": result_ch_returns.values()
})

ch_result_df.to_csv("mssa_wn_ch.csv", index=False)