In [1]:
import torch
from torch import nn, optim
from Functions.SSA import SSA

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import numpy as np

import matplotlib.pyplot as plt

# Reproducibility: seed the torch, NumPy and Python `random` generators with the
# same fixed value, and configure cuDNN so it cannot introduce run-to-run variation.
torch.manual_seed(42)
# Ask cuDNN for deterministic algorithms only.
torch.backends.cudnn.deterministic = True
np.random.seed(42)
import random
random.seed(42)
# Turn off the cuDNN benchmark auto-tuner.
torch.backends.cudnn.benchmark = False
# Finally switch cuDNN off altogether.
torch.backends.cudnn.enabled = False

In [2]:
def wape(y, y_hat):
    """Weighted absolute percentage error, in percent.

    Computed as ``100 * sum(|y - y_hat|) / sum(|y|)``.

    Args:
        y: Array of actual values.
        y_hat: Array of predictions, broadcastable against ``y``.

    Returns:
        The WAPE as a scalar percentage.
    """
    total_abs_error = np.sum(np.abs(y - y_hat))
    total_abs_actual = np.sum(np.abs(y))
    return total_abs_error / total_abs_actual * 100

def mape(y, y_hat, eps= 1e-7):
    """Mean absolute percentage error, in percent.

    Each absolute error is divided by the magnitude of the actual value,
    floored at ``eps``. Flooring the magnitude (rather than adding ``eps`` to
    the signed value) keeps the denominator strictly positive for negative
    targets as well; ``y + eps`` is exactly zero when ``y == -eps``.

    Args:
        y: Array of actual values (may contain zeros or negative numbers).
        y_hat: Array of predictions, broadcastable against ``y``.
        eps: Smallest denominator magnitude allowed.

    Returns:
        The MAPE as a scalar percentage.
    """
    denominator = np.maximum(np.abs(y), eps)
    return np.mean(np.abs(y - y_hat) / denominator) * 100

def series_to_X_y(series, windows_size= 30):
    """Turn a 1-D series into sliding-window samples and next-step targets.

    Sample ``k`` holds ``series[k:k + windows_size]`` and its target is the
    single value that follows the window, ``series[k + windows_size]``.

    Args:
        series: Indexable, sliceable sequence of observations.
        windows_size: Number of consecutive observations per sample.

    Returns:
        A pair ``(X, y)``: ``X`` is an ``np.matrix`` with one window per row,
        ``y`` is an array with one single-element row per sample.
    """
    sample_count = len(series) - windows_size

    windows = [series[start:start + windows_size] for start in range(sample_count)]
    targets = [[series[start + windows_size]] for start in range(sample_count)]

    return np.asmatrix(windows), np.asarray(targets)

class MLP(nn.Module):
    """Fully connected network whose layer widths are given by ``neurons``.

    For every consecutive pair of widths a ``Linear`` layer is created,
    optionally preceded by ``BatchNorm1d`` on its input. Every layer except
    the last is followed by ``ReLU`` and, if requested, ``Dropout(0.2)``;
    the final ``Linear`` output is returned without activation.
    """

    def __init__(self, neurons, use_dropout= False, use_batchnorm= False):
        """
        Args:
            neurons: Layer widths, input width first and output width last.
            use_dropout: Add ``Dropout(0.2)`` after each hidden activation.
            use_batchnorm: Add ``BatchNorm1d`` in front of every linear layer.
        """
        super().__init__()

        final_index = len(neurons) - 1
        modules = []
        for index, (width_in, width_out) in enumerate(zip(neurons[:-1], neurons[1:]), start= 1):
            if use_batchnorm:
                modules.append(nn.BatchNorm1d(width_in))
            modules.append(nn.Linear(in_features= width_in, out_features= width_out))

            # The output layer gets neither activation nor dropout.
            if index == final_index:
                continue
            modules.append(nn.ReLU())
            if use_dropout:
                modules.append(nn.Dropout(0.2))

        self.fc = nn.Sequential(*modules)

    def forward(self, X):
        """Run ``X`` through the stacked layers and return the raw output."""
        return self.fc(X)

def get_ewma(data, window):
    """Loop-free exponentially weighted moving average of a 1-D array.

    With ``alpha = 2 / (window + 1)`` the result satisfies
    ``out[0] = data[0]`` and ``out[t] = alpha * data[t] + (1 - alpha) * out[t - 1]``,
    obtained through a cumulative sum of rescaled inputs.

    NOTE(review): the rescaling divides by powers of ``1 - alpha`` up to the
    input length, so very long inputs may overflow - confirm before reuse.

    Args:
        data: 1-D NumPy array of observations.
        window: Span used to derive the smoothing factor.

    Returns:
        Array of the same length as ``data`` holding the smoothed values.
    """
    smoothing = 2 /(window + 1.0)
    decay = 1-smoothing
    length = data.shape[0]

    # decay**0, decay**1, ..., decay**length
    decay_powers = decay**(np.arange(length+1))

    inverse_powers = 1/decay_powers[:-1]
    # Contribution of the seed value data[0] to every output position.
    seed_term = data[0]*decay_powers[1:]
    base_weight = smoothing*decay**(length-1)

    running_total = (data*base_weight*inverse_powers).cumsum()
    return seed_term + running_total*inverse_powers[::-1]

def get_all_ewma(y, alpha= 2, n= 5):
    """Bias-corrected exponentially weighted moving average at every position.

    With ``beta = 1 - alpha / (n + 1)`` the value at position ``t`` (0-based) is

        (1 - beta) * sum_{k=0..t} beta**k * y[t - k] / (1 - beta**(t + 1))

    i.e. a weighted mean of all observations up to ``t`` with geometrically
    decaying weights. The weighted sum is carried forward with the recurrence
    ``s_t = beta * s_{t-1} + y[t]``, so the whole series costs O(len(y))
    instead of rebuilding the weight vector and dot product at every step.

    Args:
        y: 1-D sequence of observations.
        alpha: Numerator of the smoothing factor ``alpha / (n + 1)``.
        n: Span of the average.

    Returns:
        NumPy array with one smoothed value per element of ``y``.

    Raises:
        ZeroDivisionError: If ``1 - beta**(t + 1)`` is zero at some position
            (for example ``alpha == 0``, which makes ``beta == 1``).
    """
    beta = 1 - alpha / (n + 1)

    smoothed = []
    weighted_sum = 0.0  # sum_k beta**k * y[t - k] for the current position t
    beta_power = 1.0    # beta**(t + 1) once updated for position t
    for value in y:
        weighted_sum = beta * weighted_sum + value
        beta_power *= beta
        # Normalises by the sum of the weights actually used so far.
        correction = 1.0 / (1.0 - beta_power)
        smoothed.append((1 - beta) * weighted_sum * correction)

    return np.array(smoothed)

In [3]:
def get_wapes(path, country, prices_not_returns, windows_size, output_size, neurons, metrics, use_dropout, use_batchnorm, epochs, val_size, test_size, device, use_denoising, use_ewma):
    """Train one MLP per CSV file in ``path`` and score it on the held-out tail.

    For every file the ``Open`` column is turned into sliding-window samples,
    split chronologically (no shuffling) into train / validation / test, and an
    ``MLP`` is trained full-batch with AdamW and MSE loss. The weights with the
    lowest validation loss are kept and evaluated on the test split.

    Args:
        path: Directory containing one CSV file per company.
        country: Label shown in the outer progress bar.
        prices_not_returns: If true, model ``Open`` prices; otherwise model
            their percentage changes (the first, undefined change is dropped).
        windows_size: Number of past observations fed to the network.
        output_size: Width of the network output layer.
        neurons: Hidden-layer widths (input and output widths are added here).
        metrics: Callable ``metrics(y_true, y_pred)`` returning a number.
        use_dropout: Forwarded to ``MLP``.
        use_batchnorm: Forwarded to ``MLP``.
        epochs: Number of full-batch optimisation steps per company.
        val_size: Fraction of the non-test samples used for validation.
        test_size: Fraction of all samples held out for testing.
        device: Torch device on which tensors and the model are placed.
        use_denoising: If true, network inputs are built from the output of
            ``SSA.multiple_stage_denoising``; targets always use the raw series.
            NOTE(review): denoising is applied to the whole series before the
            split - confirm this does not leak test-period information.
        use_ewma: If true, append EWMA features (spans 2..29) of the raw series
            to every input window.

    Returns:
        Dict mapping the company name (file name minus its last 14 characters)
        to the metric value on the test split.
    """
    neurons = [windows_size, *neurons, output_size]

    result_dict = dict()

    # Filter rather than list.remove(): remove() raises ValueError when the
    # macOS metadata file is not present in the directory.
    entries = [entry for entry in os.listdir(path) if entry != ".DS_Store"]

    pbar_company = tqdm(entries)
    for file in pbar_company:
        pbar_company.set_description(f"{country}: Epoch")
        df = pd.read_csv(os.path.join(path, file)).dropna()
        series = np.array(df.Open) if prices_not_returns else np.array(df.Open.pct_change())[1:]
        # Slicing an ndarray yields a view, so take a real copy of the raw series.
        series_initial = series.copy()

        if use_denoising:
            _, series = SSA.multiple_stage_denoising(series, max_iter= 50)

        # Inputs come from the (possibly denoised) series, targets from the raw one.
        X, _ = series_to_X_y(series, windows_size)
        _, y = series_to_X_y(series_initial, windows_size)

        if use_ewma:
            ewmas = np.asmatrix([get_all_ewma(series_initial, 2, i) for i in range(2, 30)])
            # Column windows_size - 1 + k is the EWMA at the last observation of
            # window k, so each sample only sees values that precede its target.
            X = np.concatenate([X, ewmas[:, windows_size - 1:-1].T], axis= 1)
            # Input width = window plus EWMA features; same value for every file.
            neurons[0] = X.shape[1]

        X = torch.from_numpy(X).to(device).to(torch.float32)
        y = torch.from_numpy(y).to(device).to(torch.float32)

        # Chronological split: the test tail first, then validation from the rest.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, shuffle= False)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= val_size, shuffle= False)

        X_train = X_train.to(device)
        X_val = X_val.to(device)
        X_test = X_test.to(device)

        y_train = y_train.to(device)
        y_val = y_val.to(device)
        y_test = y_test.to(device)

        model = MLP(neurons, use_dropout= use_dropout, use_batchnorm= use_batchnorm).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.AdamW(model.parameters(), lr= 1e-3, amsgrad= True)

        # Best weights are snapshotted in memory; no checkpoint file is written,
        # so a stale or missing file on disk can never be loaded by mistake.
        best_state = None
        best_val_loss = torch.inf

        # NOTE(review): assumes every file name ends in a 14-character suffix - confirm.
        company = file[:-14]

        pbar_train = tqdm(range(epochs), desc= "Epoch")
        for epoch in pbar_train:
            model.train()
            optimizer.zero_grad()
            train_loss = criterion(model(X_train), y_train)
            train_loss.backward()
            optimizer.step()

            model.eval()
            with torch.inference_mode():
                val_loss = criterion(model(X_val), y_val).item()

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() tensors share storage with the live parameters,
                # so clone them before the next optimizer step overwrites them.
                best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

            pbar_train.set_description(f"{company}: Epoch: {epoch + 1}/{epochs}. Train loss: {train_loss.item():.5f} Val loss: {val_loss:.5f}")

        # No epoch improved on infinity (e.g. epochs == 0 or NaN losses):
        # fall back to the weights the model currently has.
        if best_state is None:
            best_state = model.state_dict()

        # A fresh instance is still created, as before, so the amount of torch
        # RNG consumed per company (and thus later initialisations) is unchanged.
        model = MLP(neurons, use_dropout= use_dropout, use_batchnorm= use_batchnorm).to(device)
        model.load_state_dict(best_state)
        model.eval()

        with torch.inference_mode():
            y_hat = model(X_test).detach().cpu().numpy()
        y_test = y_test.detach().cpu().numpy()
        metrics_val = metrics(y_test, y_hat)

        result_dict[company] = metrics_val

        print(f"{company}: Test error: {metrics_val:.2f}%")


    return result_dict


In [8]:
def _make_params(path, country, prices_not_returns, neurons, epochs):
    """Build one keyword-argument dict for ``get_wapes``.

    Only the data directory, country label, price/return switch, hidden-layer
    widths and epoch count differ between experiments; every other setting is
    shared and spelled out once here.
    """
    return {
        "path": path,
        "country": country,
        "prices_not_returns": prices_not_returns,

        "windows_size": 100,
        "output_size": 1,
        "neurons": neurons,

        "metrics": wape,

        "use_dropout": False,
        "use_batchnorm": False,

        "epochs": epochs,
        "val_size": 0.2,
        "test_size": 0.1,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "use_denoising": False,
        "use_ewma": True
    }


# US companies: raw prices, then returns.
params_us_prices = _make_params("../Data/American Companies", "US", True, [8, 8], 10_000)

params_us_returns = _make_params("../Data/American Companies", "US", False, [6], 60)

# Chinese companies: raw prices, then returns.
params_ch_prices = _make_params("../Data/Chinese Companies", "CH", True, [8, 8], 10_000)

params_ch_returns = _make_params("../Data/Chinese Companies", "CH", False, [6], 60)

In [None]:
# Run the price experiment on the US data set: one model is trained per file in
# the configured directory and the result maps company name -> test metric (WAPE, %).
result_dict_prices_us = get_wapes(**params_us_prices)

In [6]:
result_dict_prices_us

{'AMZN': 3.041609190404415,
 'MSFT': 1.986631378531456,
 'GM': 2.8394917026162148,
 'UBER': 3.497477248311043,
 'Coca Cola': 1.2191321700811386,
 'EBAY': 1.9702063873410225,
 'DIS': 1.3135253451764584,
 'FORD': 3.2642636448144913,
 'NFLX': 93.50230097770691,
 'NKE': 2.1333755925297737,
 'GE': 2.2267334163188934,
 'WMT': 1.2006881646811962,
 'INTC': 2.100864052772522,
 'GOOG': 2.256414294242859,
 'AAPL': 1.8608694896101952}

In [None]:
# result_dict_returns_us = get_wapes(**params_us_returns)

In [9]:
# Run the price experiment on the Chinese data set: one model is trained per file in
# the configured directory and the result maps company name -> test metric (WAPE, %).
result_dict_prices_ch = get_wapes(**params_ch_prices)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

China Shenhua: Test error: 2.89%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Agricultural Bank of China: Test error: 4.83%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Wanhua Chemical Group: Test error: 2.89%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Anhui Coonch Cement: Test error: 1.83%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

CITIC Securities: Test error: 1.45%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Kweichow Moutai: Test error: 2.75%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

China Duty Free Group: Test error: 3.38%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Ping An: Test error: 82.18%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

China Pacific Insurance: Test error: 1.47%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

PetroChina: Test error: 2.50%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

SAIC Motor: Test error: 1.90%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

China Merchants Bank: Test error: 2.16%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Sinopec: Test error: 2.55%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

Hengrui Medicine: Test error: 2.58%


Epoch:   0%|          | 0/10000 [00:00<?, ?it/s]

China Life Insurance Company Limited: Test error: 2.18%


In [10]:
result_dict_prices_ch

{'China Shenhua': 2.891355939209461,
 'Agricultural Bank of China': 4.832610860466957,
 'Wanhua Chemical Group': 2.8875431045889854,
 'Anhui Coonch Cement': 1.829676702618599,
 'CITIC Securities': 1.4518539421260357,
 'Kweichow Moutai': 2.7544038370251656,
 'China Duty Free Group': 3.38163822889328,
 'Ping An': 82.18486309051514,
 'China Pacific Insurance': 1.4706388115882874,
 'PetroChina': 2.497771754860878,
 'SAIC Motor': 1.899942196905613,
 'China Merchants Bank': 2.1610205993056297,
 'Sinopec': 2.5486962869763374,
 'Hengrui Medicine': 2.5800123810768127,
 'China Life Insurance Company Limited': 2.1849919110536575}

In [None]:
# result_dict_returns_ch = get_wapes(**params_ch_returns)

In [11]:
# Tabulate the per-company test WAPE of the price experiments, one frame per market.
# The returns column stays disabled because the returns experiments are not run
# in this notebook (their calls are commented out).
us_result = pd.DataFrame({
    "Company": result_dict_prices_us.keys(),
    "WAPE (price)": result_dict_prices_us.values(),
    # "WAPE (return)": result_dict_returns_us.values()
})

ch_result = pd.DataFrame({
    "Company": result_dict_prices_ch.keys(),
    "WAPE (price)": result_dict_prices_ch.values(),
    # "WAPE (return)": result_dict_returns_ch.values()
})

In [12]:
us_result

Unnamed: 0,Company,WAPE (price)
0,AMZN,3.041609
1,MSFT,1.986631
2,GM,2.839492
3,UBER,3.497477
4,Coca Cola,1.219132
5,EBAY,1.970206
6,DIS,1.313525
7,FORD,3.264264
8,NFLX,93.502301
9,NKE,2.133376


In [13]:
round(us_result.describe(), 2)

Unnamed: 0,WAPE (price)
count,15.0
mean,8.29
std,23.58
min,1.2
25%,1.92
50%,2.13
75%,2.94
max,93.5


In [14]:
ch_result

Unnamed: 0,Company,WAPE (price)
0,China Shenhua,2.891356
1,Agricultural Bank of China,4.832611
2,Wanhua Chemical Group,2.887543
3,Anhui Coonch Cement,1.829677
4,CITIC Securities,1.451854
5,Kweichow Moutai,2.754404
6,China Duty Free Group,3.381638
7,Ping An,82.184863
8,China Pacific Insurance,1.470639
9,PetroChina,2.497772


In [15]:
round(ch_result.describe(), 2)

Unnamed: 0,WAPE (price)
count,15.0
mean,7.84
std,20.58
min,1.45
25%,2.03
50%,2.55
75%,2.89
max,82.18


In [16]:
# Persist both result tables as CSV files in the current working directory,
# without writing the row index.
us_result.to_csv("ewma_mlp_us_price.csv", index= False)
ch_result.to_csv("ewma_mlp_ch_price.csv", index= False)