In [1]:
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Reproducibility: seed every random number generator the notebook relies on
# (Python's `random`, NumPy, PyTorch) with the same fixed value.
import random

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# cuDNN settings: switch cuDNN off, turn off its benchmark mode, and request
# deterministic algorithms.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
# Directory containing the input CSV files; a later cell picks one of them by
# its position in `entries`.
path = "../Data/American Companies/"

# os.listdir() returns names in arbitrary, platform-dependent order, so the
# listing is sorted to make positional indexing into `entries` reproducible.
# Hidden files (such as the macOS ".DS_Store") are filtered out instead of
# being removed with list.remove(), which raises ValueError when the file
# does not exist (e.g. on any non-macOS machine).
entries = sorted(name for name in os.listdir(path) if not name.startswith("."))

In [3]:
# Read the second file of the listing and drop every row that has a missing value.
df = pd.read_csv(path + entries[1]).dropna()
# Every column except "Date" and "Volume" is kept as a feature.
data = df.drop(columns=["Date", "Volume"])
# Plain NumPy matrix (rows = observations, columns = features) for scaling.
x = data.to_numpy()

In [4]:
# Fit the scaler on the training rows only. Fitting it on the whole series
# leaks the minimum/maximum of the validation and test periods into the
# features the model is trained on (look-ahead bias).
# The two chained 15% hold-outs (shuffle=False) mirror the split that the next
# cell applies to `x_tr`, so `x_fit` is exactly the leading block of rows that
# ends up in the training set. Keep both places in sync if the ratios change.
x_fit = train_test_split(x, test_size=0.15, shuffle=False)[0]
x_fit = train_test_split(x_fit, test_size=0.15, shuffle=False)[0]

scaler = MinMaxScaler()
scaler.fit(x_fit)

# All rows are transformed with the training-period statistics, so values from
# the validation/test periods may fall outside the [0, 1] range.
x_tr = scaler.transform(x)

In [5]:
# Chronological split (shuffle=False keeps the row order): the trailing 15% of
# the rows form the test set, then the trailing 15% of the remainder form the
# validation set; everything before that is used for training.
X_train_val, X_test = train_test_split(x_tr, test_size=0.15, shuffle=False)
X_train, X_val = train_test_split(X_train_val, test_size=0.15, shuffle=False)

In [6]:
def series_to_X_y(series, k):
    """Turn a multivariate series into supervised (window, next row) pairs.

    For every position ``end`` from ``k`` to ``len(series) - 1`` the target is
    the row ``series[end]`` and the input is the ``k`` rows that precede it,
    stored newest first (``series[end - 1]`` down to ``series[end - k]``).

    Args:
        series: 2-D array indexed as ``series[t, :]`` (row = time step).
        k: number of preceding rows in each input window.

    Returns:
        Tuple ``(X, y)`` of float32 tensors; for a 2-D ``series`` with more
        than ``k`` rows, ``X`` has shape ``(len(series) - k, k, n_columns)``
        and ``y`` has shape ``(len(series) - k, n_columns)``.
    """
    # NOTE(review): windows are newest-first, so a model that reads the window
    # left to right sees the oldest row last - confirm this is intended.
    target_positions = range(k, len(series))

    windows = [
        [series[end - lag, :] for lag in range(1, k + 1)]
        for end in target_positions
    ]
    targets = [series[end, :] for end in target_positions]

    X = torch.from_numpy(np.asarray(windows, dtype=np.float32))
    y = torch.from_numpy(np.asarray(targets, dtype=np.float32))
    return X, y

In [7]:
# Number of preceding time steps fed to the model for each prediction.
lag_size = 11

# Each split is windowed on its own, so no window spans two splits.
# NOTE: this cell overwrites X_train / X_val / X_test with the windowed
# tensors; re-run the split cell before re-running this one.
X_train, y_train = series_to_X_y(series=X_train, k=lag_size)
X_val, y_val = series_to_X_y(series=X_val, k=lag_size)
X_test, y_test = series_to_X_y(series=X_test, k=lag_size)

# Display the shapes of all six tensors.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

(torch.Size([6682, 11, 5]),
 torch.Size([6682, 5]),
 torch.Size([1171, 11, 5]),
 torch.Size([1171, 5]),
 torch.Size([1379, 11, 5]),
 torch.Size([1379, 5]))

In [8]:
class TimeSeriesDataset(torch.utils.data.Dataset):
    """Dataset that serves ``(X[i], y[i])`` pairs from two aligned containers.

    Args:
        X: inputs, indexable along their first axis.
        y: targets, indexable along their first axis; its length defines the
            length of the dataset.
    """

    def __init__(self, X, y):
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the targets."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, item):
        """Return the ``(input, target)`` pair stored at index ``item``."""
        sample = self.X[item]
        target = self.y[item]
        return sample, target

In [9]:
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)

In [10]:
# Mini-batch size shared by all three loaders.
batch_size = 256
barch_size = batch_size  # backward-compatible alias for the original misspelled name

# Number of input features = size of the last axis of the windowed inputs.
features = X_train.shape[-1]
# Hidden width of the recurrent layer, derived from the feature count.
hidden_size = int(0.7 * features) + 2
num_layers = 1

# shuffle=False everywhere: batches are served in the stored (chronological) order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
class TimeSeriesModel(nn.Module):
    """GRU followed by a ReLU + linear head that predicts one feature vector.

    Args:
        features: size of each input time step and of the prediction.
        hidden_size: hidden width of the GRU.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, features, hidden_size, num_layers):
        super().__init__()
        self.features, self.hidden_size, self.num_layers = features, hidden_size, num_layers

        # batch_first=True: inputs are laid out as (batch, time, features).
        gru_config = dict(
            input_size=features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.recurrent_block = nn.GRU(**gru_config)

        # Head: non-linearity, then projection back to the feature dimension.
        self.fc = nn.Sequential(nn.ReLU(), nn.Linear(hidden_size, features))

    def forward(self, X):
        """Run the GRU over ``X`` and map its last time-step output through the head."""
        sequence_out, _ = self.recurrent_block(X)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [12]:
# Everything runs on the CPU; the automatic GPU selection is left disabled.
device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
epochs = 3000

# Constructor arguments are kept in a dict so that the same architecture can
# be rebuilt later when the best checkpoint is reloaded.
params = dict(features=features, hidden_size=hidden_size, num_layers=num_layers)

model = TimeSeriesModel(**params)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, amsgrad=False)

In [None]:
model = model.to(device)
best_val_loss = torch.inf

pbar = tqdm(range(epochs), desc= "Epoch")
for epoch in pbar:
    # ---- training pass: one optimizer step per mini-batch ----
    model.train()
    for X_train_bt, y_train_bt in train_loader:
        X_train_bt = X_train_bt.to(device)
        y_train_bt = y_train_bt.to(device)

        optimizer.zero_grad()
        train_preds = model(X_train_bt)
        # The training objective is RMSE (square root of the MSE criterion).
        train_loss = criterion(train_preds, y_train_bt) ** 0.5
        train_loss.backward()
        optimizer.step()

    # ---- validation pass ----
    # Batches are collected in lists and concatenated once; concatenating
    # inside the loop would re-copy all previous batches on every iteration.
    val_pred_batches = []
    val_y_batches = []

    model.eval()
    with torch.inference_mode():
        for X_val_bt, y_val_bt in val_loader:
            X_val_bt = X_val_bt.to(device)
            y_val_bt = y_val_bt.to(device)

            val_preds = model(X_val_bt)

            val_pred_batches.append(val_preds.cpu())
            val_y_batches.append(y_val_bt.cpu())

        val_prediction_full = torch.cat(val_pred_batches, dim=0)
        val_y_full = torch.cat(val_y_batches, dim=0)

        # RMSE over the whole validation set (not a mean of per-batch RMSEs),
        # stored as a plain Python float.
        avg_val_loss = (criterion(val_prediction_full, val_y_full) ** 0.5).item()

    # Keep a checkpoint of the weights with the lowest validation RMSE so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pt")

    pbar.set_description(f"Epoch: {epoch + 1}/{epochs} Val: {avg_val_loss:.5f} Best val: {best_val_loss:.5f}")


Epoch:   0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# Rebuild the architecture and restore the weights of the best checkpoint.
model = TimeSeriesModel(**params)
# map_location places the stored tensors on the configured device regardless of
# the device they were saved from.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
# A freshly constructed model lives on the CPU; move it to `device` so that it
# matches the batches, which the evaluation cell moves to `device`.
model = model.to(device)

In [None]:
# Per-batch results are collected in lists and concatenated once at the end;
# concatenating inside the loop would re-copy all earlier batches every time.
test_pred_batches = []
test_y_batches = []

model.eval()
with torch.inference_mode():
    test_pbar = tqdm(test_loader, desc= "Test batch", total= len(test_loader))
    for X_test_bt, y_test_bt in test_pbar:
        X_test_bt = X_test_bt.to(device)
        y_test_bt = y_test_bt.to(device)

        test_preds = model(X_test_bt)

        test_pred_batches.append(test_preds.detach().cpu().numpy())
        test_y_batches.append(y_test_bt.detach().cpu().numpy())

# NumPy arrays with one row per test window, in loader order.
test_prediction_full = np.concatenate(test_pred_batches, axis= 0)
test_y_full = np.concatenate(test_y_batches, axis= 0)

In [None]:
def wape(y_hat, y):
    """Weighted absolute percentage error, computed along axis 0.

    Args:
        y_hat: predicted values.
        y: actual values, same shape as ``y_hat``.

    Returns:
        ``100 * sum(|y - y_hat|) / sum(|y|)`` with both sums taken over
        axis 0, i.e. one percentage per column for 2-D inputs.
    """
    total_abs_error = np.sum(np.abs(y - y_hat), axis=0)
    total_abs_actual = np.sum(np.abs(y), axis=0)
    return total_abs_error / total_abs_actual * 100

# WAPE divides by sum(|y|), so it is not invariant to the shift applied by the
# MinMaxScaler: on scaled values the denominator becomes sum(|y - min|) in
# original units and the percentage no longer describes the actual series.
# Map predictions and targets back to the original units before scoring.
test_prediction_orig = scaler.inverse_transform(test_prediction_full)
test_y_orig = scaler.inverse_transform(test_y_full)

wape_error = wape(test_prediction_orig, test_y_orig)
print(f"Test WAPE error: {wape_error}%")

In [None]:
# Column names of the feature frame; `x` was built from `data` in this column
# order, so they label the per-column WAPE values printed above.
data.columns