In [None]:
import pandas as pd
import numpy as np
import os
from dataset_utils import split_data, TimeSeriesDataset, min_max_normalization
from model.lstm import LSTMMdel
from model.nlinear import NLinear
from model.SegRNN import SegRNN
from model.PatchTST import PatchTST,Config
from train_utils import Trainer
from evaluate_utils import Evaluator
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import json
from json_utils import NpEncoder
from typing import List, Tuple
from sklearn.preprocessing import MinMaxScaler

In [None]:
# ---- Experiment configuration -------------------------------------------
# Directory whose entries are enumerated by get_data_file_list(); each entry
# is read as a CSV by the driver cell at the bottom of the notebook.
DATASET_PATH = "../../dataset"
# CSV column names handed to read_dataset(): the index column and the
# value column that is modelled.
INDEX_FIELD = "timestamp"
DATA_FIELD = "num_request"
# Results are written under RESULT_ROOT_PATH/MODEL_NAME/ by save_results().
RESULT_ROOT_PATH="results"
# Selects the model built in learn_on_csv(); the branches there handle
# "lstm", "nlinear", "segrnn" and "patchtst". Uncomment exactly one.
MODEL_NAME="lstm"
# MODEL_NAME="nlinear"
# MODEL_NAME="segrnn"
# MODEL_NAME="patchtst"

# Passed to TimeSeriesDataset and to the model constructors
# (presumably the input window length and the forecast horizon, in samples —
# TODO confirm against dataset_utils.TimeSeriesDataset).
N_LOOKBACK = 4
N_PREDICT = 2
# Number of epochs passed to Trainer.
N_EPOCHS = 40
# Batch size used for the train, validation and test DataLoaders.
BATCH_SIZE=16
# Learning rate for the AdamW optimizer.
LR = 1e-3
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Only referenced by the commented-out MultiStepLR scheduler in
# learn_on_csv(); unused while lr_scheduler is None.
SCHEDULER_MILESTONE = [30, 35]
SCHEDULER_GAMMA = 0.3

In [None]:
def get_data_file_list(dataset_path: str) -> List[str]:
    """Return the names of the data files stored directly in ``dataset_path``.

    Only regular, non-hidden files are returned: sub-directories (for example
    Jupyter's ``.ipynb_checkpoints``) and dotfiles (``.DS_Store``,
    ``.gitkeep``) are skipped because the caller feeds every entry to a CSV
    reader. The result is sorted because ``os.listdir`` makes no ordering
    guarantee, which would otherwise make runs non-reproducible.

    Args:
        dataset_path: Directory to scan (not recursive).

    Returns:
        Sorted list of bare file names (not full paths).

    Raises:
        OSError: Propagated from ``os.listdir`` if the directory cannot be
            read (e.g. it does not exist).
    """
    return sorted(
        entry
        for entry in os.listdir(dataset_path)
        if not entry.startswith(".")
        and os.path.isfile(os.path.join(dataset_path, entry))
    )


In [None]:
def read_dataset(csv_path: str,index_field:str,data_field:str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a CSV file and extract two of its columns as NumPy arrays.

    Args:
        csv_path: Path of the CSV file to read.
        index_field: Name of the column returned as the first array.
        data_field: Name of the column returned as the second array.

    Returns:
        ``(index_values, data_values)``, each the result of ``to_numpy()`` on
        the corresponding column.

    Raises:
        KeyError: If either column name is missing from the file.
    """
    frame = pd.read_csv(csv_path)
    index_values = frame[index_field].to_numpy()
    data_values = frame[data_field].to_numpy()
    return index_values, data_values

In [None]:
def _build_model(model_name: str):
    """Instantiate the model selected by ``model_name`` and move it to ``DEVICE``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "lstm":
        model = LSTMMdel(1, 64, N_PREDICT, 1)
    elif model_name == "nlinear":
        model = NLinear(N_LOOKBACK, N_PREDICT)
    elif model_name == "segrnn":
        model = SegRNN(seq_len=N_LOOKBACK, pred_len=N_PREDICT, enc_in=1, d_model=64, dropout=0.5, rnn_type="lstm", dec_way="rmf", seg_len=1, channel_id=False, revin=False)
    elif model_name == "patchtst":
        model = PatchTST(configs=Config(enc_in=1, seq_len=N_LOOKBACK, pred_len=N_PREDICT, e_layers=1, n_heads=4, d_model=16, d_ff=16, dropout=0.5, fc_dropout=0.5, head_dropout=0.5, individual=False, patch_len=1, stride=1, padding_patch=False, revin=False, affine=False, subtract_last=False, decomposition=False, kernel_size=1))
    else:
        # Without this branch `model` would stay unbound and the failure would
        # surface later as an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported MODEL_NAME %r; expected one of 'lstm', 'nlinear', 'segrnn', 'patchtst'" % (model_name,)
        )
    return model.to(DEVICE)


def learn_on_csv(csv_path: str) -> Tuple[Evaluator, MinMaxScaler, Evaluator, MinMaxScaler, Evaluator, MinMaxScaler]:
    """Train the configured model on one CSV file and evaluate it on every split.

    The ``DATA_FIELD`` column is reshaped to a single-feature column vector,
    split with ``split_data(..., 0.6, 0.2, 0.2)``, normalized per split, and
    wrapped in ``TimeSeriesDataset`` windows. The model chosen by
    ``MODEL_NAME`` is trained with AdamW and MSE loss for ``N_EPOCHS`` epochs,
    then evaluated on the train, validation and test loaders.

    Args:
        csv_path: Path of the CSV file to learn on.

    Returns:
        ``(train_evaluator, train_scaler, val_evaluator, val_scaler,
        test_evaluator, test_scaler)`` — each evaluator has already had
        ``evaluate()`` called; each scaler is the one returned by
        ``min_max_normalization`` for that split.

    Raises:
        ValueError: If ``MODEL_NAME`` is not a supported model name.
    """
    # The index column is not needed for training; only the values are used.
    _, np_data = read_dataset(csv_path, INDEX_FIELD, DATA_FIELD)
    np_data = np_data.reshape((-1, 1))  # (n_samples, 1): one feature per step
    train_set, val_set, test_set = split_data(np_data, 0.6, 0.2, 0.2)

    # NOTE(review): every split gets its own scaler, so validation/test inputs
    # are mapped with a different min/max than the data the model was trained
    # on. Consider normalizing val/test with the train scaler — confirm the
    # intended protocol and the contract of min_max_normalization first.
    train_set, train_scaler = min_max_normalization(train_set)
    val_set, val_scaler = min_max_normalization(val_set)
    test_set, test_scaler = min_max_normalization(test_set)

    train_dataset = TimeSeriesDataset(train_set, N_LOOKBACK, N_PREDICT)
    val_dataset = TimeSeriesDataset(val_set, N_LOOKBACK, N_PREDICT)
    test_dataset = TimeSeriesDataset(test_set, N_LOOKBACK, N_PREDICT)

    # shuffle=False: batches are drawn in dataset order, also during training.
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = _build_model(MODEL_NAME)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, SCHEDULER_MILESTONE, SCHEDULER_GAMMA)
    lr_scheduler = None
    loss_fn = nn.MSELoss()

    trainer = Trainer(model, train_dataloader, loss_fn, optimizer, N_EPOCHS, lr_scheduler, DEVICE)
    trainer.train()

    # Evaluate the trained model on all three splits with the same loss.
    train_evaluator = Evaluator(model, train_dataloader, loss_fn, DEVICE)
    train_evaluator.evaluate()
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    val_evaluator = Evaluator(model, val_dataloader, loss_fn, DEVICE)
    test_evaluator = Evaluator(model, test_dataloader, loss_fn, DEVICE)
    val_evaluator.evaluate()
    test_evaluator.evaluate()
    return train_evaluator, train_scaler, val_evaluator, val_scaler, test_evaluator, test_scaler

In [None]:
def save_results(x: np.ndarray, file_name: str):
    """Write ``x`` as indented JSON to ``RESULT_ROOT_PATH/MODEL_NAME/file_name``.

    The target directory is created if it does not exist yet. Serialization
    uses ``NpEncoder`` so that NumPy values can be encoded.

    Args:
        x: Data to serialize.
        file_name: Name of the output file inside the model's result directory.
    """
    save_dir = os.path.join(RESULT_ROOT_PATH, MODEL_NAME)
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test.
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, file_name), "w") as f:
        json.dump(x, f, indent=4, cls=NpEncoder)

In [None]:
def save_scaled_results(train_evaluator: Evaluator, val_evaluator: Evaluator,  test_evaluator: Evaluator, file_name: str):
    """Save ground truth and predictions of every split in normalized scale.

    For each of the train/val/test evaluators, ``get_gt()`` and ``get_pd()``
    are written via ``save_results`` to
    ``<stem>_gt_<split>_scaled.json`` and ``<stem>_pd_<split>_scaled.json``,
    where ``<stem>`` is ``file_name`` without its final extension.

    Args:
        train_evaluator: Evaluator holding the training-split results.
        val_evaluator: Evaluator holding the validation-split results.
        test_evaluator: Evaluator holding the test-split results.
        file_name: Name of the source data file; its stem prefixes the outputs.
    """
    # splitext strips only the last extension, so names such as "a.v1.csv"
    # and "a.v2.csv" no longer collapse onto the same "a" prefix.
    stem = os.path.splitext(file_name)[0]

    # Collect everything first so no file is written if a getter fails.
    outputs = []
    for split_name, evaluator in (("train", train_evaluator), ("val", val_evaluator), ("test", test_evaluator)):
        outputs.append((evaluator.get_gt(), stem + "_gt_" + split_name + "_scaled.json"))
        outputs.append((evaluator.get_pd(), stem + "_pd_" + split_name + "_scaled.json"))

    for values, out_name in outputs:
        save_results(values, out_name)

In [None]:
def _inverse_scale(scaler: MinMaxScaler, scaled: np.ndarray) -> np.ndarray:
    """Map ``scaled`` back to the original value range, one axis-1 slice at a time.

    Each ``scaled[:, i]`` slice is passed through ``scaler.inverse_transform``;
    the results are stacked horizontally and a trailing axis is appended.
    NOTE(review): this presumably expects ``scaled`` to be
    (n_windows, n_predict, 1) so every slice is 2-D — confirm against
    Evaluator.get_gt()/get_pd().
    """
    restored = [scaler.inverse_transform(scaled[:, i_dim]) for i_dim in range(scaled.shape[1])]
    return np.expand_dims(np.hstack(restored), -1)


def save_original_results(train_evaluator: Evaluator, train_scaler: MinMaxScaler, val_evaluator: Evaluator, val_scaler: MinMaxScaler, test_evaluator: Evaluator, test_scaler: MinMaxScaler, file_name: str):
    """Save ground truth and predictions of every split in the original scale.

    For each split, ``get_gt()`` and ``get_pd()`` are inverse-transformed with
    that split's scaler and written via ``save_results`` to
    ``<stem>_gt_<split>_original.json`` and ``<stem>_pd_<split>_original.json``,
    where ``<stem>`` is ``file_name`` without its final extension.

    Args:
        train_evaluator: Evaluator holding the training-split results.
        train_scaler: Scaler used to normalize the training split.
        val_evaluator: Evaluator holding the validation-split results.
        val_scaler: Scaler used to normalize the validation split.
        test_evaluator: Evaluator holding the test-split results.
        test_scaler: Scaler used to normalize the test split.
        file_name: Name of the source data file; its stem prefixes the outputs.
    """
    # splitext strips only the last extension, so names such as "a.v1.csv"
    # and "a.v2.csv" no longer collapse onto the same "a" prefix.
    stem = os.path.splitext(file_name)[0]

    splits = (
        ("train", train_evaluator, train_scaler),
        ("val", val_evaluator, val_scaler),
        ("test", test_evaluator, test_scaler),
    )

    # Fetch all scaled arrays, then convert all of them, before writing
    # anything — a failure leaves no partial set of result files behind.
    scaled = [(name, evaluator.get_gt(), evaluator.get_pd(), scaler) for name, evaluator, scaler in splits]

    outputs = []
    for split_name, gt_scaled, pd_scaled, scaler in scaled:
        outputs.append((_inverse_scale(scaler, gt_scaled), stem + "_gt_" + split_name + "_original.json"))
        outputs.append((_inverse_scale(scaler, pd_scaled), stem + "_pd_" + split_name + "_original.json"))

    for values, out_name in outputs:
        save_results(values, out_name)

In [None]:
# Driver: train, evaluate and persist results for every file in DATASET_PATH.
data_file_list = get_data_file_list(DATASET_PATH)
for file_name in data_file_list:
    print(f"learning on {file_name}")
    csv_path = os.path.join(DATASET_PATH, file_name)
    (
        train_evaluator, train_scaler,
        val_evaluator, val_scaler,
        test_evaluator, test_scaler,
    ) = learn_on_csv(csv_path)
    # Persist results both in normalized scale and mapped back to raw values.
    save_scaled_results(train_evaluator, val_evaluator, test_evaluator, file_name)
    save_original_results(
        train_evaluator, train_scaler,
        val_evaluator, val_scaler,
        test_evaluator, test_scaler,
        file_name,
    )