In [85]:
import config.load_data as load_data
from models import model

from tqdm import tqdm
import os.path
import sys
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from ray import tune
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.tune.schedulers import ASHAScheduler

import mlflow
from mlflow.tracking import MlflowClient

#### Creating the MLflow experiment

In [86]:
# Create a fresh MLflow experiment and a matching directory for model weights.
client = MlflowClient()
cwd = os.getcwd()  # captured here and passed to trials through `config["cwd"]`
exp_base_name = "All_models"

# Probe "All_models_0", "All_models_1", ... until MLflow accepts a name.
created = 0
last_error = None
for i in range(100):
    exp_name = exp_base_name+"_{}".format(i)
    try:
        experiment_id = client.create_experiment(exp_name)
    except (TypeError, mlflow.exceptions.MlflowException) as err:
        # Remember the failure so it can be shown if every candidate is rejected.
        last_error = err
        continue
    created = 1
    break

if not created:
    print("ERROR: Try new experiment name.")
    print("Last MLflow error:", last_error)
    sys.exit(1)

weights_root = "./model_weights/"
weights_dir = weights_root+exp_name+'/'
# makedirs also creates `weights_root` when it does not exist yet (plain
# os.mkdir would raise FileNotFoundError on a fresh checkout).
os.makedirs(weights_dir, exist_ok=True)

#### Data preparation

In [87]:
from sklearn.model_selection import train_test_split

def split_data(data, shuffle_train=False, train_size=0.7, test_size=0.3):
    """Split ``data`` into train, validation and test partitions.

    The split happens in two passes of ``train_test_split``:

    1. ``data`` is divided into a training part (``train_size``) and a
       remainder; ``shuffle_train`` is forwarded as the ``shuffle`` flag of
       this first pass.
    2. The remainder is divided, without shuffling, into validation and test
       parts, with ``test_size`` applied to the remainder (not to ``data``).

    Returns:
        A ``(train, val, test)`` tuple.
    """
    first_pass = train_test_split(data, train_size=train_size, shuffle=shuffle_train)
    train_part, remainder = first_pass
    val_part, test_part = train_test_split(remainder, test_size=test_size, shuffle=False)
    return train_part, val_part, test_part

def sliding_windows(data, seq_length):
    """Turn a sequence into (lagged window, next value) training pairs.

    For every start index ``i`` the input window is ``data[i:i+seq_length]``
    and the target is the element right after it, ``data[i+seq_length]``.
    Every position that has a following target is used, including the last
    one (``i = len(data) - seq_length - 1``).

    Args:
        data: Indexable sequence (e.g. a NumPy array) of observations.
        seq_length: Number of consecutive observations per window (>= 1).

    Returns:
        ``(x, y)`` float32 tensors. ``x`` has one row per window, with the
        window flattened into a single axis; ``y`` holds the matching targets.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or ``data`` is too
            short to produce at least one window/target pair.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be >= 1, got {}".format(seq_length))

    n_windows = len(data) - seq_length
    if n_windows <= 0:
        raise ValueError(
            "data of length {} is too short for seq_length {}".format(len(data), seq_length)
        )

    x = [data[i:i + seq_length] for i in range(n_windows)]
    y = [data[i + seq_length] for i in range(n_windows)]

    # Flatten each window so x is always 2-D: (n_windows, seq_length * features).
    features = torch.from_numpy(np.array(x).reshape(n_windows, -1)).float()
    targets = torch.from_numpy(np.array(y)).float()
    return features, targets


In [88]:
# Load the series and split it into the notebook-level `train`, `val` and
# `test` objects that get_lagged_data() reads (it accesses `.values` on each).
# NOTE(review): `load_data` is bound by `import config.load_data as load_data`
# in the import cell, yet it is called here — confirm what that name resolves
# to (module vs. callable) on a fresh kernel.
ld = load_data()

data = ld.get_univariate_data()
# Uses split_data() defaults: shuffle_train=False, train_size=0.7, test_size=0.3.
train, val, test = split_data(data)

In [89]:
def get_lagged_data(sequence_length, pca):
    """Build lagged train/val/test datasets from the notebook-level splits.

    Reads the globals ``train``, ``val`` and ``test`` (their ``.values``) and
    windows each of them with ``sliding_windows``.

    Args:
        sequence_length: Window length forwarded to ``sliding_windows``.
        pca: Truthy to standardise the windows and project them with
            ``PCA(n_components=0.95)``; falsy to use the raw windows.

    Returns:
        ``(train_ds, val_ds, test_ds)`` as ``TensorDataset`` objects pairing
        the (optionally reduced) windows with their targets.
    """
    X_train, y_train = sliding_windows(train.values, sequence_length)
    X_val, y_val = sliding_windows(val.values, sequence_length)
    X_test, y_test = sliding_windows(test.values, sequence_length)

    if pca:
        # Both transformers are fitted on the training windows only and then
        # applied unchanged to the validation and test windows.
        standardizer = StandardScaler()
        X_train = standardizer.fit_transform(X_train)
        X_val = standardizer.transform(X_val)
        X_test = standardizer.transform(X_test)

        # Named `reducer` so the boolean `pca` argument is not shadowed.
        reducer = PCA(n_components=0.95)
        X_train = reducer.fit_transform(X_train)
        X_val = reducer.transform(X_val)
        X_test = reducer.transform(X_test)

    # as_tensor accepts both tensors (no-PCA path) and NumPy arrays (PCA path)
    # without the extra copy/warning torch.tensor() produces for tensors.
    return (
        TensorDataset(torch.as_tensor(X_train), y_train),
        TensorDataset(torch.as_tensor(X_val), y_val),
        TensorDataset(torch.as_tensor(X_test), y_test),
    )

#### Training and testing method

In [90]:
def fit(net, loss_function, optimizer, data_loader, num_epochs, mode, use_amp=False):
    """Train ``net`` and report per-epoch losses to Ray Tune.

    Every epoch runs a training pass and a validation pass; the final epoch
    additionally runs a test pass. Losses are sample-weighted means over each
    loader. Metrics are sent with a single ``tune.report`` call per epoch:
    ``train_loss`` and ``val_loss`` always, plus ``test_loss`` on the final
    epoch only.

    Args:
        net: Model to optimise (already placed on ``mode["device"]``).
        loss_function: Callable ``(output, target) -> scalar loss tensor``.
        optimizer: Optimizer bound to ``net``'s parameters.
        data_loader: Dict with ``"train"``, ``"val"`` and ``"test"`` loaders
            yielding ``(X_batch, y_batch)`` pairs.
        num_epochs: Number of epochs to run.
        mode: Dict whose ``"device"`` entry is the target ``torch.device``.
        use_amp: Enable CUDA mixed precision (autocast + gradient scaling).

    Returns:
        The same ``net`` object after training.
    """
    device = mode["device"]
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)  # Mixed-precision support for compatible GPUs
    print("\nTraining the model:")
    for epoch in range(num_epochs):
        print("\nEpoch", epoch+1)
        is_last_epoch = epoch == num_epochs - 1
        keys = ["train", "val", "test"] if is_last_epoch else ["train", "val"]

        epoch_metrics = {}
        for key in keys:
            training = key == "train"
            net.train(training)  # train(False) is equivalent to eval()
            if key == "test":
                print("\nEvaluating the model:")

            dataset_size = 0
            dataset_loss = 0.0
            for X_batch, y_batch in tqdm(data_loader[key]):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                with torch.set_grad_enabled(mode=training):  # Autograd activated only during training
                    with torch.cuda.amp.autocast(enabled=use_amp):  # Mixed-precision support for compatible GPUs
                        batch_output = net(X_batch.float())
                        batch_loss = loss_function(batch_output, y_batch)
                    if training:
                        scaler.scale(batch_loss).backward()
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                # Weight by batch size so a smaller final batch does not skew the mean.
                dataset_size += y_batch.shape[0]
                dataset_loss += y_batch.shape[0] * batch_loss.item()
            dataset_loss /= dataset_size

            print(key, "loss:", dataset_loss)
            epoch_metrics[key + "_loss"] = dataset_loss

        # One report per epoch keeps each metric under its own name; in
        # particular `test_loss` is only ever the loss on the test loader.
        tune.report(**epoch_metrics)
    return net

In [91]:
def train_model(config):
    """Ray Tune trainable: build, train and save one model configuration.

    Args:
        config: Trial configuration dict. Keys read here:
            ``"arch"`` (one of ``"FCN"``, ``"LSTM"``, ``"TA_LSTM"``),
            ``"lr"``, ``"cwd"`` and ``"weights_dir"`` (required), plus
            ``"batch_size"`` (default ``512``) and ``"hidden_dim"``
            (default ``64``).

    Raises:
        ValueError: If ``config["arch"]`` is not a supported architecture.

    Side effects:
        Reports metrics through ``fit`` and writes the trained ``state_dict``
        to ``<cwd>/<weights_dir>/<key-value pairs of config>.pth``.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    mode = {"name": device_name, "device": torch.device(device_name)}

    num_epochs = 30
    # Honour the sampled hyperparameters; fall back to the former fixed values
    # when a config does not provide them.
    batch_size = config.get('batch_size', 128*4)
    lstm_hidden_dim = config.get('hidden_dim', 64)
    lr = config['lr']

    pca = 0
    in_dim = 72
    sequence_length = 6
    lstm_in_dim = 12  # int(in_dim/sequence_length)
    out_dim = 1

    arch = config['arch']
    supported_archs = ("FCN", "LSTM", "TA_LSTM")
    if arch not in supported_archs:
        raise ValueError(
            "Unknown arch {!r}; expected one of {}".format(arch, supported_archs)
        )

    # `in_dim` lags per sample are produced here.
    train_, val_, test_ = get_lagged_data(in_dim, pca)

    data_loader = {
        "train": torch.utils.data.DataLoader(train_, batch_size=batch_size, shuffle=True),
        "val": torch.utils.data.DataLoader(val_, batch_size=batch_size, shuffle=False),
        "test": torch.utils.data.DataLoader(test_, batch_size=batch_size, shuffle=False),
    }

    # All three architectures share the same constructor signature.
    net_class = getattr(model, arch)
    net = net_class(in_dim,
                    sequence_length,
                    lstm_in_dim,
                    lstm_hidden_dim,
                    out_dim,
                    mode,)
    net.to(mode["device"])

    loss_function = nn.MSELoss().to(mode["device"])
    optimizer = optim.Adam(net.parameters(), lr=lr)

    best_trained_model = fit(net, loss_function, optimizer, data_loader, num_epochs, mode)

    # File name encodes every config entry except the two path entries.
    out_name = "_".join(
        '{}-{}'.format(k, v)
        for k, v in config.items()
        if k not in ('weights_dir', 'cwd')
    )
    torch.save(best_trained_model.state_dict(), os.path.join(config['cwd'], config['weights_dir'], out_name + '.pth'))


#### Hyperparameter tuning with Ray Tune

In [92]:
import os
# Set before tune.run() is called. Not every tune.report() call made by fit()
# carries every metric (e.g. `test_loss`); presumably this flag stops Ray Tune
# from raising on results that lack a metric — confirm against the Ray docs.
os.environ['TUNE_DISABLE_STRICT_METRIC_CHECKING'] = "1"

In [93]:
# Trial configuration handed to train_model() by Ray Tune.
config = {
    # Bookkeeping entries created in the experiment-setup cell.
    "mlflow_experiment_id": experiment_id,
    # `weights_dir` and `cwd` locate the output file; train_model() leaves
    # both out of the saved weight file's name.
    "weights_dir": weights_dir,
    "cwd": cwd,
    # Search space.
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([128*2, 128*3, 128*4]),
    #"pca": tune.grid_search(['True', 'False']),
    #"in_dim": tune.grid_search([24, 24*2, 24*3]),
    "arch": tune.grid_search(["FCN", "LSTM", "TA_LSTM"]),
    "hidden_dim": tune.choice([32, 64, 128])
}
# The string literal below is a disabled ASHAScheduler configuration kept for
# reference; as a bare expression statement it has no effect.
"""scheduler = ASHAScheduler(
        metric='val_loss',
        mode="min",
        max_t=100,
        grace_period=1,
        reduction_factor=2,
)"""

# Launch the search. Each trial requests 12 CPUs and 1 GPU and is logged to
# the MLflow experiment named `exp_name`.
# NOTE(review): with `num_samples=2` and a 3-value grid on "arch", Ray Tune
# presumably runs 3 x 2 = 6 trials — confirm against the Ray Tune docs.
analysis = tune.run(
    train_model,
    config=config,
    resources_per_trial={"cpu": 12, "gpu": 1},
    num_samples=2,
    # scheduler=scheduler,
    callbacks=[MLflowLoggerCallback(experiment_name=exp_name)],
)

0,1
Current time:,2023-02-15 02:09:39
Running for:,00:19:29.92
Memory:,14.9/31.9 GiB

Trial name,status,loc,arch,batch_size,hidden_dim,lr,iter,total time (s),test_loss
train_model_b2b86_00008,RUNNING,127.0.0.1:24328,TA_LSTM,128,32,0.000459493,,,
train_model_b2b86_00009,PENDING,,FCN,256,64,0.000688196,,,
train_model_b2b86_00010,PENDING,,LSTM,256,32,0.0278487,,,
train_model_b2b86_00011,PENDING,,TA_LSTM,256,128,0.00102989,,,
train_model_b2b86_00012,PENDING,,FCN,384,128,0.000227072,,,
train_model_b2b86_00013,PENDING,,LSTM,384,32,0.00204117,,,
train_model_b2b86_00014,PENDING,,TA_LSTM,384,128,0.00554953,,,
train_model_b2b86_00015,PENDING,,FCN,512,64,0.00440045,,,
train_model_b2b86_00016,PENDING,,LSTM,512,32,0.00427039,,,
train_model_b2b86_00017,PENDING,,TA_LSTM,512,32,0.0787955,,,
