In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import pandas as pd

from ray import tune
from ray.air import session
from ray.tune.schedulers import ASHAScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

from pathlib import Path

from src.models import *
from src.data import Data
from src.train import fit

In [2]:
# Notebook-level settings that are fed into the Tune search space.
# Location of the data directory (not referenced by the visible tuning code
# other than in a TODO note).
data_dir = "../data"
# Passed to the trainable as config["data_file"].
file_name = "cleaned_data_1.csv"
# Passed to the trainable as config["datetime"].
datetime_variable = "Datetime"
# Passed to the trainable as config["target_variable"].
target_variable = "Flow_Kalltveit"

In [3]:
def train_model(config, checkpoint_dir=None):
    """Ray Tune trainable: build data loaders and a model from ``config`` and train it.

    Args:
        config: Hyperparameter dictionary for a single trial. Keys read here:
            ``arch``, ``sequence_length``, ``batch_size``, ``num_epochs``,
            ``learning_rate``, ``weigth_decay`` (the misspelling is the actual
            key), ``data_file``, ``datetime``, ``variables``,
            ``target_variable``, ``hidden_size`` and ``num_layers``.
        checkpoint_dir: Optional directory holding a ``checkpoint`` file that
            contains a ``(model_state, optimizer_state)`` tuple to resume from.

    Raises:
        ValueError: If ``config['arch']`` is not a supported architecture name.
    """
    # Validate the architecture before any data is loaded so that a typo in
    # the search space fails fast with a clear message instead of surfacing
    # later as an unbound ``net``.
    supported_archs = (
        "FCN",
        "FCNTemporalAttention",
        "LSTM",
        "LSTMTemporalAttention",
        "LSTMSpatialAttention",
        "LSTMSpatialTemporalAttention",
    )
    arch = config['arch']
    if arch not in supported_archs:
        raise ValueError(
            f"Unknown architecture {arch!r}; expected one of {', '.join(supported_archs)}"
        )

    use_GPU = torch.cuda.is_available()
    if use_GPU:
        mode = {"name": "cuda", "device": torch.device("cuda")}
    else:
        mode = {"name": "cpu", "device": torch.device("cpu")}

    # Fixed dataset split fractions
    train_size = 0.7
    val_size = 0.2
    test_size = 0.1

    # Hyperparameters supplied by the tuner
    sequence_length = config['sequence_length']
    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    lr = config['learning_rate']
    weight_decay = config['weigth_decay']

    # Set data file
    data_file = config['data_file']
    datetime_variable = config['datetime']

    data = Data(data_file, datetime_variable)

    # Select variables to use (named to avoid shadowing the builtin ``vars``)
    feature_columns = config['variables']
    target_variable = config['target_variable']
    X, y = data.data_transformation(
        sequence_length=sequence_length,
        target_variable=target_variable,
        columns_to_transformation=feature_columns,
    )

    # Split the data; only the training loader is shuffled
    X_train, y_train, X_val, y_val, X_test, y_test = data.split_data(
        X, y, train_size=train_size, val_size=val_size, test_size=test_size
    )
    train_dataloader = data.create_dataloader(X_train, y_train, sequence_length, batch_size=batch_size, shuffle=True)
    val_dataloader = data.create_dataloader(X_val, y_val, sequence_length, batch_size=batch_size, shuffle=False)
    test_dataloader = data.create_dataloader(X_test, y_test, sequence_length, batch_size=batch_size, shuffle=False)

    # Model inputs: one input feature, plus one per extra selected variable
    if feature_columns:
        input_size = len(feature_columns) + 1
    else:
        input_size = 1
    hidden_size = config['hidden_size']
    num_layers = config['num_layers']
    output_size = 1

    # Every architecture takes the same four positional constructor arguments,
    # so a name -> class table replaces the if/elif chain.
    model_classes = {
        "FCN": FCN,
        "FCNTemporalAttention": FCNTemporalAttention,
        "LSTM": LSTM,
        "LSTMTemporalAttention": LSTMTemporalAttention,
        "LSTMSpatialAttention": LSTMSpatialAttention,
        "LSTMSpatialTemporalAttention": LSTMSpatialTemporalAttention,
    }
    net = model_classes[arch](input_size, hidden_size, num_layers, output_size)

    data_loader = {
        "train": train_dataloader,
        "val": val_dataloader,
        "test": test_dataloader,
    }

    net.to(mode["device"])

    loss_function = nn.MSELoss().to(mode["device"])
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    if checkpoint_dir:
        # map_location lets a checkpoint written on a GPU worker be resumed
        # on whichever device this trial is running on.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=mode["device"],
        )
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # NOTE(review): use_amp=True is passed even when running on CPU —
    # confirm that fit() handles that case.
    fit(net, loss_function, optimizer, data_loader, num_epochs, mode, checkpoint_dir, use_amp=True)

In [4]:
# Disable Ray Tune's strict metric checking: the author notes that validation
# loss is reported first (presumably not every report carries every metric —
# confirm against src.train.fit).
os.environ.update(TUNE_DISABLE_STRICT_METRIC_CHECKING="1")

In [5]:
from functools import partial
from ray.tune.schedulers import PopulationBasedTraining


# Search space for the experiment. grid_search entries are expanded
# exhaustively; choice/loguniform entries are sampled per trial.
# NOTE: "weigth_decay" is misspelled, but train_model reads exactly this key,
# so the spelling must stay in sync between the two.
config = {
    "data_file": file_name,
    "datetime":  datetime_variable,
    "target_variable": target_variable,
    "arch": tune.grid_search(["LSTM"]), # "FCN", "FCNTemporalAttention", "LSTMTemporalAttention", "LSTM", "LSTMSpatialAttention", "LSTMSpatialTemporalAttention"
    "sequence_length": tune.choice([25]),
    'num_epochs': tune.choice([30]),
    'num_layers': tune.choice([2, 3, 4]),
    "learning_rate": tune.loguniform(1e-4, 1e-1),
    "weigth_decay": tune.choice([0, 0.001, 0.0001]),
    "batch_size": tune.choice([256, 512]),
    "hidden_size": tune.choice([32, 64]),
    "variables": tune.grid_search([
        None,
    ])
}


# Alternative scheduler, currently disabled and kept for reference:
# scheduler = PopulationBasedTraining(
#     time_attr="training_iteration",
#     metric="val_loss",
#     mode="min",
#     perturbation_interval=2,
#     # Because I do report validation loss first
#     require_attrs=False,
#     hyperparam_mutations={
#         "weigth_decay": tune.uniform(0.0, 0.3),
#         "learning_rate": tune.uniform(1e-5, 5e-5),
#     },
#     )


# Early-stopping scheduler that minimises the reported "val_loss".
scheduler = ASHAScheduler(
    metric="val_loss",
    mode="min",
    max_t=100,
    grace_period=5,
    reduction_factor=2
)

reporter = tune.JupyterNotebookReporter(
        parameter_columns={
            # The key must match the config key exactly (including its
            # spelling); with "weight_decay" the w_decay column stayed empty.
            "weigth_decay": "w_decay",
            "learning_rate": "lr",
            "num_epochs": "num_epochs"
        },
        metric_columns=[
            "train_loss", "val_loss", "test_loss", "epoch", "training_iteration"
        ])

# Launch the experiment; `analysis` is used by later cells to inspect results.
analysis = tune.run(
    train_model, # TODO: partial(train_cifar, data_dir=data_dir),
    resources_per_trial={"cpu": 12, "gpu": 1},
    config=config,
    num_samples=1,
    scheduler=scheduler,
    progress_reporter=reporter,
    name="inflow_forecasting",
    verbose=1  # integer verbosity level; equivalent to the previous True
)

0,1
Current time:,2023-04-02 18:03:27
Running for:,00:00:27.42
Memory:,12.0/31.9 GiB

Trial name,status,loc,w_decay,lr,num_epochs,test_loss,training_iteration
train_model_d71a9_00000,TERMINATED,127.0.0.1:29252,,0.00017598,30,22.8667,61


2023-04-02 18:02:58,472	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

  0%|          | 0/89 [00:00<?, ?it/s]
  1%|          | 1/89 [00:00<00:40,  2.19it/s]
 21%|██▏       | 19/89 [00:00<00:01, 44.31it/s]
 42%|████▏     | 37/89 [00:00<00:00, 78.35it/s]
100%|██████████| 89/89 [00:01<00:00, 83.72it/s] 
  0%|          | 0/23 [00:00<?, ?it/s]
100%|██████████| 23/23 [00:00<00:00, 261.02it/s]
  0%|          | 0/89 [00:00<?, ?it/s]
 18%|█▊        | 16/89 [00:00<00:00, 152.95it/s]
 37%|███▋      | 33/89 [00:00<00:00, 160.77it/s]
 57%|█████▋    | 51/89 [00:00<00:00, 165.40it/s]
 76%|███████▋  | 68/89 [00:00<00:00, 166.77it/s]
100%|██████████| 89/89 [00:00<00:00, 166.39it/s]
  0%|          | 0/23 [00:00<?, ?it/s]
100%|██████████| 23/23 [00:00<00:00, 278.52it/s]
  0%|          | 0

In [6]:
# Show the configuration of the trial with the lowest "val_loss".
print(
    "Best config: ",
    analysis.get_best_config(metric="val_loss", mode="min"),
)
# Keep the trial results as a DataFrame for inspection in later cells.
df = analysis.results_df

Best config:  {'data_file': 'cleaned_data_1.csv', 'datetime': 'Datetime', 'target_variable': 'Flow_Kalltveit', 'arch': 'LSTM', 'sequence_length': 25, 'num_epochs': 30, 'num_layers': 4, 'learning_rate': 0.00017598044440355847, 'weigth_decay': 0, 'batch_size': 512, 'hidden_size': 32, 'variables': None}


100%|██████████| 13/13 [00:00<00:00, 267.77it/s]


In [7]:
# Display the trial results DataFrame obtained from analysis.results_df.
df

Unnamed: 0_level_0,test_loss,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,...,config/target_variable,config/arch,config/sequence_length,config/num_epochs,config/num_layers,config/learning_rate,config/weigth_decay,config/batch_size,config/hidden_size,config/variables
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d71a9_00000,22.866741,0.051553,True,,,61,551d462ccca3435e9d1a005c4803b710,2023-04-02_18-03-27,1680451407,23.883509,...,Flow_Kalltveit,LSTM,25,30,4,0.000176,0,512,32,


In [8]:
"""
    "model": {
        "hidden_size": tune.choice([32, 64, 128]),
        "num_layers": tune.choice([1, 2, 3]),
    },

    model = YourModel(**config["model"]).to(device)
"""

'\n    "model": {\n        "hidden_size": tune.choice([32, 64, 128]),\n        "num_layers": tune.choice([1, 2, 3]),\n    },\n\n    model = YourModel(**config["model"]).to(device)\n'