In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import pandas as pd

from ray import tune
from ray.air import session
from ray.tune.schedulers import ASHAScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

from pathlib import Path

from src.models import *
from src.data import Data
from src.train import fit

In [2]:
# --- Dataset location -------------------------------------------------------
data_dir = "../data"
file_name = "cleaned_data_1.csv"

# --- Column names inside the dataset ----------------------------------------
# Name of the timestamp column.
datetime_variable = "Datetime"
# Name of the column the models are trained to predict.
target_variable = "Flow_Kalltveit"

In [3]:
def train_model(config, checkpoint_dir=None):
    """Build the data pipeline and network for one trial and train it.

    This function is passed to ``tune.run`` as the trainable, so ``config``
    is one sampled point of the search space.

    Args:
        config: Mapping of trial settings. Keys read here:
            ``'arch'``, ``'sequence_length'``, ``'batch_size'``,
            ``'num_epochs'``, ``'learning_rate'``, ``'weigth_decay'``
            (spelled as in the search space), ``'data_file'``, ``'datetime'``,
            ``'variables'``, ``'target_variable'``, ``'hidden_size'`` and
            ``'num_layers'``.
        checkpoint_dir: Optional directory holding a file named
            ``"checkpoint"`` that contains a ``(model_state, optimizer_state)``
            pair. When given, both states are restored before training.
            It is also forwarded to ``fit``.

    Raises:
        ValueError: If ``config['arch']`` is not a supported architecture name.
    """
    # Resolve the architecture first so that an unsupported name fails with a
    # clear error before any data is loaded (previously `net` was simply left
    # undefined and the failure surfaced later as a NameError).
    arch = config['arch']
    if arch == "FCN":
        model_class = FCN
    elif arch == "FCNTemporalAttention":
        model_class = FCNTemporalAttention
    elif arch == "LSTM":
        model_class = LSTM
    elif arch == "LSTMTemporalAttention":
        model_class = LSTMTemporalAttention
    elif arch == "LSTMSpatialAttention":
        model_class = LSTMSpatialAttention
    elif arch == "LSTMSpatialTemporalAttention":
        model_class = LSTMSpatialTemporalAttention
    else:
        raise ValueError(
            f"Unknown architecture {arch!r}; expected one of: FCN, "
            "FCNTemporalAttention, LSTM, LSTMTemporalAttention, "
            "LSTMSpatialAttention, LSTMSpatialTemporalAttention"
        )

    # Train on the GPU when one is visible to this process, otherwise on CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    mode = {"name": device_name, "device": torch.device(device_name)}

    # Fractions of the data used for the train / validation / test splits.
    train_size = 0.7
    val_size = 0.2
    test_size = 0.1

    # Hyperparameters sampled for this trial.
    sequence_length = config['sequence_length']
    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    lr = config['learning_rate']
    # NOTE: the key is misspelled in the search space; keep it in sync there.
    weight_decay = config['weigth_decay']

    # Load the data file.
    data = Data(config['data_file'], config['datetime'])

    # Select the extra input columns (None or empty means "no extra columns").
    feature_columns = config['variables']
    target_variable = config['target_variable']
    X, y = data.data_transformation(
        sequence_length=sequence_length,
        target_variable=target_variable,
        columns_to_transformation=feature_columns,
    )

    # Split the data; only the training loader is shuffled.
    X_train, y_train, X_val, y_val, X_test, y_test = data.split_data(
        X, y, train_size=train_size, val_size=val_size, test_size=test_size
    )
    data_loader = {
        "train": data.create_dataloader(X_train, y_train, sequence_length, batch_size=batch_size, shuffle=True),
        "val": data.create_dataloader(X_val, y_val, sequence_length, batch_size=batch_size, shuffle=False),
        "test": data.create_dataloader(X_test, y_test, sequence_length, batch_size=batch_size, shuffle=False),
    }

    # Model inputs: one feature per selected column plus one more
    # (presumably the target series itself -- confirm against
    # Data.data_transformation).
    input_size = len(feature_columns) + 1 if feature_columns else 1
    hidden_size = config['hidden_size']
    num_layers = config['num_layers']
    output_size = 1

    net = model_class(input_size, hidden_size, num_layers, output_size)
    net.to(mode["device"])

    loss_function = nn.MSELoss().to(mode["device"])
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    if checkpoint_dir:
        # map_location lets a checkpoint written on a GPU be restored on a
        # CPU-only worker (and vice versa).
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=mode["device"],
        )
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    fit(net, loss_function, optimizer, data_loader, num_epochs, mode, checkpoint_dir, use_amp=True)

In [4]:
from functools import partial
from ray.tune.schedulers import PopulationBasedTraining

# Architectures compared in this sweep. train_model also accepts "FCN" and
# "FCNTemporalAttention"; add them here to include them.
arch_candidates = [
    "LSTM",
    "LSTMTemporalAttention",
    "LSTMSpatialAttention",
    "LSTMSpatialTemporalAttention",
]

# Extra input columns offered to the model in the "all variables" setting.
exogenous_variables = [
    "Wind_Speed_Nilsebu",
    "Air_Temperature_Nilsebu",
    "Wind_Direction_Nilsebu",
    "Relative_Humidity_Nilsebu",
    "Air_Temperature_Fister",
    "Precipitation_Fister",
    "Flow_Lyngsvatn_Overflow",
    "Flow_Tapping",
    "Water_Level_Kalltveit",
    "Water_Temperature_Kalltveit_Kum",
    "Precipitation_Nilsebu",
    "Flow_HBV",
    "Precipitation_HBV",
    "Temperature_HBV",
    "Flow_Without_Tapping_Kalltveit",
    "Flow_Lyngsaana",
    "Water_Temperature_Lyngsaana",
]

# Search space handed to Ray Tune. grid_search entries are enumerated
# exhaustively; choice / loguniform entries are sampled per trial.
config = {
    # Fixed settings (same for every trial).
    "data_file": file_name,
    "datetime": datetime_variable,
    "target_variable": target_variable,
    # Swept settings.
    "arch": tune.grid_search(arch_candidates),
    "sequence_length": tune.choice([25]),
    "num_epochs": tune.choice([30]),
    "num_layers": tune.choice([2, 3, 4]),
    "learning_rate": tune.loguniform(1e-4, 1e-1),
    "weigth_decay": tune.choice([0, 0.001, 0.0001]),
    "batch_size": tune.choice([256, 512]),
    "hidden_size": tune.choice([32, 64]),
    # Either no extra columns (None) or the full list above.
    "variables": tune.grid_search([None, exogenous_variables]),
}
# Early-stopping scheduler that minimises the reported "val_loss".
# TODO: Find a scheduler that works better
asha_settings = dict(
    metric="val_loss",
    mode="min",
    max_t=100,
    grace_period=5,
    reduction_factor=2,
)
scheduler = ASHAScheduler(**asha_settings)

# Progress table shown in the notebook: hyperparameter column -> short header.
reported_parameters = {
    "weigth_decay": "w_decay",
    "learning_rate": "lr",
    "num_epochs": "num_epochs",
}
# Metric columns shown next to the hyperparameters.
reported_metrics = ["train_loss", "val_loss", "test_loss", "training_iteration"]

reporter = tune.JupyterNotebookReporter(
    parameter_columns=reported_parameters,
    metric_columns=reported_metrics,
)

# Per-trial resource request, capped by what this machine actually has.
# A request the host cannot satisfy (no GPU, or fewer than 12 cores) means the
# trials can never be scheduled, even though train_model itself falls back to
# the CPU. On a host with >= 12 cores and a CUDA device this evaluates to
# {"cpu": 12, "gpu": 1}.
trial_resources = {
    "cpu": min(12, os.cpu_count() or 1),
    "gpu": 1 if torch.cuda.is_available() else 0,
}

# Launch the sweep; `analysis` holds the results of all trials.
analysis = tune.run(
    train_model,  # TODO: partial(train_cifar, data_dir=data_dir),
    resources_per_trial=trial_resources,
    config=config,
    num_samples=1,  # one sample per grid-search combination
    # scheduler=scheduler,  # early stopping is currently disabled
    progress_reporter=reporter,
    name="inflow_forecasting",
)  # time_total_s require_attrs=False,

0,1
Current time:,2023-04-02 20:17:28
Running for:,00:00:08.57
Memory:,13.3/31.9 GiB

Trial name,status,loc,w_decay,lr,num_epochs,train_loss,val_loss,test_loss,training_iteration
train_model_9b4cb_00000,RUNNING,127.0.0.1:6140,0.001,0.00749158,30,45.4932,15.6453,0.0,1.0
train_model_9b4cb_00001,PENDING,,0.0,0.000141852,30,,,,
train_model_9b4cb_00002,PENDING,,0.0001,0.000529995,30,,,,
train_model_9b4cb_00003,PENDING,,0.001,0.00944205,30,,,,
train_model_9b4cb_00004,PENDING,,0.0,0.0880001,30,,,,
train_model_9b4cb_00005,PENDING,,0.001,0.000547444,30,,,,
train_model_9b4cb_00006,PENDING,,0.0,0.0101135,30,,,,
train_model_9b4cb_00007,PENDING,,0.001,0.00358019,30,,,,
train_model_9b4cb_00008,PENDING,,0.0,0.0205189,30,,,,
train_model_9b4cb_00009,PENDING,,0.0001,0.0652741,30,,,,


2023-04-02 20:17:18,485	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

  0%|          | 0/178 [00:00<?, ?it/s]
  1%|          | 1/178 [00:00<00:53,  3.28it/s]
 10%|█         | 18/178 [00:00<00:02, 55.27it/s]
 18%|█▊        | 32/178 [00:00<00:01, 81.27it/s]
 28%|██▊       | 50/178 [00:00<00:01, 110.61it/s]
 38%|███▊      | 67/178 [00:00<00:00, 128.20it/s]
 48%|████▊     | 86/178 [00:00<00:00, 145.97it/s]
 60%|█████▉    | 106/178 [00:00<00:00, 160.42it/s]
 71%|███████   | 126/178 [00:01<00:00, 169.19it/s]
 83%|████████▎ | 147/178 [00:01<00:00, 178.88it/s]
 94%|█████████▍| 168/178 [00:01<00:00, 186.68it/s]
100%|██████████| 178/178 [00:01<00:00, 128.93it/s]
  0%|          | 0/45 [00:00<?, ?it/s]


2023-04-02 20:17:28,595	ERROR tune.py:794 -- Trials did not complete: [train_model_9b4cb_00000, train_model_9b4cb_00001, train_model_9b4cb_00002, train_model_9b4cb_00003, train_model_9b4cb_00004, train_model_9b4cb_00005, train_model_9b4cb_00006, train_model_9b4cb_00007, train_model_9b4cb_00008, train_model_9b4cb_00009, train_model_9b4cb_00010, train_model_9b4cb_00011]
2023-04-02 20:17:28,595	INFO tune.py:798 -- Total run time: 8.62 seconds (8.56 seconds for the tuning loop).


100%|██████████| 45/45 [00:00<00:00, 254.11it/s]
  0%|          | 0/178 [00:00<?, ?it/s]
  9%|▉         | 16/178 [00:00<00:01, 157.86it/s]


In [None]:
# Configuration of the trial with the smallest reported validation loss.
best_config = analysis.get_best_config(metric="val_loss", mode="min")
print("Best config: ", best_config)

# Results of every trial as a dataframe, for further inspection.
df = analysis.results_df

NameError: name 'analysis' is not defined

In [None]:
# Display the per-trial results table taken from `analysis.results_df`.
df

Unnamed: 0_level_0,train_loss,val_loss,test_loss,time_this_iter_s,should_checkpoint,done,timesteps_total,episodes_total,training_iteration,experiment_id,...,config/target_variable,config/arch,config/sequence_length,config/num_epochs,config/num_layers,config/learning_rate,config/weigth_decay,config/batch_size,config/hidden_size,config/variables
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4590b_00000,20.018083,9.753485,13.834575,1.010038,True,True,,,30,885b61288c2d421b97179ed96ab87ec0,...,Flow_Kalltveit,LSTM,25,30,3,0.077368,0.0001,256,32,
4590b_00001,8.418127,1.8867,5.245848,1.107213,True,True,,,30,ab9b712dfee54d7b90d29f5bbddd1f24,...,Flow_Kalltveit,LSTMTemporalAttention,25,30,2,0.000381,0.0,256,32,
4590b_00002,1.208316,0.276047,0.378798,1.316164,True,True,,,30,8fd698226f1e470d87ba22256b6d9ffa,...,Flow_Kalltveit,LSTM,25,30,4,0.000599,0.0001,256,64,"[Air_Temperature_Fister, Precipitation_Fister]"
4590b_00003,1.032035,0.958731,0.712596,1.257438,True,True,,,30,d999283521424638a4414d1069294d0c,...,Flow_Kalltveit,LSTMTemporalAttention,25,30,4,0.007805,0.001,256,32,"[Air_Temperature_Fister, Precipitation_Fister]"


In [None]:
# Scratch notes kept as bare string literals; nothing in this cell is executed
# as code. Because the last literal is the cell's final expression, its repr is
# echoed as the cell output.

# Idea: nest the model hyperparameters under a "model" key of the search space
# and unpack them straight into the model constructor.
"""
    "model": {
        "hidden_size": tune.choice([32, 64, 128]),
        "num_layers": tune.choice([1, 2, 3]),
    },

    model = YourModel(**config["model"]).to(device)
"""

# Idea: alternative scheduler (Population Based Training) that would perturb
# the weight decay and learning rate during training.
"""
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="val_loss",
    mode="min",
    perturbation_interval=2,
    hyperparam_mutations={
        "weigth_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
    },
)
"""


'\nscheduler = PopulationBasedTraining(\n    time_attr="training_iteration",\n    metric="val_loss",\n    mode="min",\n    perturbation_interval=2,\n    hyperparam_mutations={\n        "weigth_decay": tune.uniform(0.0, 0.3),\n        "learning_rate": tune.uniform(1e-5, 5e-5),\n    },\n)\n'