In [1]:
import os
import sys

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

#TODO: Why is this needed? os.environ['PYTHONPATH'] = module_path
module_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if module_path not in sys.path:
    os.environ['PYTHONPATH'] = module_path
    sys.path.append(module_path)

from src.train import train_model

In [2]:
target_variable = 'Flow_Kalltveit'
file_name = "cleaned_data_1.csv"
data_dir = "../data"
datetime_variable = "Datetime"
models = ["LSTM", "LSTMTemporalAttention", "LSTMSpatialTemporalAttention"] # Can be: "FCN", "FCNTemporalAttention", "LSTMTemporalAttention", "LSTM", "LSTMSpatialAttention", "LSTMSpatialTemporalAttention"
variables = [
            [],
            ["Wind_Speed_Nilsebu", "Air_Temperature_Nilsebu", "Wind_Direction_Nilsebu", "Relative_Humidity_Nilsebu", "Air_Temperature_Fister", "Precipitation_Fister", "Flow_Lyngsvatn_Overflow", "Flow_Tapping", "Water_Level_Kalltveit", "Water_Temperature_Kalltveit_Kum", "Precipitation_Nilsebu", "Flow_HBV", "Precipitation_HBV", "Temperature_HBV", "Flow_Without_Tapping_Kalltveit", "Flow_Lyngsaana", "Water_Temperature_Lyngsaana"],
        ]

In [3]:
config = {
    "data_file": file_name,
    "datetime":  datetime_variable,
    
    "data": {
        "target_variable": target_variable,
        "sequence_length": tune.choice([25]),
        "batch_size": tune.choice([256, 512]),
        "variables": tune.grid_search(variables)
    },

    "model": tune.grid_search(models), 
    "model_arch": {
        "input_size": None,
        "hidden_size": tune.choice([32, 64]),
        'num_layers': tune.choice([2, 3, 4]),
        "output_size": 1
    },

    "training": {
        "learning_rate": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.choice([0, 0.001, 0.0001]),
    },

    'num_epochs': tune.choice([30]),
}

scheduler = ASHAScheduler( # TODO: Find a scheduler that works better
    metric="val_loss",
    mode="min",
    max_t=100,
    grace_period=5,
    reduction_factor=2
)

reporter = tune.JupyterNotebookReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "num_epochs": "num_epochs"
        },
        metric_columns=[
            "train_loss", "val_loss", "test_loss", "training_iteration"
        ])

analysis = tune.run(
    train_model, # TODO: partial(train_cifar, data_dir=data_dir),
    resources_per_trial={"cpu": 12, "gpu": 1},
    config=config,
    num_samples=1,
    #scheduler=scheduler,
    progress_reporter=reporter,
    name="inflow_forecasting",
    
)#time_total_s require_attrs=False,

0,1
Current time:,2023-04-03 15:29:11
Running for:,00:01:40.83
Memory:,17.2/31.9 GiB

Trial name,status,loc,w_decay,lr,num_epochs,train_loss,val_loss,test_loss,training_iteration
train_model_48c40_00002,RUNNING,127.0.0.1:33200,,,30,3.2135,1.46142,0.0,25.0
train_model_48c40_00003,PENDING,,,,30,,,,
train_model_48c40_00004,PENDING,,,,30,,,,
train_model_48c40_00005,PENDING,,,,30,,,,
train_model_48c40_00000,TERMINATED,127.0.0.1:31304,,,30,3.14709,0.657771,1.1378,30.0
train_model_48c40_00001,TERMINATED,127.0.0.1:26016,,,30,18.2626,10.2031,2.06342,30.0


2023-04-03 15:27:29,033	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

  0%|          | 0/178 [00:00<?, ?it/s]
  1%|          | 1/178 [00:00<00:47,  3.71it/s]
 13%|█▎        | 24/178 [00:00<00:01, 80.83it/s]
 27%|██▋       | 48/178 [00:00<00:00, 131.77it/s]
 40%|███▉      | 71/178 [00:00<00:00, 162.70it/s]
 53%|█████▎    | 95/178 [00:00<00:00, 185.60it/s]
 67%|██████▋   | 119/178 [00:00<00:00, 200.76it/s]
 80%|███████▉  | 142/178 [00:00<00:00, 209.54it/s]
 93%|█████████▎| 165/178 [00:01<00:00, 195.37it/s]
100%|██████████| 178/178 [00:01<00:00, 152.88it/s]
  0%|          | 0/45 [00:00<?, ?it/s]


Trial name,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,test_loss,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,training_iteration,trial_id,val_loss,warmup_time
train_model_48c40_00000,2023-04-03_15-27-43,False,,e80fa45e92b348f0af87e8a9a01fe2ee,DESKTOP-D4IVECG,7,127.0.0.1,31304,True,0,9.66531,1.00763,9.66531,1680528463,0,,9.2766,7,48c40_00000,4.63035,0.00488329


100%|██████████| 45/45 [00:00<00:00, 426.57it/s]
  0%|          | 0/178 [00:00<?, ?it/s]
 11%|█         | 20/178 [00:00<00:00, 199.71it/s]
 24%|██▎       | 42/178 [00:00<00:00, 209.04it/s]
 37%|███▋      | 65/178 [00:00<00:00, 216.85it/s]
 49%|████▉     | 87/178 [00:00<00:00, 205.95it/s]
 61%|██████    | 109/178 [00:00<00:00, 209.25it/s]
 75%|███████▍  | 133/178 [00:00<00:00, 216.61it/s]
 88%|████████▊ | 157/178 [00:00<00:00, 221.95it/s]
100%|██████████| 178/178 [00:00<00:00, 217.28it/s]
  0%|          | 0/45 [00:00<?, ?it/s]
100%|██████████| 45/45 [00:00<00:00, 433.66it/s]
  0%|          | 0/178 [00:00<?, ?it/s]
 13%|█▎        | 24/178 [00:00<00:00, 231.14it/s]
 27%|██▋       | 48/178 [00:00<00:00, 234.76it/s]
 40%|████      | 72/178 [00:00<00:00, 233.13it/s]
 54%|█████▍    | 97/178 [00:00<00:00, 237.57it/s]
 82%|████████▏ | 146/178 [00:00<00:00, 239.93it/s]
 96%|█████████▌| 171/178 [00:00<00:00, 240.00it/s]
100%|██████████| 178/178 [00:00<00:00, 237.39it/s]
  0%|          | 0/45 [00:

In [4]:
print("Best config: ", analysis.get_best_config(
   metric="val_loss", mode="min"))
# Get a dataframe for analyzing trial results.
df = analysis.results_df

Best config:  {'data_file': 'cleaned_data_1.csv', 'datetime': 'Datetime', 'data': {'target_variable': 'Flow_Kalltveit', 'sequence_length': 25, 'batch_size': 256, 'variables': []}, 'model': 'LSTM', 'model_arch': {'input_size': None, 'hidden_size': 32, 'num_layers': 4, 'output_size': 1}, 'training': {'learning_rate': 0.002820517939530118, 'weight_decay': 0}, 'num_epochs': 30}


In [5]:
df[['config/model', 'train_loss', 'val_loss', 'test_loss', 'time_total_s', 'config/data/variables']].sort_values('test_loss')

Unnamed: 0_level_0,config/model,train_loss,val_loss,test_loss,time_total_s,config/data/variables
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
48c40_00005,LSTMSpatialTemporalAttention,0.503202,4.441513,0.610043,31.176463,"[Wind_Speed_Nilsebu, Air_Temperature_Nilsebu, ..."
48c40_00002,LSTMTemporalAttention,3.276931,0.924468,1.006252,33.128089,[]
48c40_00000,LSTM,3.147088,0.657771,1.137797,31.149705,[]
48c40_00001,LSTM,18.262605,10.203101,2.063421,29.478042,"[Wind_Speed_Nilsebu, Air_Temperature_Nilsebu, ..."
48c40_00003,LSTMTemporalAttention,44.775995,29.603649,6.140944,25.670277,"[Wind_Speed_Nilsebu, Air_Temperature_Nilsebu, ..."
48c40_00004,LSTMSpatialTemporalAttention,58.546838,56.685347,52.829769,34.561135,[]
