In [1]:
import os
import sys

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

#TODO: Why is this needed? os.environ['PYTHONPATH'] = module_path
module_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if module_path not in sys.path:
    os.environ['PYTHONPATH'] = module_path
    sys.path.append(module_path)

from src.train import train_model

In [2]:
target_variable = 'Flow_Kalltveit'
file_name = "cleaned_data_1.csv"
data_dir = "../data"
datetime_variable = "Datetime"
models = ["LSTM", "LSTMTemporalAttention", "LSTMSpatialTemporalAttention"] # Can be: "FCN", "FCNTemporalAttention", "LSTMTemporalAttention", "LSTM", "LSTMSpatialAttention", "LSTMSpatialTemporalAttention"
variables = [
            [],
            ["Wind_Speed_Nilsebu", "Air_Temperature_Nilsebu", "Wind_Direction_Nilsebu", "Relative_Humidity_Nilsebu", "Air_Temperature_Fister", "Precipitation_Fister", "Flow_Lyngsvatn_Overflow", "Flow_Tapping", "Water_Level_Kalltveit", "Water_Temperature_Kalltveit_Kum", "Precipitation_Nilsebu", "Flow_HBV", "Precipitation_HBV", "Temperature_HBV", "Flow_Without_Tapping_Kalltveit", "Flow_Lyngsaana", "Water_Temperature_Lyngsaana"],
        ]

In [3]:
config = {
    "data_file": file_name,
    "datetime":  datetime_variable,
    
    "data": {
        "target_variable": target_variable,
        "sequence_length": tune.choice([25]),
        "batch_size": tune.choice([256, 512]),
        "variables": tune.grid_search(variables)
    },

    "model": tune.grid_search(models), 
    "model_arch": {
        "input_size": None,
        "hidden_size": tune.choice([32, 64]),
        'num_layers': tune.choice([2, 3, 4]),
        "output_size": 1
    },

    "training": {
        "learning_rate": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.choice([0, 0.001, 0.0001]),
    },

    'num_epochs': tune.choice([30]),
}

scheduler = ASHAScheduler( # TODO: Find a scheduler that works better
    metric="val_loss",
    mode="min",
    max_t=100,
    grace_period=5,
    reduction_factor=2
)

reporter = tune.JupyterNotebookReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "num_epochs": "num_epochs"
        },
        metric_columns=[
            "train_loss", "val_loss", "test_loss", "training_iteration"
        ])

analysis = tune.run(
    train_model, # TODO: partial(train_cifar, data_dir=data_dir),
    resources_per_trial={"cpu": 12, "gpu": 1},
    config=config,
    num_samples=1,
    #scheduler=scheduler,
    progress_reporter=reporter,
    name="inflow_forecasting",
    
)#time_total_s require_attrs=False,

0,1
Current time:,2023-04-03 15:22:44
Running for:,00:02:14.44
Memory:,17.2/31.9 GiB

Trial name,# failures,error file
train_model_4e5ef_00001,1,"C:\Users\magnu\ray_results\inflow_forecasting\train_model_4e5ef_00001_1_batch_size=256,sequence_length=25,variables=Wind_Speed_Nilsebu_Air_Temperature_Nilsebu_Wind_Direction_Ni_2023-04-03_15-21-03\error.txt"
train_model_4e5ef_00003,1,"C:\Users\magnu\ray_results\inflow_forecasting\train_model_4e5ef_00003_3_batch_size=512,sequence_length=25,variables=Wind_Speed_Nilsebu_Air_Temperature_Nilsebu_Wind_Direction_Ni_2023-04-03_15-21-46\error.txt"

Trial name,status,loc,w_decay,lr,num_epochs,train_loss,val_loss,test_loss,training_iteration
train_model_4e5ef_00004,RUNNING,127.0.0.1:33924,,,30,9.75538,2.32979,0.0,27.0
train_model_4e5ef_00005,PENDING,,,,30,,,,
train_model_4e5ef_00000,TERMINATED,127.0.0.1:34104,,,30,3.21642,0.690981,1.11758,30.0
train_model_4e5ef_00002,TERMINATED,127.0.0.1:34524,,,30,5.99232,1.21717,3.74463,30.0
train_model_4e5ef_00001,ERROR,127.0.0.1:26644,,,30,,,,
train_model_4e5ef_00003,ERROR,127.0.0.1:33204,,,30,,,,


2023-04-03 15:20:28,920	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

  0%|          | 0/178 [00:00<?, ?it/s]
  1%|          | 1/178 [00:00<00:46,  3.77it/s]
 29%|██▉       | 52/178 [00:00<00:00, 143.73it/s]
 43%|████▎     | 77/178 [00:00<00:00, 176.68it/s]
 57%|█████▋    | 101/178 [00:00<00:00, 195.60it/s]
 71%|███████   | 126/178 [00:00<00:00, 209.85it/s]
 84%|████████▍ | 150/178 [00:00<00:00, 218.65it/s]
 98%|█████████▊| 175/178 [00:00<00:00, 226.32it/s]
100%|██████████| 178/178 [00:01<00:00, 164.52it/s]
  0%|          | 0/45 [00:00<?, ?it/s]


Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,test_loss,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,training_iteration,trial_id,val_loss,warmup_time
train_model_4e5ef_00000,2023-04-03_15-21-03,True,,52d96ba2d8ce4a0098155cba03f1ae58,"0_batch_size=256,sequence_length=25,variables=,model=LSTM,hidden_size=32,num_layers=3,num_epochs=30,learning_rate=0.0092,weight_decay=0.0001",DESKTOP-D4IVECG,30.0,127.0.0.1,34104,True,1.1175845822339112,29.440264463424683,0.8973898887634277,29.440264463424683,1680528063,0.0,,3.216424705628988,30.0,4e5ef_00000,0.6909809308396446,0.0040631294250488
train_model_4e5ef_00001,2023-04-03_15-21-07,,,56a7fb73b5f24b7ab4d1e620cf532808,,DESKTOP-D4IVECG,,127.0.0.1,26644,,,,,,1680528067,,,,,4e5ef_00001,,
train_model_4e5ef_00002,2023-04-03_15-21-45,True,,1ee7b01369774bb98e3862455ecfcd0a,"2_batch_size=512,sequence_length=25,variables=,model=LSTMTemporalAttention,hidden_size=32,num_layers=2,num_epochs=30,learning_rate=0.0011,weight_decay=0.0001",DESKTOP-D4IVECG,30.0,127.0.0.1,34524,True,3.7446293547050034,23.384984016418457,0.7826962471008301,23.384984016418457,1680528105,0.0,,5.992318062678855,30.0,4e5ef_00002,1.2171744972776228,0.0057837963104248
train_model_4e5ef_00003,2023-04-03_15-21-50,,,424373ce29414f88b3fcf8197e6387aa,,DESKTOP-D4IVECG,,127.0.0.1,33204,,,,,,1680528110,,,,,4e5ef_00003,,
train_model_4e5ef_00004,2023-04-03_15-22-48,True,,be9bfd218a204c969cfb98f7a60645e2,"4_batch_size=256,sequence_length=25,variables=,model=LSTMSpatialTemporalAttention,hidden_size=64,num_layers=4,num_epochs=30,learning_rate=0.0070,weight_decay=0",DESKTOP-D4IVECG,30.0,127.0.0.1,33924,True,3.823096233577135,42.66602087020874,1.3578426837921145,42.66602087020874,1680528168,0.0,,6.30001460539611,30.0,4e5ef_00004,4.967687837403983,0.0049800872802734


100%|██████████| 45/45 [00:00<00:00, 436.11it/s]
  0%|          | 0/178 [00:00<?, ?it/s]
 12%|█▏        | 21/178 [00:00<00:00, 203.44it/s]
 26%|██▌       | 46/178 [00:00<00:00, 229.75it/s]
 39%|███▉      | 70/178 [00:00<00:00, 233.11it/s]
 53%|█████▎    | 94/178 [00:00<00:00, 232.96it/s]
 66%|██████▋   | 118/178 [00:00<00:00, 228.81it/s]
 79%|███████▉  | 141/178 [00:00<00:00, 226.39it/s]
 92%|█████████▏| 164/178 [00:00<00:00, 225.21it/s]
100%|██████████| 178/178 [00:00<00:00, 226.16it/s]
  0%|          | 0/45 [00:00<?, ?it/s]
100%|██████████| 45/45 [00:00<00:00, 466.24it/s]
 13%|█▎        | 24/178 [00:00<00:00, 237.80it/s]
 28%|██▊       | 49/178 [00:00<00:00, 241.88it/s]
 42%|████▏     | 74/178 [00:00<00:00, 243.42it/s]
 56%|█████▌    | 100/178 [00:00<00:00, 247.16it/s]
 70%|███████   | 125/178 [00:00<00:00, 243.58it/s]
 84%|████████▍ | 150/178 [00:00<00:00, 244.68it/s]
100%|██████████| 178/178 [00:00<00:00, 243.49it/s]
  0%|          | 0/45 [00:00<?, ?it/s]
100%|██████████| 45/45 [00

In [None]:
print("Best config: ", analysis.get_best_config(
   metric="val_loss", mode="min"))
# Get a dataframe for analyzing trial results.
df = analysis.results_df

Best config:  {'data_file': 'cleaned_data_1.csv', 'datetime': 'Datetime', 'model': 'LSTM', 'model_arch': {'input_size': 1, 'hidden_size': 64, 'num_layers': 2, 'output_size': 1}, 'data': {'target_variable': 'Flow_Kalltveit', 'sequence_length': 25, 'batch_size': 512, 'variables': []}, 'training': {'learning_rate': 0.007702764556375117, 'weight_decay': 0.0001}, 'num_epochs': 30}


In [None]:
df[['config/model', 'train_loss', 'val_loss', 'test_loss', 'time_total_s', 'config/data/variables']].sort_values('test_loss')

Unnamed: 0_level_0,config/model,train_loss,val_loss,test_loss,time_total_s,config/data/variables
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1d4d3_00000,LSTM,1.73464,0.692582,0.902164,26.457007,[]
1d4d3_00001,FCNTemporalAttention,12.955005,8.076083,16.335946,22.637669,[]
