In [1]:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import Adam
from torch_geometric.loader import DataLoader
from ray import tune, init, cluster_resources
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import ray
from ray.tune.suggest.bayesopt import BayesOptSearch

from nets import SantyxNet
from functions import train_loop, test_loop, scale_target, create_loaders
from processed_datasets import FG_dataset, BM_dataset

# Loader over BM_dataset, built with DataLoader's default arguments.
# train_function reads this module-level object after the last epoch to compute BM_MAE.
BM_dataloader = DataLoader(BM_dataset)

In [2]:
# Search space handed to Ray Tune as `config`.
# NB: entries wrapped in tune.choice() are the hyperparameters under investigation
# (one value is sampled per trial); every other entry is held fixed.
HYPERPARAMS = {
    # ---- Process-related ----
    "test set": True,                                   # forwarded to create_loaders / scale_target
    "splits": 10,                                       # forwarded to create_loaders
    "target scaling": "std",                            # `mode` argument of scale_target
    "batch size": tune.choice([16, 32, 64]),
    "epochs": 200,
    "loss function": torch.nn.functional.l1_loss,       # passed to train_loop
    "lr0": tune.choice([0.01, 0.001, 0.0001]),          # initial learning rate of Adam
    "patience": tune.choice([5, 7, 10]),                # ReduceLROnPlateau patience
    "factor": tune.choice([0.5, 0.7, 0.9]),             # ReduceLROnPlateau factor
    "minlr": tune.choice([1e-7, 1e-8]),                 # ReduceLROnPlateau min_lr
    "betas": (0.9, 0.999),                              # Adam betas
    "eps": tune.choice([1e-8, 1e-9]),                   # Adam eps
    "weight decay": 0,                                  # Adam weight_decay
    "amsgrad": tune.choice([True, False]),              # Adam amsgrad

    # ---- Model-related (SantyxNet constructor arguments) ----
    "dim": tune.choice([64, 128, 256]),
    "sigma": torch.nn.ReLU(),
    "bias": tune.choice([True, False]),
    "conv normalize": False,
    "conv root weight": True,
    "pool ratio": tune.choice([0.25, 0.5, 0.75]),
    "pool heads": tune.choice([2, 4, 6]),
    "pool seq": tune.choice([
        ["GMPool_I"],
        ["GMPool_G"],
        ["GMPool_G", "GMPool_I"],
        ["GMPool_G", "SelfAtt", "GMPool_I"],
        ["GMPool_G", "SelfAtt", "SelfAtt", "GMPool_I"],
    ]),
    "pool layer norm": False,
}

## Training function

In [3]:
def train_function(config, checkpoint_dir=None):
    """
    Train one SantyxNet model for a single Ray Tune trial and report its errors.

    Args:
        config (dict): Hyperparameters of this trial (same keys as HYPERPARAMS,
            with every tune.choice() already resolved to a concrete value).
        checkpoint_dir (str, optional): Not used by this function; no checkpoint
            is loaded or written here.

    Reports (through tune.report, once, after the last epoch):
        BM_MAE: value returned by test_loop on BM_dataloader.
        FG_MAE: last-epoch value returned by test_loop on the test split when
            config["test set"] is true, otherwise on the validation split.
    """

    # Generate Datasets and scale target
    train_loader, val_loader, test_loader = create_loaders(FG_dataset,
                                                           config["splits"],
                                                           config["batch size"],
                                                           config["test set"])
    train_loader, val_loader, test_loader, mean, std = scale_target(train_loader,
                                                                    val_loader,
                                                                    test_loader,
                                                                    mode=config["target scaling"],
                                                                    test=config["test set"])

    # Select device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Call GNN model architecture
    model = SantyxNet(dim=config["dim"],
                      sigma=config["sigma"],
                      bias=config["bias"],
                      conv_normalize=config["conv normalize"],
                      conv_root_weight=config["conv root weight"],
                      pool_ratio=config["pool ratio"],
                      pool_heads=config["pool heads"],
                      pool_seq=config["pool seq"],
                      pool_layer_norm=config["pool layer norm"]).to(device)

    # Call optimizer and lr-scheduler
    optimizer = Adam(model.parameters(),
                     lr=config["lr0"],
                     betas=config["betas"],
                     eps=config["eps"],
                     weight_decay=config["weight decay"],
                     amsgrad=config["amsgrad"])
    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=config["factor"],
                                     patience=config["patience"],
                                     min_lr=config["minlr"])

    use_test_set = config["test set"]
    # Pre-bind the metrics so the final report never hits an unbound name
    # (e.g. when the test set is disabled or epochs == 0).
    val_MAE = test_MAE = float("nan")

    # Run training
    for epoch in range(1, config["epochs"]+1):
        # Read the LR before the scheduler may lower it, so the log shows the
        # value actually used during this epoch.
        lr = lr_scheduler.optimizer.param_groups[0]['lr']
        _, train_MAE = train_loop(model, device, train_loader, optimizer, config["loss function"])
        val_MAE = test_loop(model, val_loader, device, std)
        lr_scheduler.step(val_MAE)  # Adjust the learning rate according to validation error
        # train_MAE is multiplied by std for printing; val/test values are printed
        # as returned by test_loop (which already receives std).
        if use_test_set:
            test_MAE = test_loop(model, test_loader, device, std)  # Run epoch on test set
            print('Epoch {:03d}: LR={:.7f}  Train MAE: {:.4f} eV  Validation MAE: {:.4f} eV '
                  'Test MAE: {:.4f} eV'.format(epoch, lr, train_MAE*std, val_MAE, test_MAE))
        else:
            print('Epoch {:03d}: LR={:.7f}  Train MAE: {:.6f} eV  Validation MAE: {:.6f} eV '
                  .format(epoch, lr, train_MAE*std, val_MAE))

    # Collect performance metric
    BM_MAE = test_loop(model, BM_dataloader, device=device, std=std, mean=mean, scaled_graph_label=False)
    # Without a test split there is no test_MAE; fall back to the validation error
    # instead of raising NameError.
    FG_MAE = test_MAE if use_test_set else val_MAE
    tune.report(BM_MAE=BM_MAE, FG_MAE=FG_MAE)

In [4]:
# Start (or attach to) Ray; ignore_reinit_error=True lets this cell be re-run
# in the same kernel without failing because Ray is already initialised.
ray.init(ignore_reinit_error=True)
# Show the resources Ray detected, to size resources_per_trial accordingly.
cluster_resources()

{'accelerator_type:G': 1.0,
 'memory': 7730444699.0,
 'CPU': 8.0,
 'GPU': 1.0,
 'node:192.168.1.161': 1.0,
 'object_store_memory': 3865222348.0}

In [4]:
# ASHA early-stopping scheduler. The metric name must be one of the keys that
# train_function passes to tune.report() (BM_MAE, FG_MAE); it never reports "MAE".
# NOTE(review): this object is not currently passed to tune.run(). If it is, Ray
# expects metric/mode to be set either here or in tune.run(), not both - confirm.
scheduler = ASHAScheduler(metric="FG_MAE",
                          mode="min")

# NOTE(review): `algo` is not passed to tune.run() either. BayesOptSearch is
# documented for continuous search spaces, while HYPERPARAMS only uses
# tune.choice() - verify compatibility before adding search_alg=algo.
algo = BayesOptSearch(random_search_steps=4)

In [5]:
# Launch the hyperparameter search over HYPERPARAMS and keep the resulting
# analysis object for the inspection cells below.
analysis = tune.run(train_function,
                    # Must be a key reported by train_function via tune.report():
                    # it reports BM_MAE and FG_MAE, there is no plain "MAE".
                    # Use "BM_MAE" instead to rank trials on the BM dataset.
                    metric="FG_MAE",
                    mode="min",
                    name="HypOpt2",
                    time_budget_s=3600*24,  # 24 h budget for the whole experiment
                    config=HYPERPARAMS,
                    #scheduler=ASHAScheduler,
                    #checkpoint_freq=5,
                    #progress_reporter=CLIReporter,
                    # Each trial requests 8 CPUs and 1 GPU, i.e. everything listed by
                    # cluster_resources() above, so trials run one at a time.
                    resources_per_trial={"cpu":8, "gpu":1},
                    num_samples=5,  # number of configurations sampled from HYPERPARAMS
                    verbose=1,
                    log_to_file=True,
                    local_dir="./Hyperparameter_Optimization")

Trial name,# failures,error file
train_function_21d68_00000,1,"/home/santiago/Desktop/GNN/Hyperparameter_Optimization/HypOpt2/train_function_21d68_00000_0_amsgrad=True,batch size=16,bias=False,dim=256,eps=1e-08,factor=0.5,lr0=0.0001,minlr=1e-08,patience=5,_2022-07-26_16-35-02/error.txt"
train_function_21d68_00001,1,"/home/santiago/Desktop/GNN/Hyperparameter_Optimization/HypOpt2/train_function_21d68_00001_1_amsgrad=False,batch size=32,bias=True,dim=256,eps=1e-09,factor=0.7,lr0=0.001,minlr=1e-08,patience=7,p_2022-07-26_16-35-04/error.txt"


[2m[36m(train_function pid=5265)[0m 2022-07-26 16:36:33,303	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(train_function pid=5265)[0m Traceback (most recent call last):
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 770, in ray._raylet.task_execution_handler
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 591, in ray._raylet.execute_task
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 629, in ray._raylet.execute_task
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(train_function pid=5265)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(train_function pid=5265)[0m   File "/home/santiago/anaconda3/envs/GNN/lib/python3.9/site-packages/ray/

In [8]:
# Config of the best trial, judged by the metric/mode given to tune.run().
# Needs `analysis` from the tune.run cell in the current kernel; the saved
# output shows NameError because that variable did not exist when this ran.
analysis.best_config

NameError: name 'analysis' is not defined

In [11]:
# Result table of the best trial only (requires `analysis` from the tune.run cell).
analysis.best_dataframe

Unnamed: 0,MAE,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore
0,2.617189,412.595693,False,,,1,44606_00000,74eb6cb974c54abcb6f24c6098beadbb,2022-07-22_11-25-31,1658481931,412.595693,11616,santimor95-thinkpad-e14,10.0.7.116,412.595693,0,1


In [1]:
# Metric name stored on the analysis object - presumably the `metric` passed to
# tune.run(); confirm. Requires `analysis` to exist (saved output shows NameError).
analysis.default_metric

NameError: name 'analysis' is not defined

In [15]:
# One row per trial: reported results plus the sampled config/* values and logdir.
analysis.dataframe()

Unnamed: 0,MAE,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,timesteps_since_restore,iterations_since_restore,config/amsgrad,config/batch_size,config/dim,config/eps,config/factor,config/lr0,config/patience,logdir
0,2.617189,412.595693,False,,,1,44606_00000,74eb6cb974c54abcb6f24c6098beadbb,2022-07-22_11-25-31,1658481931,...,0,1,True,64,256,1e-08,0.25,0.001,7,/home/santiago/ray_results/HypOpt1/train_funct...
1,,213.624106,False,,,1,44606_00001,44c5eff9ca374ed1a4d4e3c60eb77db4,2022-07-22_11-22-14,1658481734,...,0,1,False,16,128,0.0,0.75,0.01,10,/home/santiago/ray_results/HypOpt1/train_funct...
2,3.063185,413.152003,False,,,1,44606_00002,2c2cba81c7df4892a26bc1392b79e802,2022-07-22_11-25-34,1658481934,...,0,1,False,64,256,1e-08,0.5,0.001,10,/home/santiago/ray_results/HypOpt1/train_funct...
3,7.731067,178.565049,False,,,1,44606_00003,4805db48877644869ae76433fa7effb3,2022-07-22_11-21-40,1658481700,...,0,1,False,64,128,1e-08,0.75,0.001,10,/home/santiago/ray_results/HypOpt1/train_funct...
4,,138.826537,False,,,1,44606_00004,cbb85f1229e84cb68203e032056c468a,2022-07-22_11-24-05,1658481845,...,0,1,True,32,128,0.0,0.25,0.01,10,/home/santiago/ray_results/HypOpt1/train_funct...
