In [3]:
from functions import get_graph_formula, scale_target, create_loaders
from classes import *
from constants import *
from graph_tools import *
from processed_datasets import *
from matplotlib import pyplot as plt
import numpy as np
import torch 
import torch_geometric  
import torch_geometric.transforms as T
import torch.nn.functional as F

In [4]:
from ray import tune, init
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

## Data Loaders

In [5]:
# Collection of datasets handed to create_loaders() inside train_GNN.
# Member order is preserved exactly; entries that are commented out
# (Intermediates, Benson, Alloys) are excluded from the collection.
gnn_dataset = (
    # Intermediates_dataset,
    RPCA_dataset,
    # Benson_dataset,
    group2_dataset,
    group2b_dataset,
    aromatics_dataset,
    # Alloys_dataset,
    aromatics2_dataset,
    amides_dataset,
    amidines_dataset,
    oximes_dataset,
    carbamate_esters_dataset,
    group3S_dataset,
    group3N_dataset,
    group4_dataset,
    # Datasets whose names carry the "gas_" prefix.
    gas_amides_dataset,
    gas_amidines_dataset,
    gas_aromatics_dataset,
    gas_aromatics2_dataset,
    gas_carbamate_esters_dataset,
    gas_group2_dataset,
    gas_group2b_dataset,
    gas_group3N_dataset,
    gas_group3S_dataset,
    gas_group4_dataset,
    gas_oximes_dataset,
)

## Train/Test loops

In [6]:
def train_loop(epoch, model, optimizer, train_loader, device):
    """
    Run a single optimisation pass (one epoch) over the training loader.

    Every batch goes through the same sequence:
    1) transfer the batch to ``device``
    2) reset the accumulated gradients
    3) forward pass and MSE loss between ``model(batch)`` and ``batch.y``
    4) backpropagation followed by one optimizer step

    Args:
        epoch(int): Epoch number. Not read inside this function.
        model: GNN being trained; called as ``model(batch)``.
        optimizer: Optimizer holding the model parameters.
        train_loader: Iterable of batches exposing ``.to()``, ``.y`` and
            ``.num_graphs``; must also expose ``.dataset`` with a length.
        device: Target passed to ``batch.to()``.
    Returns:
        float: Sum of the batch losses, each weighted by ``num_graphs``,
        divided by ``len(train_loader.dataset)``.
    """
    model.train()  # enable training-mode behaviour of the model
    weighted_loss_sum = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()  # drop gradients left over from the previous batch
        batch_loss = F.mse_loss(model(batch), batch.y)
        batch_loss.backward()  # gradients of the loss wrt the parameters
        optimizer.step()  # apply the parameter update
        # Weight by batch size so the final value is a per-graph average.
        weighted_loss_sum += batch_loss.item() * batch.num_graphs
    return weighted_loss_sum / len(train_loader.dataset)

In [7]:
def test_loop(loader, model, std_tv, device):
    """
    Helper function for validation/testing.
    For each batch in the validation/test epoch, the following actions are performed:
    1) Set the GNN model in evaluation mode
    2) Move the batch to the selected device where the model is stored
    3) Compute the Mean Absolute Error (MAE)
    Args:
        loader(Dataloader object): Dataset for validation/testing
    Returns:
        error(float): Mean Absolute Error (MAE)
    """
    model.eval()  # Sets model in evaluation (inference) mode
    error = 0
    for data in loader:
        data = data.to(device)
        error += (model(data) * std_tv - data.y * std_tv).abs().sum().item()  
    error /= len(loader.dataset)
    return error 

## Training function

In [8]:
def train_GNN(config, checkpoint_dir=None):
    """
    Train one GNN for a single Ray Tune trial and report its error.

    The loaders are built from ``gnn_dataset`` and the targets are rescaled
    with ``scale_target``. The model is trained for 300 epochs on CPU with
    Adam and a ReduceLROnPlateau scheduler driven by the validation error.
    The test set is evaluated only at epochs where the validation error
    matches or improves on the best seen so far, so the reported test MAE
    belongs to the best-validation epoch rather than to the last epoch.

    Args:
        config(dict): dictionary with the sampled hyperparameters; the keys
            "dim" and "batch_size" are read.
        checkpoint_dir: Not used by this function.
    Reports (via ``tune.report``):
        MAE: test error at the epoch with the best validation error.
        val_MAE: the best validation error itself.
    """
    # NOTE(review): the positional 5 is presumably the number of splits (the
    # captured output shows val and test each holding ~1/5 of the data) -
    # confirm against create_loaders.
    train_loader, val_loader, test_loader = create_loaders(gnn_dataset, 5, batch_size=config["batch_size"])
    # Only the scale factor is needed below; the returned mean is discarded.
    train_loader, val_loader, test_loader, _, std_tv = scale_target(train_loader, val_loader, test_loader)
    device = "cpu"
    model = Net(dim=config["dim"], node_features=node_features).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=5, min_lr=1e-5)
    best_val_error = None
    for epoch in range(1, 301):
        train_loop(epoch, model, optimizer, train_loader, device)
        val_error = test_loop(val_loader, model, std_tv, device)
        scheduler.step(val_error)  # Adjust the learning rate according to validation error

        # The first epoch always enters this branch, so test_error is defined
        # by the time it is reported.
        if best_val_error is None or val_error <= best_val_error:
            best_val_error = val_error
            test_error = test_loop(test_loader, model, std_tv, device)

    tune.report(MAE=test_error, val_MAE=best_val_error)

In [9]:
# Candidate hyperparameter values; train_GNN reads them back as
# config["dim"] and config["batch_size"].
search_space = dict(
    dim=tune.choice([32, 64, 128, 256, 512]),
    batch_size=tune.choice([16, 32, 64, 128]),
)

In [10]:
# ignore_reinit_error makes this cell safe to re-run in a live kernel:
# without it, a second call to init() raises because Ray is already started.
init(num_cpus=6, ignore_reinit_error=True)

# metric="MAE" names the value train_GNN sends through tune.report(MAE=...),
# so mode="min" has a metric to apply to and the returned analysis object
# has a default metric for best-trial/best-config lookups.
# NOTE(review): num_samples is left at its default, so only one configuration
# is drawn from search_space (the captured output shows a single trial);
# raise num_samples to actually explore the space.
analysis = tune.run(train_GNN, config=search_space, metric="MAE", mode="min")



Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


[2m[36m(train_GNN pid=13530)[0m Training data = 2103 Validation data = 684 Test data = 684 (Total = 3471)
[2m[36m(train_GNN pid=13530)[0m Target Scaling (Standardization) applied successfully
[2m[36m(train_GNN pid=13530)[0m (Train+Val) mean: -64.38 eV
[2m[36m(train_GNN pid=13530)[0m (Train+Val) standard deviation: 25.51 eV


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,RUNNING,192.168.1.163:13530,16,512


[2m[36m(train_GNN pid=13530)[0m 2022-03-15 16:58:43,916	ERROR function_runner.py:268 -- Runner Thread raised error.
[2m[36m(train_GNN pid=13530)[0m Traceback (most recent call last):
[2m[36m(train_GNN pid=13530)[0m   File "/home/santiago/anaconda3/envs/GNN/lib/python3.9/site-packages/ray/tune/function_runner.py", line 262, in run
[2m[36m(train_GNN pid=13530)[0m     self._entrypoint()
[2m[36m(train_GNN pid=13530)[0m   File "/home/santiago/anaconda3/envs/GNN/lib/python3.9/site-packages/ray/tune/function_runner.py", line 330, in entrypoint
[2m[36m(train_GNN pid=13530)[0m     return self._trainable_func(self.config, self._status_reporter,
[2m[36m(train_GNN pid=13530)[0m   File "/home/santiago/anaconda3/envs/GNN/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(train_GNN pid=13530)[0m     return method(self, *_args, **_kwargs)
[2m[36m(train_GNN pid=13530)[0m   File "/home/santiago/anaconda3/envs/GNN/lib/python3.9/si

Result for train_GNN_ae0a5_00000:
  date: 2022-03-15_16-57-57
  experiment_id: e7530ad7378b40ae98a9c3134582984b
  hostname: santimor95-thinkpad-e14
  node_ip: 192.168.1.163
  pid: 13530
  timestamp: 1647359877
  trial_id: ae0a5_00000
  


Trial name,status,loc,batch_size,dim
train_GNN_ae0a5_00000,ERROR,192.168.1.163:13530,16,512

Trial name,# failures,error file
train_GNN_ae0a5_00000,1,"/home/santiago/ray_results/train_GNN_2022-03-15_16-57-56/train_GNN_ae0a5_00000_0_batch_size=16,dim=512_2022-03-15_16-57-56/error.txt"


TuneError: ('Trials did not complete', [train_GNN_ae0a5_00000])