# Running hyperparameter optimization on RoBERTa model using RayTune

## Import packages

In [1]:
from pathlib import Path
import os
import sys

import pandas as pd
from lightning import pytorch as pl
import numpy as np
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler
import torch
from torch.utils.data import DataLoader

os.environ['PYTHONPATH'] = '../agenticadmet'
sys.path.insert(0, '../agenticadmet')
from datasets import RegressionDataset
from models import TransformerRegressionModel
from utils import CheckpointParams

[23:20:49] Initializing Normalizer


In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
input_path = Path('../data/asap/datasets/rnd_splits/split_0.csv')
NUM_WORKERS = 0 # number of workers for dataloader. 0 means using main process for data loading
SMILES_COLUMN = 'cxsmiles_std' # name of the column containing SMILES strings
TARGET_COLUMNS = ['LogHLM', 'LogMLM', 'LogD', 'LogKSOL', 'LogMDR1-MDCKII'] # list of names of the columns containing targets

MODEL_PARAMS = {
    'config': {
        'vocab_size': 500,
        'hidden_size': 384,
        'num_hidden_layers': 6,
        'num_attention_heads': 8,
        'intermediate_size': 1024,
        'hidden_act': "gelu",
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
        'max_position_embeddings': 512,
        'initializer_range': 0.02,
        'layer_norm_eps': 1e-12,
        'pad_token_id': 0,
        'position_embedding_type': "absolute",
        'use_cache': True,
        'type_vocab_size': 2
    },
    'output_dim': len(TARGET_COLUMNS),
    'bias_final': False
}
TOKENIZER_NAME = '<gs_bucket>/artifacts/tokenizers/zinc'
CHECKPOINTS = [
    CheckpointParams(
        path=str(Path('../output/artifacts/mol_mlm_roberta_zinc/last.ckpt').absolute()),
        module_from='roberta',
        module_to='roberta',
        strict=True
    )
]
# CHECKPOINTS = None

hpopt_save_dir = Path('../output/asap/rnd_splits/roberta/run_0/split_0/hpopt') # directory to save hyperopt results
hpopt_save_dir.mkdir(exist_ok=True, parents=True)

## Load data

In [4]:
df_input = pd.read_csv(input_path)
df_input

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,,376.0,1.0,,8.5,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1 |&1:3|,300,|&1:3|,,,2.576341,0.977724,train
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,,375.0,-0.3,,0.9,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1 |&1:11|,249,|&1:11|,,,2.575188,0.278754,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,CC(C)NC[C@H](O)COC1=CC=CC2=CC=CC=C12 |&1:5|,25.5,,,63.0,,CC(C)NC[C@H](O)COc1cccc2ccccc12,CC(C)NC[C@H](O)COc1cccc2ccccc12 |&1:5|,22,|&1:5|,1.423246,1.806180,,,val
400,O=C(O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl,216.0,,,386.0,,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,380,,2.336460,2.587711,,,val
401,NCC1=CC(Cl)=CC(C(=O)NC2=CC=C3CNCC3=C2)=C1,,,2.0,,,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,303,,,,,,train
402,COC(=O)NC1=NC2=CC=C(C(=O)C3=CC=CC=C3)C=C2N1,,,2.9,,,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,166,,,,,,train


## Make data points, splits, and datasets

In [5]:
train_dset = RegressionDataset(
    data_path=input_path,
    smiles_col=SMILES_COLUMN,
    target_cols=TARGET_COLUMNS,
    split='train',
    tokenizer_name=TOKENIZER_NAME,
    mol_masking_prob=0.3,
    mol_masking_val=0.15
)
val_dset = RegressionDataset(
    data_path=input_path,
    smiles_col=SMILES_COLUMN,
    target_cols=TARGET_COLUMNS,
    split='val',
    tokenizer_name=TOKENIZER_NAME
)

0 out of 323 rows are removed due to missing values
Downloading checkpoint from <gs_bucket>/artifacts/tokenizers/zinc...
0 out of 81 rows are removed due to missing values
Downloading checkpoint from <gs_bucket>/artifacts/tokenizers/zinc...


# Define helper function to train the model

In [6]:
def train_model(config, train_dset, val_dset):
    # config is a dictionary containing hyperparameters used for the trial
    model_params = MODEL_PARAMS.copy()
    model_params['hidden_dim'] = int(config['hidden_dim'])
    model_params['num_layers'] = int(config['num_layers'])
    model_params['dropout'] = float(config['dropout'])
    batch_size = int(config['batch_size'])
    weight_decay = float(config['weight_decay'])

    train_loader = DataLoader(
        train_dset, batch_size=batch_size, shuffle=True,
        num_workers=NUM_WORKERS, collate_fn=train_dset.collate_fn
    )
    val_loader = DataLoader(
        val_dset, batch_size=batch_size, shuffle=False,
        num_workers=NUM_WORKERS, collate_fn=val_dset.collate_fn
    )

    model = TransformerRegressionModel(
        model_name='roberta-base',
        model_params=model_params,
        weight_decay=weight_decay,
        checkpoints=CHECKPOINTS
    )

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=50, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
        enable_checkpointing=False
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)

## Define parameter search space

In [7]:
search_space = {
    "hidden_dim": tune.qrandint(lower=128, upper=1024, q=128),
    "num_layers": tune.qrandint(lower=1, upper=3, q=1),
    "batch_size": tune.qrandint(lower=16, upper=128, q=16),
    "weight_decay": tune.loguniform(lower=1e-5, upper=1e-1),
    "dropout": tune.uniform(lower=0.0, upper=0.2),
}

In [8]:
ray.shutdown()
ray.init(include_dashboard=False)

scheduler = FIFOScheduler()

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    use_gpu=True, # set to True if you want to use GPU
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
    checkpoint_frequency=0,        # Do not checkpoint during training
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=(hpopt_save_dir / "ray_results").absolute(), # directory to save the results
)

ray_trainer = TorchTrainer(
    lambda config: train_model(config, train_dset, val_dset),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=10, # number of random evaluations before tree parzen estimators
    random_state_seed=RANDOM_SEED,
)

# OptunaSearch is another search algorithm that can be used
# search_alg = OptunaSearch()

tune_config = tune.TuneConfig(
    metric="val/mae",
    mode="min",
    num_samples=30, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id), # shorten filepaths
)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()

0,1
Current time:,2025-03-12 23:59:27
Running for:,00:38:32.67
Memory:,11.2/58.9 GiB

Trial name,status,loc,train_loop_config/ba tch_size,train_loop_config/dr opout,train_loop_config/hi dden_dim,train_loop_config/nu m_layers,train_loop_config/we ight_decay,iter,total time (s),train_loss,train_loss_step,val/mae
TorchTrainer_56fc16e6,TERMINATED,10.128.0.3:1177991,96,0.018611,896,2,2.81897e-05,50,66.3584,0.0906924,0.063619,0.407999
TorchTrainer_b2eb9de9,TERMINATED,10.128.0.3:1179654,32,0.173402,768,2,0.00777321,50,65.1964,0.130427,0.0826093,0.440786
TorchTrainer_bd4b22c9,TERMINATED,10.128.0.3:1181264,64,0.0257109,128,1,1.26546e-05,50,63.5159,0.128798,0.0521069,0.408013
TorchTrainer_439b693d,TERMINATED,10.128.0.3:1182831,64,0.0786423,256,1,0.0439758,50,63.6506,0.141451,0.0990262,0.448804
TorchTrainer_e0d70199,TERMINATED,10.128.0.3:1184502,32,0.156513,896,1,0.0212319,50,64.2665,0.0991244,0.0568382,0.422184
TorchTrainer_5a576c82,TERMINATED,10.128.0.3:1186232,32,0.0341362,384,1,0.000809838,50,63.3507,0.0921696,0.0925356,0.386482
TorchTrainer_08c9cf27,TERMINATED,10.128.0.3:1187943,48,0.114586,256,2,0.00352467,50,62.2794,0.11269,0.0962803,0.404122
TorchTrainer_599a7bc7,TERMINATED,10.128.0.3:1189481,128,0.0826331,896,1,0.00694254,50,66.0455,0.137496,0.155992,0.434657
TorchTrainer_5f4597d7,TERMINATED,10.128.0.3:1191042,16,0.0430375,1024,2,0.000120687,50,73.1326,0.0779466,0.0577919,0.396328
TorchTrainer_481a02c2,TERMINATED,10.128.0.3:1192914,48,0.0493471,512,1,0.000418933,50,62.508,0.0785766,0.0488015,0.421358


[33m(raylet)[0m [2025-03-12 23:21:02,667 E 1176943 1176979] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-12_23-20-50_987198_1174294 is over 95% full, available space: 13.1806 GB; capacity: 295.046 GB. Object creation will fail if spilling is required.
[36m(TorchTrainer pid=1177991)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=1177991)[0m - (node_id=6cf2562c5de19587d3dc355f8ebead9f15bbc1bb9651b7563d063b82, ip=10.128.0.3, pid=1178182) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=1178182)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=1178182)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1178182)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1178182)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1178182)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1178182)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1178182)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1178182)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1178182)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1178182)[0m 
[36m(RayTrainWorker pid=1178182)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1179805)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1179805)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1179805)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1179805)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1179805)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1179805)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1179805)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1179805)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1179805)[0m 
[36m(RayTrainWorker pid=1179805)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1181415)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1181415)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1181415)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1181415)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1181415)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1181415)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1181415)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1181415)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1181415)[0m 
[36m(RayTrainWorker pid=1181415)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1183060)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=1183060)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1183060)[0m Loading checkpoint from roberta to roberta...


[36m(RayTrainWorker pid=1183060)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1183060)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1183060)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1183060)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1183060)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1183060)[0m 
[36m(RayTrainWorker pid=1183060)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1184670)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1184670)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1184670)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1184670)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1184670)[0m 
[36m(RayTrainWorker pid=1184670)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1184670)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1184670)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1184670)[0m 1 | predictor | MLP          | 349 K  | train
[36m(RayTrainWorker pid=1184670)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1184670)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1184670)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1184670)[0m 9.0 M     Trainable params
[36m(RayTrainWorker pid=1184670)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1184670)[0m 9.0 M     Total params
[36m(RayTrainWorker pid=1184670)[0m 36.096    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1186470)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=1186470)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1186470)[0m Loading checkpoint from roberta to roberta...


[36m(RayTrainWorker pid=1186470)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1186470)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1186470)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1186470)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1186470)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1186470)[0m 
[36m(RayTrainWorker pid=1186470)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1188093)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1188093)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1188093)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1188093)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1188093)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1188093)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1188093)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1188093)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1188093)[0m 
[36m(RayTrainWorker pid=1188093)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1189646)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1189646)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1189646)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1189646)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1189646)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1189646)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1189646)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1189646)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1189646)[0m 
[36m(RayTrainWorker pid=1189646)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1191270)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1191270)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1191270)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1191270)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1191270)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1191270)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1191270)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1191270)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1191270)[0m 
[36m(RayTrainWorker pid=1191270)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1193133)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1193133)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1193133)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1193133)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1193133)[0m 
[36m(RayTrainWorker pid=1193133)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1193133)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1193133)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1193133)[0m 1 | predictor | MLP          | 199 K  | train
[36m(RayTrainWorker pid=1193133)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1193133)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1193133)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1193133)[0m 8.9 M     Trainable params
[36m(RayTrainWorker pid=1193133)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1193133)[0m 8.9 M     Total params
[36m(RayTrainWorker pid=1193133)[0m 35.497    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1194751)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1194751)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1194751)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1194751)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1194751)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1194751)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1194751)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[33m(raylet)[0m [2025-03-12 23:33:43,318 E 1176943 1176979] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-12_23-20-50_987198_1174294 is over 95% full, available space: 13.1734

[36m(RayTrainWorker pid=1196443)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1196443)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1196443)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1196443)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1196443)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1196443)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1196443)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1196443)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1196443)[0m 
[36m(RayTrainWorker pid=1196443)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1198249)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1198249)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1198249)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1198249)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1198249)[0m 
[36m(RayTrainWorker pid=1198249)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1198249)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1198249)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1198249)[0m 1 | predictor | MLP          | 297 K  | train
[36m(RayTrainWorker pid=1198249)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1198249)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1198249)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1198249)[0m 9.0 M     Trainable params
[36m(RayTrainWorker pid=1198249)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1198249)[0m 9.0 M     Total params
[36m(RayTrainWorker pid=1198249)[0m 35.889    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1199986)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1199986)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1199986)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1199986)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1199986)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1199986)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1199986)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1199986)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1199986)[0m 
[36m(RayTrainWorker pid=1199986)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1201705)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1201705)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1201705)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1201705)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1201705)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1201705)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1201705)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1201705)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1201705)[0m 
[36m(RayTrainWorker pid=1201705)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1203341)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1203341)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1203341)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1203341)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1203341)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1203341)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1203341)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1203341)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1203341)[0m 
[36m(RayTrainWorker pid=1203341)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1205068)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1205068)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1205068)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1205068)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1205068)[0m 
[36m(RayTrainWorker pid=1205068)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1205068)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1205068)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1205068)[0m 1 | predictor | MLP          | 99.8 K | train
[36m(RayTrainWorker pid=1205068)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1205068)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1205068)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1205068)[0m 8.8 M     Trainable params
[36m(RayTrainWorker pid=1205068)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1205068)[0m 8.8 M     Total params
[36m(RayTrainWorker pid=1205068)[0m 35.098    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1206756)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1206756)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1206756)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1206756)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1206756)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1206756)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1206756)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1206756)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1206756)[0m 
[36m(RayTrainWorker pid=1206756)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1208580)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1208580)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1208580)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1208580)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1208580)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1208580)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1208580)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1208580)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1208580)[0m 
[36m(RayTrainWorker pid=1208580)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1210437)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1210437)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1210437)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1210437)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1210437)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1210437)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1210437)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1210437)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1210437)[0m 
[36m(RayTrainWorker pid=1210437)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1212072)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1212072)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1212072)[0m <All keys matched successfully>


[33m(raylet)[0m [2025-03-12 23:46:43,994 E 1176943 1176979] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-12_23-20-50_987198_1174294 is over 95% full, available space: 13.1613 GB; capacity: 295.046 GB. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=1212072)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1212072)[0m 
[36m(RayTrainWorker pid=1212072)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1212072)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1212072)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1212072)[0m 1 | predictor | MLP          | 890 K  | train
[36m(RayTrainWorker pid=1212072)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1212072)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1212072)[0m ---------------------------------------------------
[36m(

[36m(RayTrainWorker pid=1213698)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1213698)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1213698)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1213698)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1213698)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1213698)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1213698)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1213698)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1213698)[0m 
[36m(RayTrainWorker pid=1213698)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1215479)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1215479)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1215479)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1215479)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1215479)[0m 
[36m(RayTrainWorker pid=1215479)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1215479)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1215479)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1215479)[0m 1 | predictor | MLP          | 659 K  | train
[36m(RayTrainWorker pid=1215479)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1215479)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1215479)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1215479)[0m 9.3 M     Trainable params
[36m(RayTrainWorker pid=1215479)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1215479)[0m 9.3 M     Total params
[36m(RayTrainWorker pid=1215479)[0m 37.338    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1217098)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1217098)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=1217098)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...


[36m(RayTrainWorker pid=1217098)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1217098)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1217098)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1217098)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1217098)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1217098)[0m 
[36m(RayTrainWorker pid=1217098)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1218729)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1218729)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1218729)[0m <All keys matched successfully>


[33m(raylet)[0m [2025-03-12 23:51:54,274 E 1176943 1176979] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-12_23-20-50_987198_1174294 is over 95% full, available space: 13.1639 GB; capacity: 295.046 GB. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=1218729)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1218729)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1218729)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1218729)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip inst

[36m(RayTrainWorker pid=1220550)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1220550)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=1220550)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...


[36m(RayTrainWorker pid=1220550)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1220550)[0m 
[36m(RayTrainWorker pid=1220550)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1220550)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1220550)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1220550)[0m 1 | predictor | MLP          | 165 K  | train
[36m(RayTrainWorker pid=1220550)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1220550)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1220550)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1220550)[0m 8.8 M     Trainable params
[36m(RayTrainWorker pid=1220550)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1220550)[0m 8.8 M     Total params
[36m(RayTrainWorker pid=1220550)[0m 35.361    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1222149)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1222149)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1222149)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1222149)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1222149)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1222149)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1222149)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1222149)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1222149)[0m 
[36m(RayTrainWorker pid=1222149)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1224012)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1224012)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1224012)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1224012)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1224012)[0m 
[36m(RayTrainWorker pid=1224012)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1224012)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1224012)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1224012)[0m 1 | predictor | MLP          | 149 K  | train
[36m(RayTrainWorker pid=1224012)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1224012)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1224012)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1224012)[0m 8.8 M     Trainable params
[36m(RayTrainWorker pid=1224012)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1224012)[0m 8.8 M     Total params
[36m(RayTrainWorker pid=1224012)[0m 35.297    Total estimated model params size (MB)

[36m(RayTrainWorker pid=1225601)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1225601)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1225601)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1225601)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1225601)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1225601)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1225601)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=1225601)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1225601)[0m 
[36m(RayTrainWorker pid=1225601)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=1227228)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1227228)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1227228)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1227228)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1227228)[0m 
[36m(RayTrainWorker pid=1227228)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1227228)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1227228)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1227228)[0m 1 | predictor | MLP          | 399 K  | train
[36m(RayTrainWorker pid=1227228)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1227228)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1227228)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1227228)[0m 9.1 M     Trainable params
[36m(RayTrainWorker pid=1227228)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1227228)[0m 9.1 M     Total params
[36m(RayTrainWorker pid=1227228)[0m 36.296    Total estimated model params size (MB)

## Hyperparameter optimization results

In [9]:
# results of all trials
result_df = results.get_dataframe()
results_df = result_df.sort_values('val/mae')
results_df

Unnamed: 0,train_loss,train_loss_step,val/mae,val/r2,val_loss,lr,train_loss_epoch,epoch,step,timestamp,...,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/hidden_dim,config/train_loop_config/num_layers,config/train_loop_config/batch_size,config/train_loop_config/weight_decay,config/train_loop_config/dropout,logdir
5,0.09217,0.092536,0.386482,0.669224,0.316027,0.0001,0.09217,49,550,1741822100,...,dl-vladvin-1,10.128.0.3,63.350655,50,384,1,32,0.00081,0.034136,5a576c82
8,0.077947,0.057792,0.396328,0.657834,0.322932,0.0001,0.077947,49,1050,1741822332,...,dl-vladvin-1,10.128.0.3,73.132622,50,1024,2,16,0.000121,0.043038,5f4597d7
28,0.095285,0.085481,0.399819,0.650773,0.327451,0.0001,0.095285,49,350,1741823882,...,dl-vladvin-1,10.128.0.3,63.111884,50,128,2,48,2e-05,0.015735,0d3d2f70
17,0.099747,0.02648,0.400149,0.641377,0.346764,0.0001,0.099747,49,1050,1741823031,...,dl-vladvin-1,10.128.0.3,75.221304,50,1024,2,16,0.000154,0.041954,995662df
6,0.11269,0.09628,0.404122,0.661672,0.31783,0.0001,0.11269,49,350,1741822173,...,dl-vladvin-1,10.128.0.3,62.27943,50,256,2,48,0.003525,0.114586,08c9cf27
19,0.077075,0.063354,0.405421,0.651078,0.329478,0.0001,0.077075,49,350,1741823186,...,dl-vladvin-1,10.128.0.3,63.248566,50,512,2,48,0.00043,0.027994,137d3681
22,0.091107,0.131095,0.406214,0.659257,0.31761,0.0001,0.091107,49,350,1741823421,...,dl-vladvin-1,10.128.0.3,64.408439,50,640,2,48,3.1e-05,0.003652,ede8484b
12,0.12978,0.161829,0.407775,0.666119,0.310546,0.0001,0.12978,49,150,1741822634,...,dl-vladvin-1,10.128.0.3,66.980362,50,384,2,128,0.001676,0.067281,8eaba676
29,0.09279,0.080546,0.407854,0.673837,0.313061,0.0001,0.09279,49,1050,1741823966,...,dl-vladvin-1,10.128.0.3,72.617448,50,1024,1,16,0.000209,0.034451,f780394e
0,0.090692,0.063619,0.407999,0.666391,0.310292,0.0001,0.090692,49,200,1741821729,...,dl-vladvin-1,10.128.0.3,66.358428,50,896,2,96,2.8e-05,0.018611,56fc16e6


In [10]:
results.get_best_result(metric="val/mae", mode="min")

Result(
  metrics={'train_loss': 0.09216958284378052, 'train_loss_step': 0.092535600066185, 'val/mae': 0.38648226857185364, 'val/r2': 0.6692236661911011, 'val_loss': 0.31602737307548523, 'lr': 9.999999747378752e-05, 'train_loss_epoch': 0.09216958284378052, 'epoch': 49, 'step': 550},
  path='/home/jupyter/AgenticADMET/notebooks/../output/asap/rnd_splits/roberta/run_0/split_0/hpopt/ray_results/TorchTrainer_2025-03-12_23-20-54/5a576c82',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/home/jupyter/AgenticADMET/notebooks/../output/asap/rnd_splits/roberta/run_0/split_0/hpopt/ray_results/TorchTrainer_2025-03-12_23-20-54/5a576c82/checkpoint_000049)
)

In [11]:
results_df.to_csv(hpopt_save_dir / 'results.csv', index=False)

In [12]:
ray.shutdown()