# Running hyperparameter optimization on RoBERTa model using RayTune

## Import packages

In [1]:
from pathlib import Path
import os
import sys

import pandas as pd
from lightning import pytorch as pl
import numpy as np
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler
import torch
from torch.utils.data import DataLoader

os.environ['PYTHONPATH'] = '../agenticadmet'
sys.path.insert(0, '../agenticadmet')
from datasets import RegressionDataset
from models import TransformerRegressionModel
from utils import CheckpointParams

[23:09:33] Initializing Normalizer


In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
input_path = Path('../data/asap/datasets/rnd_splits/split_0.csv')
NUM_WORKERS = 0 # number of workers for dataloader. 0 means using main process for data loading
SMILES_COLUMN = 'cxsmiles_std' # name of the column containing SMILES strings
TARGET_COLUMNS = ['LogHLM', 'LogMLM', 'LogD', 'LogKSOL', 'LogMDR1-MDCKII'] # list of names of the columns containing targets

MODEL_PARAMS = {
    'config': {
        'vocab_size': 500,
        'hidden_size': 384,
        'num_hidden_layers': 6,
        'num_attention_heads': 8,
        'intermediate_size': 1024,
        'hidden_act': "gelu",
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
        'max_position_embeddings': 512,
        'initializer_range': 0.02,
        'layer_norm_eps': 1e-12,
        'pad_token_id': 0,
        'position_embedding_type': "absolute",
        'use_cache': True,
        'type_vocab_size': 2
    },
    'output_dim': len(TARGET_COLUMNS),
    'bias_final': False
}
TOKENIZER_NAME = '<gs_bucket>/artifacts/tokenizers/zinc'
CHECKPOINTS = [
    CheckpointParams(
        path=str(Path('../output/artifacts/mol_mlm_roberta_zinc/last.ckpt').absolute()),
        module_from='roberta',
        module_to='roberta',
        strict=True
    )
]
# CHECKPOINTS = None

hpopt_save_dir = Path('../output/asap/rnd_splits/roberta/run_0/split_0/hpopt') # directory to save hyperopt results
hpopt_save_dir.mkdir(exist_ok=True, parents=True)

## Load data

In [4]:
df_input = pd.read_csv(input_path)
df_input

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,,376.0,1.0,,8.5,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1 |&1:3|,300,|&1:3|,,,2.576341,0.977724,train
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,,375.0,-0.3,,0.9,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1 |&1:11|,249,|&1:11|,,,2.575188,0.278754,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,CC(C)NC[C@H](O)COC1=CC=CC2=CC=CC=C12 |&1:5|,25.5,,,63.0,,CC(C)NC[C@H](O)COc1cccc2ccccc12,CC(C)NC[C@H](O)COc1cccc2ccccc12 |&1:5|,22,|&1:5|,1.423246,1.806180,,,val
430,O=C(O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl,216.0,,,386.0,,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,380,,2.336460,2.587711,,,val
431,NCC1=CC(Cl)=CC(C(=O)NC2=CC=C3CNCC3=C2)=C1,,,2.0,,,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,303,,,,,,train
432,COC(=O)NC1=NC2=CC=C(C(=O)C3=CC=CC=C3)C=C2N1,,,2.9,,,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,166,,,,,,train


## Make data points, splits, and datasets

In [5]:
train_dset = RegressionDataset(
    data_path=input_path,
    smiles_col=SMILES_COLUMN,
    target_cols=TARGET_COLUMNS,
    split='train',
    tokenizer_name=TOKENIZER_NAME,
    mol_masking_prob=0.3,
    mol_masking_val=0.15
)
val_dset = RegressionDataset(
    data_path=input_path,
    smiles_col=SMILES_COLUMN,
    target_cols=TARGET_COLUMNS,
    split='val',
    tokenizer_name=TOKENIZER_NAME
)

0 out of 347 rows are removed due to missing values
Downloading checkpoint from <gs_bucket>/artifacts/tokenizers/zinc...
0 out of 87 rows are removed due to missing values
Downloading checkpoint from <gs_bucket>/artifacts/tokenizers/zinc...


# Define helper function to train the model

In [6]:
def train_model(config, train_dset, val_dset):
    # config is a dictionary containing hyperparameters used for the trial
    model_params = MODEL_PARAMS.copy()
    model_params['hidden_dim'] = int(config['hidden_dim'])
    model_params['num_layers'] = int(config['num_layers'])
    model_params['dropout'] = float(config['dropout'])
    batch_size = int(config['batch_size'])
    weight_decay = float(config['weight_decay'])

    train_loader = DataLoader(
        train_dset, batch_size=batch_size, shuffle=True,
        num_workers=NUM_WORKERS, collate_fn=train_dset.collate_fn
    )
    val_loader = DataLoader(
        val_dset, batch_size=batch_size, shuffle=False,
        num_workers=NUM_WORKERS, collate_fn=val_dset.collate_fn
    )

    model = TransformerRegressionModel(
        model_name='roberta-base',
        model_params=model_params,
        weight_decay=weight_decay,
        checkpoints=CHECKPOINTS
    )

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=50, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
        enable_checkpointing=False
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)

## Define parameter search space

In [7]:
search_space = {
    "hidden_dim": tune.qrandint(lower=128, upper=1024, q=128),
    "num_layers": tune.qrandint(lower=1, upper=3, q=1),
    "batch_size": tune.qrandint(lower=16, upper=128, q=16),
    "weight_decay": tune.loguniform(lower=1e-5, upper=1e-1),
    "dropout": tune.uniform(lower=0.0, upper=0.3),
}

In [8]:
ray.shutdown()
ray.init(include_dashboard=False)

scheduler = FIFOScheduler()

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    use_gpu=True, # set to True if you want to use GPU
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
    checkpoint_frequency=0,        # Do not checkpoint during training
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=(hpopt_save_dir / "ray_results").absolute(), # directory to save the results
)

ray_trainer = TorchTrainer(
    lambda config: train_model(config, train_dset, val_dset),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=10, # number of random evaluations before tree parzen estimators
    random_state_seed=RANDOM_SEED,
)

# OptunaSearch is another search algorithm that can be used
# search_alg = OptunaSearch()

tune_config = tune.TuneConfig(
    metric="val/mae",
    mode="min",
    num_samples=30, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id), # shorten filepaths
)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()

0,1
Current time:,2025-03-04 23:59:57
Running for:,00:50:19.26
Memory:,13.4/58.9 GiB

Trial name,status,loc,train_loop_config/ba tch_size,train_loop_config/dr opout,train_loop_config/hi dden_dim,train_loop_config/nu m_layers,train_loop_config/we ight_decay,iter,total time (s),train_loss,train_loss_step,val/mae
TorchTrainer_fb50619e,TERMINATED,10.128.0.3:1998770,96,0.0279165,896,2,2.81897e-05,50,91.6121,0.104623,0.132081,0.388577
TorchTrainer_779d5109,TERMINATED,10.128.0.3:2000967,32,0.260103,768,2,0.00777321,50,82.8557,0.115772,0.131511,0.373488
TorchTrainer_6a72a8c7,TERMINATED,10.128.0.3:2002897,64,0.0385663,128,1,1.26546e-05,50,93.565,0.123199,0.122134,0.357081
TorchTrainer_c461b4c9,TERMINATED,10.128.0.3:2005253,64,0.117963,256,1,0.0439758,50,88.1537,0.1332,0.137923,0.339789
TorchTrainer_ec74ee07,TERMINATED,10.128.0.3:2007392,32,0.234769,896,1,0.0212319,50,86.9308,0.125066,0.109017,0.387317
TorchTrainer_4182d5be,TERMINATED,10.128.0.3:2009336,32,0.0512044,384,1,0.000809838,50,92.4449,0.104219,0.108169,0.368202
TorchTrainer_10655d53,TERMINATED,10.128.0.3:2011580,48,0.17188,256,2,0.00352467,50,91.6778,0.167945,0.0759322,0.38596
TorchTrainer_39b90ed6,TERMINATED,10.128.0.3:2013800,128,0.12395,896,1,0.00694254,50,88.4136,0.14475,0.144648,0.36224
TorchTrainer_fa206fc2,TERMINATED,10.128.0.3:2015924,16,0.0645563,1024,2,0.000120687,50,93.4759,0.107526,0.0995122,0.377098
TorchTrainer_6fa2aff5,TERMINATED,10.128.0.3:2018141,48,0.0740207,512,1,0.000418933,50,82.8337,0.110213,0.0949162,0.360723


[36m(TorchTrainer pid=1998770)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=1998770)[0m - (node_id=3c7d24990431db7478a78ed43e9722f41114659a2ee58c81b4c30a4a, ip=10.128.0.3, pid=1998956) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=1998956)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=1998956)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=1998956)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=1998956)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=1998956)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless t

[36m(RayTrainWorker pid=1998956)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=1998956)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=1998956)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=1998956)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=1998956)[0m 
[36m(RayTrainWorker pid=1998956)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=1998956)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1998956)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=1998956)[0m 1 | predictor | MLP          | 1.2 M  | train
[36m(RayTrainWorker pid=1998956)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=1998956)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=1998956)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=1998956)[0m 9.8 M     Trainable params
[36m(RayTrainWorker pid=1998956)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=1998956)[0m 9.8 M     Total params
[36m(RayTrainWorker pid=1998956)[0m 39.311    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2001135)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2001135)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2001135)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2001135)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2001135)[0m 
[36m(RayTrainWorker pid=2001135)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2001135)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2001135)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2001135)[0m 1 | predictor | MLP          | 890 K  | train
[36m(RayTrainWorker pid=2001135)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2001135)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2001135)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2001135)[0m 9.6 M     Trainable params
[36m(RayTrainWorker pid=2001135)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2001135)[0m 9.6 M     Total params
[36m(RayTrainWorker pid=2001135)[0m 38.259    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2003242)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2003242)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2003242)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2003242)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2003242)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2003242)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2003242)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2003242)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2003242)[0m 
[36m(RayTrainWorker pid=2003242)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2005487)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2005487)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2005487)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2005487)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2005487)[0m 
[36m(RayTrainWorker pid=2005487)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2005487)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2005487)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2005487)[0m 1 | predictor | MLP          | 99.8 K | train
[36m(RayTrainWorker pid=2005487)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2005487)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2005487)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2005487)[0m 8.8 M     Trainable params
[36m(RayTrainWorker pid=2005487)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2005487)[0m 8.8 M     Total params
[36m(RayTrainWorker pid=2005487)[0m 35.098    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2007632)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2007632)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2007632)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2007632)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2007632)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2007632)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2007632)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2007632)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2007632)[0m 
[36m(RayTrainWorker pid=2007632)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2009692)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2009692)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2009692)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2009692)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2009692)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2009692)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2009692)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2009692)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2009692)[0m 
[36m(RayTrainWorker pid=2009692)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2011738)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2011738)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2011738)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2011738)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2011738)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2011738)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2011738)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2011738)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2011738)[0m 
[36m(RayTrainWorker pid=2011738)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2014031)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2014031)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2014031)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2014031)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2014031)[0m 
[36m(RayTrainWorker pid=2014031)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2014031)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2014031)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2014031)[0m 1 | predictor | MLP          | 349 K  | train
[36m(RayTrainWorker pid=2014031)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2014031)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2014031)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2014031)[0m 9.0 M     Trainable params
[36m(RayTrainWorker pid=2014031)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2014031)[0m 9.0 M     Total params
[36m(RayTrainWorker pid=2014031)[0m 36.096    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2016224)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2016224)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2016224)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2016224)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2016224)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2016224)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2016224)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2016224)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2016224)[0m 
[36m(RayTrainWorker pid=2016224)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2018369)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2018369)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2018369)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2018369)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2018369)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2018369)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2018369)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2018369)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2018369)[0m 
[36m(RayTrainWorker pid=2018369)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2020389)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2020389)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2020389)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2020389)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2020389)[0m 
[36m(RayTrainWorker pid=2020389)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2020389)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2020389)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2020389)[0m 1 | predictor | MLP          | 49.9 K | train
[36m(RayTrainWorker pid=2020389)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2020389)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2020389)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2020389)[0m 8.7 M     Trainable params
[36m(RayTrainWorker pid=2020389)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2020389)[0m 8.7 M     Total params
[36m(RayTrainWorker pid=2020389)[0m 34.898    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2022364)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2022364)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2022364)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2022364)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2022364)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2022364)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2022364)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2022364)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2022364)[0m 
[36m(RayTrainWorker pid=2022364)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2024569)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2024569)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2024569)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2024569)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2024569)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2024569)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2024569)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2024569)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2024569)[0m 
[36m(RayTrainWorker pid=2024569)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2026624)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2026624)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2026624)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2026624)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2026624)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2026624)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2026624)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2026624)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2026624)[0m 
[36m(RayTrainWorker pid=2026624)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2028449)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2028449)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2028449)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2028449)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2028449)[0m 
[36m(RayTrainWorker pid=2028449)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2028449)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2028449)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2028449)[0m 1 | predictor | MLP          | 249 K  | train
[36m(RayTrainWorker pid=2028449)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2028449)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2028449)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2028449)[0m 8.9 M     Trainable params
[36m(RayTrainWorker pid=2028449)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2028449)[0m 8.9 M     Total params
[36m(RayTrainWorker pid=2028449)[0m 35.697    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2030746)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=2030746)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2030746)[0m Loading checkpoint from roberta to roberta...


[36m(RayTrainWorker pid=2030746)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2030746)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2030746)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2030746)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2030746)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2030746)[0m 
[36m(RayTrainWorker pid=2030746)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2032973)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2032973)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2032973)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2032973)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2032973)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2032973)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2032973)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2032973)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2032973)[0m 
[36m(RayTrainWorker pid=2032973)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2035077)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2035077)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2035077)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2035077)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2035077)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2035077)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2035077)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2035077)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2035077)[0m 
[36m(RayTrainWorker pid=2035077)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2037160)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2037160)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2037160)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2037160)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2037160)[0m 
[36m(RayTrainWorker pid=2037160)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2037160)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2037160)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2037160)[0m 1 | predictor | MLP          | 299 K  | train
[36m(RayTrainWorker pid=2037160)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2037160)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2037160)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2037160)[0m 9.0 M     Trainable params
[36m(RayTrainWorker pid=2037160)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2037160)[0m 9.0 M     Total params
[36m(RayTrainWorker pid=2037160)[0m 35.896    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2039359)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2039359)[0m <All keys matched successfully>
[36m(RayTrainWorker pid=2039359)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...


[36m(RayTrainWorker pid=2039359)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2039359)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2039359)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2039359)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2039359)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2039359)[0m 
[36m(RayTrainWorker pid=2039359)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2041513)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2041513)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2041513)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2041513)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2041513)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2041513)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2041513)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2041513)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2041513)[0m 
[36m(RayTrainWorker pid=2041513)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2043427)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2043427)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2043427)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2043427)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2043427)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2043427)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2043427)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2043427)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2043427)[0m 
[36m(RayTrainWorker pid=2043427)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2045625)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2045625)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2045625)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2045625)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2045625)[0m 
[36m(RayTrainWorker pid=2045625)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2045625)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2045625)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2045625)[0m 1 | predictor | MLP          | 299 K  | train
[36m(RayTrainWorker pid=2045625)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2045625)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2045625)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2045625)[0m 9.0 M     Trainable params
[36m(RayTrainWorker pid=2045625)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2045625)[0m 9.0 M     Total params
[36m(RayTrainWorker pid=2045625)[0m 35.896    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2047764)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2047764)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2047764)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2047764)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2047764)[0m 
[36m(RayTrainWorker pid=2047764)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2047764)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2047764)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2047764)[0m 1 | predictor | MLP          | 199 K  | train
[36m(RayTrainWorker pid=2047764)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2047764)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2047764)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2047764)[0m 8.9 M     Trainable params
[36m(RayTrainWorker pid=2047764)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2047764)[0m 8.9 M     Total params
[36m(RayTrainWorker pid=2047764)[0m 35.497    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2049648)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2049648)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2049648)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2049648)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2049648)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2049648)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2049648)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2049648)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2049648)[0m 
[36m(RayTrainWorker pid=2049648)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2051794)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2051794)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2051794)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2051794)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2051794)[0m 
[36m(RayTrainWorker pid=2051794)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2051794)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2051794)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2051794)[0m 1 | predictor | MLP          | 249 K  | train
[36m(RayTrainWorker pid=2051794)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2051794)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2051794)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2051794)[0m 8.9 M     Trainable params
[36m(RayTrainWorker pid=2051794)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2051794)[0m 8.9 M     Total params
[36m(RayTrainWorker pid=2051794)[0m 35.697    Total estimated model params size (MB)

[36m(RayTrainWorker pid=2053748)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2053748)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2053748)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2053748)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2053748)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2053748)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2053748)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2053748)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2053748)[0m 
[36m(RayTrainWorker pid=2053748)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2055758)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2055758)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2055758)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2055758)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2055758)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2055758)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2055758)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2055758)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2055758)[0m 
[36m(RayTrainWorker pid=2055758)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2057977)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2057977)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2057977)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2057977)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=2057977)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2057977)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2057977)[0m /opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
[36m(RayTrainWorker pid=2057977)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2057977)[0m 
[36m(RayTrainWorker pid=2057977)[0m   | Name      | Type         | Params | 

[36m(RayTrainWorker pid=2060101)[0m Getting checkpoint from /home/jupyter/AgenticADMET/notebooks/../output/artifacts/mol_mlm_roberta_zinc/last.ckpt...
[36m(RayTrainWorker pid=2060101)[0m Loading checkpoint from roberta to roberta...
[36m(RayTrainWorker pid=2060101)[0m <All keys matched successfully>


[36m(RayTrainWorker pid=2060101)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=2060101)[0m 
[36m(RayTrainWorker pid=2060101)[0m   | Name      | Type         | Params | Mode 
[36m(RayTrainWorker pid=2060101)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2060101)[0m 0 | roberta   | RobertaModel | 8.7 M  | train
[36m(RayTrainWorker pid=2060101)[0m 1 | predictor | MLP          | 99.8 K | train
[36m(RayTrainWorker pid=2060101)[0m 2 | criterion | MSE          | 0      | train
[36m(RayTrainWorker pid=2060101)[0m 3 | metrics   | ModuleList   | 0      | train
[36m(RayTrainWorker pid=2060101)[0m ---------------------------------------------------
[36m(RayTrainWorker pid=2060101)[0m 8.8 M     Trainable params
[36m(RayTrainWorker pid=2060101)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=2060101)[0m 8.8 M     Total params
[36m(RayTrainWorker pid=2060101)[0m 35.098    Total estimated model params size (MB)

## Hyperparameter optimization results

In [9]:
# results of all trials
result_df = results.get_dataframe()
results_df = result_df.sort_values('val/mae')
results_df

Unnamed: 0,train_loss,train_loss_step,val/mae,val/r2,val_loss,lr,train_loss_epoch,epoch,step,timestamp,...,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/hidden_dim,config/train_loop_config/num_layers,config/train_loop_config/batch_size,config/train_loop_config/weight_decay,config/train_loop_config/dropout,logdir
3,0.1332,0.137923,0.339789,0.747507,0.241228,0.0001,0.1332,49,300,1741130177,...,dl-vladvin-1,10.128.0.3,88.153677,50,256,1,64,0.043976,0.117963,c461b4c9
11,0.157563,0.169746,0.347316,0.727599,0.259311,0.0001,0.157563,49,150,1741130972,...,dl-vladvin-1,10.128.0.3,90.425359,50,640,1,128,0.099149,0.298617,c3ad0902
2,0.123199,0.122134,0.357081,0.720803,0.266413,0.0001,0.123199,49,300,1741130078,...,dl-vladvin-1,10.128.0.3,93.565034,50,128,1,64,1.3e-05,0.038566,6a72a8c7
13,0.096088,0.124491,0.359187,0.694326,0.294941,0.0001,0.096088,49,250,1741131166,...,dl-vladvin-1,10.128.0.3,84.077512,50,256,1,80,0.000111,0.004295,21f290ea
9,0.110213,0.094916,0.360723,0.707181,0.279922,0.0001,0.110213,49,400,1741130777,...,dl-vladvin-1,10.128.0.3,82.833722,50,512,1,48,0.000419,0.074021,6fa2aff5
7,0.14475,0.144648,0.36224,0.70083,0.284794,0.0001,0.14475,49,150,1741130580,...,dl-vladvin-1,10.128.0.3,88.413638,50,896,1,128,0.006943,0.12395,39b90ed6
5,0.104219,0.108169,0.368202,0.695156,0.294491,0.0001,0.104219,49,550,1741130378,...,dl-vladvin-1,10.128.0.3,92.444897,50,384,1,32,0.00081,0.051204,4182d5be
17,0.168353,0.17654,0.370131,0.708055,0.277916,0.0001,0.168353,49,150,1741131585,...,dl-vladvin-1,10.128.0.3,94.376309,50,640,1,128,0.098635,0.294988,cf25ab34
23,0.128436,0.146669,0.370756,0.688128,0.29796,0.0001,0.128436,49,300,1741132188,...,dl-vladvin-1,10.128.0.3,81.89663,50,512,1,64,0.011764,0.203736,f140e8eb
27,0.135254,0.139026,0.371937,0.682045,0.302676,0.0001,0.135254,49,200,1741132586,...,dl-vladvin-1,10.128.0.3,92.630806,50,768,1,96,0.005041,0.076212,f38d6332


In [10]:
results.get_best_result(metric="val/mae", mode="min")

Result(
  metrics={'train_loss': 0.1332000344991684, 'train_loss_step': 0.1379225105047226, 'val/mae': 0.33978864550590515, 'val/r2': 0.747506856918335, 'val_loss': 0.241227924823761, 'lr': 9.999999747378752e-05, 'train_loss_epoch': 0.1332000344991684, 'epoch': 49, 'step': 300},
  path='/home/jupyter/AgenticADMET/notebooks/../output/asap/rnd_splits/roberta/run_0/split_0/hpopt/ray_results/TorchTrainer_2025-03-04_23-09-38/c461b4c9',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/home/jupyter/AgenticADMET/notebooks/../output/asap/rnd_splits/roberta/run_0/split_0/hpopt/ray_results/TorchTrainer_2025-03-04_23-09-38/c461b4c9/checkpoint_000049)
)

In [13]:
# results_df.to_csv(hpopt_save_dir / 'results3.csv', index=False)

In [12]:
ray.shutdown()