In [1]:
from collections import defaultdict
import json
from pathlib import Path
import sys

import pandas as pd
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger
import numpy as np
import torch
import wandb

from chemprop import data, featurizers, models, nn

sys.path.insert(0, '../agenticadmet')
from eval import eval_admet, extract_preds, extract_refs

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
NUM_WORKERS = 0 # number of workers for dataloader. 0 means using main process for data loading
SMILES_COLUMN = 'smiles_std'
TARGET_COLUMNS = ['LogHLM', 'LogMLM', 'LogD', 'LogKSOL', 'LogMDR1-MDCKII']

In [4]:
def prepare_data(input_df, smiles_column_for_pred: str = SMILES_COLUMN):
    train_data, val_data = [], []
    for _, row in input_df.iterrows():
        dp = data.MoleculeDatapoint.from_smi(row[SMILES_COLUMN], row[TARGET_COLUMNS].values)
        if row['split'] == 'train':
            train_data.append(dp)
        elif row['split'] == 'val':
            val_data.append(dp)

    pred_data = []
    for _, row in input_df.iterrows():
        dp = data.MoleculeDatapoint.from_smi(row[smiles_column_for_pred], row[TARGET_COLUMNS].values)
        pred_data.append(dp)

    featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

    train_dset = data.MoleculeDataset(train_data, featurizer)
    # scaler = train_dset.normalize_targets()

    val_dset = data.MoleculeDataset(val_data, featurizer)
    # val_dset.normalize_targets(scaler)

    pred_dset = data.MoleculeDataset(pred_data, featurizer)

    return train_dset, val_dset, pred_dset

In [5]:
def train_model(config, train_dset, val_dset, num_workers, save_dir, run_idx, enable_logger=True):
    # config is a dictionary containing hyperparameters used for the trial
    depth = int(config["depth"])
    ffn_hidden_dim = int(config["ffn_hidden_dim"])
    ffn_num_layers = int(config["ffn_num_layers"])
    message_hidden_dim = int(config["message_hidden_dim"])
    dropout = float(config["dropout"])

    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    mp = nn.BondMessagePassing(d_h=message_hidden_dim, depth=depth, dropout=dropout)
    agg = nn.MeanAggregation()
    ffn = nn.RegressionFFN(
        n_tasks=len(TARGET_COLUMNS),
        output_transform=None, input_dim=message_hidden_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers,
        dropout=dropout
    )
    batch_norm = True
    metric_list = [nn.metrics.MAE(), nn.metrics.R2Score()]
    model = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    ckpt_callback = ModelCheckpoint(
        save_top_k=0,
        save_last=True
    )

    if enable_logger:
        exp_name = f"chemprop_run_{run_idx}"
        logger = WandbLogger(
            project="admet-challenge",
            name=exp_name,
            prefix=f"{save_dir.stem}",
            save_dir=f"../wandb/{exp_name}"
        )
    else:
        logger = None

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=200,
        enable_progress_bar=False,
        callbacks=[ckpt_callback],
        default_root_dir=save_dir,
        logger=logger
    )

    try:
        trainer.fit(model, train_loader, val_loader)
    except Exception as e:
        if logger is not None:
            logger.finalize("failed")
            wandb.finish(exit_code=1)
        raise e
    else:
        if logger is not None:
            logger.finalize("success")

    return model

def predict(model, pred_dset, num_workers):
    pred_loader = data.build_dataloader(pred_dset, num_workers=num_workers, shuffle=False)
    
    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        enable_progress_bar=False
    )

    model.eval()
    preds = trainer.predict(model, pred_loader, return_predictions=True)
    preds = torch.cat(preds)

    return preds

In [6]:
MODEL_CONFIG = {
    "depth": 4,
    "ffn_hidden_dim": 500,
    "ffn_num_layers": 1,
    "message_hidden_dim": 1000,
    "dropout": 0.2
}

In [7]:
def train_and_eval(input_paths, save_dirs, run_idx):
    for input_path, save_dir in zip(input_paths, save_dirs):
        print(f"Training and predicting on {input_path}")
        input_df = pd.read_csv(input_path)
        train_dset, val_dset, pred_dset = prepare_data(input_df)
        model = train_model(MODEL_CONFIG, train_dset, val_dset, NUM_WORKERS, save_dir, run_idx)
        preds = predict(model, pred_dset, NUM_WORKERS)

        output_df = input_df.copy()
        output_df[["pred_" + t for t in TARGET_COLUMNS]] = preds
        save_dir.mkdir(parents=True, exist_ok=True)
        output_df.to_csv(save_dir / "predictions.csv", index=False)

        train_preds = extract_preds(output_df[input_df["split"] == "train"])
        train_refs = extract_refs(input_df[input_df["split"] == "train"])
        val_preds = extract_preds(output_df[input_df["split"] == "val"])
        val_refs = extract_refs(input_df[input_df["split"] == "val"])

        metrics = eval_admet(train_preds, train_refs)
        print("Train metrics:")
        print(json.dumps(metrics, indent=2))

        metrics = eval_admet(val_preds, val_refs)
        print("\nVal metrics:")
        print(json.dumps(metrics, indent=2))
    
    wandb.finish(exit_code=0)

## Run 0

In [8]:
input_paths = [Path(f'../data/asap/datasets/rnd_splits/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_0/split_{k}') for k in range(5)]
RUN_IDX = 0

In [9]:
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../data/asap/datasets/rnd_splits/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvladvin111[0m ([33mvladvin-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.24934580146981947,
    "r2": 0.7725539578721019
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.14027972797661595,
    "r2": 0.7859417225647726
  },
  "MLM": {
    "mean_absolute_error": 0.2213262028144005,
    "r2": 0.8007038226326904
  },
  "HLM": {
    "mean_absolute_error": 0.212618682984524,
    "r2": 0.7750497069335556
  },
  "LogD": {
    "mean_absolute_error": 0.27666164881640376,
    "r2": 0.9166590204103382
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2200464128123527,
    "macro_r2": 0.8101816460826917
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.45419421337664967,
    "r2": 0.2237504750256415
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2271353346828401,
    "r2": 0.4693530487277412
  },
  "MLM": {
    "mean_absolute_error": 0.3128545245774797,
    "r2": 0.5870099105461248
  },
  "HLM": {
    "mean_absolute_error": 0.36991042838739857,
    "r2": 0.21709282186170376
  },
  "

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2597171204692286,
    "r2": 0.7659776510220662
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.14035346872335325,
    "r2": 0.7707043996857273
  },
  "MLM": {
    "mean_absolute_error": 0.22020261260545207,
    "r2": 0.7822821667770143
  },
  "HLM": {
    "mean_absolute_error": 0.19832394134362274,
    "r2": 0.7932069513464481
  },
  "LogD": {
    "mean_absolute_error": 0.3714193066576171,
    "r2": 0.8537496324497851
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.23800328995985476,
    "macro_r2": 0.7931841602562082
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.40462325433754476,
    "r2": 0.35663543802280806
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1924591020089494,
    "r2": 0.4205129316470587
  },
  "MLM": {
    "mean_absolute_error": 0.38706234270826023,
    "r2": 0.4200991579122768
  },
  "HLM": {
    "mean_absolute_error": 0.35286920369671865,
    "r2": 0.4250274157433831
  },


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.22239809234134447,
    "r2": 0.8010811162812254
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.14508306489088602,
    "r2": 0.7616717931794632
  },
  "MLM": {
    "mean_absolute_error": 0.18794078301477876,
    "r2": 0.8612627619426025
  },
  "HLM": {
    "mean_absolute_error": 0.19922002038038833,
    "r2": 0.7940652741932519
  },
  "LogD": {
    "mean_absolute_error": 0.25605372717515346,
    "r2": 0.9239201983702174
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.20213913756051022,
    "macro_r2": 0.8284002287933522
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.41188724961656714,
    "r2": 0.410622246540841
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1933379131714535,
    "r2": 0.531979215463223
  },
  "MLM": {
    "mean_absolute_error": 0.4099410541752838,
    "r2": 0.19285531565742398
  },
  "HLM": {
    "mean_absolute_error": 0.36553200509293693,
    "r2": 0.21284057376483267
  },


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.19886306573587004,
    "r2": 0.8430739581497567
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.14166601172282145,
    "r2": 0.7832225861029515
  },
  "MLM": {
    "mean_absolute_error": 0.21002362077586564,
    "r2": 0.8217191246310506
  },
  "HLM": {
    "mean_absolute_error": 0.19743713744578043,
    "r2": 0.7895211403727999
  },
  "LogD": {
    "mean_absolute_error": 0.30480210972927774,
    "r2": 0.9055993503574271
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.21055838908192306,
    "macro_r2": 0.8286272319227972
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.28261446816765284,
    "r2": 0.611528765537125
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.21922521733119443,
    "r2": 0.3579301102233834
  },
  "MLM": {
    "mean_absolute_error": 0.35622374330322265,
    "r2": 0.3717045156913601
  },
  "HLM": {
    "mean_absolute_error": 0.33394541963498114,
    "r2": 0.41302280465186425
  }

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2025360460620371,
    "r2": 0.8368626139038552
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.12681379350238434,
    "r2": 0.8156879900339248
  },
  "MLM": {
    "mean_absolute_error": 0.197558901058282,
    "r2": 0.8494635447756685
  },
  "HLM": {
    "mean_absolute_error": 0.19031125405008548,
    "r2": 0.8263062312788162
  },
  "LogD": {
    "mean_absolute_error": 0.2234576534993889,
    "r2": 0.9507129151952978
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.18813552963443553,
    "macro_r2": 0.8558066590375125
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3533265182085088,
    "r2": 0.35187100498644586
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1914119530840047,
    "r2": 0.5412721876625946
  },
  "MLM": {
    "mean_absolute_error": 0.36125022152485914,
    "r2": 0.37399337255815346
  },
  "HLM": {
    "mean_absolute_error": 0.30024156225054494,
    "r2": 0.37048478261976137
  },
 

0,1
split_0-epoch,▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇████
split_0-train_loss_epoch,█▄▃▂▂▂▁▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,▇█▅▅▄▁▄▂█▂▂▂▁▂▂▁▁▂▁▁▁▁▁▃
split_0-val/mae,▂▄█▃▃▆▄▁▂▂▂▁▂▂▅▁▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▇▆▆▁▇███████████████████████████████████
split_0-val_loss,▆▄▅█▄▆▄█▂▄▂▁▄▃▂▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇████
split_1-train_loss_epoch,▇█▆▆▇▅▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
split_1-train_loss_step,▇▆█▅▃▃▃▃▆▂▂▁▂▂▂▂▂█▂▂▄▁▁▅
split_1-val/mae,▂▂▄▅▆█▆▃▃▄▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.10783
split_0-train_loss_step,0.26311
split_0-val/mae,0.35689
split_0-val/r2,0.75855
split_0-val_loss,0.22458
split_1-epoch,199.0
split_1-train_loss_epoch,0.12155
split_1-train_loss_step,0.38707
split_1-val/mae,0.36949


## Cleaning up + run 1

In [10]:
def clean_data(input_paths, save_dirs, output_dir, remove_worst_pct):
    smiles_to_remove = defaultdict(set)

    for input_path, save_dir in zip(input_paths, save_dirs):
        input_df = pd.read_csv(input_path)
        input_val_df = input_df[input_df["split"] == "val"]
        output_df = pd.read_csv(save_dir / "predictions.csv")
        output_val_df = output_df[input_df["split"] == "val"]

        for t in TARGET_COLUMNS:
            # Sort by absolute error
            notna_mask = input_val_df[t].notna()
            input_val_df = input_val_df[notna_mask]
            output_val_df = output_val_df[notna_mask]

            mae = np.abs(input_val_df[t] - output_val_df[f"pred_{t}"])
            sorted_idx = np.argsort(mae)[::-1]
            smiles_to_remove[t].update(
                input_val_df.iloc[sorted_idx[:int(remove_worst_pct * len(sorted_idx))]]["cxsmiles_std"].tolist()
            )

    for input_path in input_paths:
        input_df = pd.read_csv(input_path)
        for t in TARGET_COLUMNS:
            input_df.loc[input_df["cxsmiles_std"].isin(smiles_to_remove[t]) & (input_df["split"] == "train"), t] = np.nan

        input_df.to_csv(output_dir / input_path.name, index=False)

In [11]:
output_dir = Path("../output/asap/rnd_splits/chemprop/run_0/cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

In [12]:
clean_data(input_paths, save_dirs, output_dir, remove_worst_pct = 0.2)

In [13]:
pd.read_csv(input_paths[0])[TARGET_COLUMNS].isna().sum()

LogHLM            150
LogMLM            140
LogD               86
LogKSOL            74
LogMDR1-MDCKII     15
dtype: int64

In [14]:
pd.read_csv(output_dir / input_paths[0].name)[TARGET_COLUMNS].isna().sum()

LogHLM            189
LogMLM            173
LogD              116
LogKSOL            99
LogMDR1-MDCKII     40
dtype: int64

In [15]:
input_paths = [Path(f'../output/asap/rnd_splits/chemprop/run_0/cleaned/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_1/split_{k}') for k in range(5)]
RUN_IDX = "1_clean_worst_pct_0.2"

In [16]:
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../output/asap/rnd_splits/chemprop/run_0/cleaned/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.24015611684701654,
    "r2": 0.7277680487621945
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1534144232931043,
    "r2": 0.7221598660470849
  },
  "MLM": {
    "mean_absolute_error": 0.2871379576097833,
    "r2": 0.5646091415023183
  },
  "HLM": {
    "mean_absolute_error": 0.2782443940385389,
    "r2": 0.5469261018652077
  },
  "LogD": {
    "mean_absolute_error": 0.2991523738566474,
    "r2": 0.8950816081063567
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.25162105312901806,
    "macro_r2": 0.6913089532566324
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.367016727560697,
    "r2": 0.4084333295557705
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2208954740841836,
    "r2": 0.4583291930233465
  },
  "MLM": {
    "mean_absolute_error": 0.32447410513977104,
    "r2": 0.5392675434857483
  },
  "HLM": {
    "mean_absolute_error": 0.31839985537969734,
    "r2": 0.3832657549417898
  },
  "Lo

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.239763939454739,
    "r2": 0.718956939489979
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16254419235724782,
    "r2": 0.6610993841336104
  },
  "MLM": {
    "mean_absolute_error": 0.27290543295096886,
    "r2": 0.5547813315166519
  },
  "HLM": {
    "mean_absolute_error": 0.25975063128350007,
    "r2": 0.5585812406368915
  },
  "LogD": {
    "mean_absolute_error": 0.2956405307656272,
    "r2": 0.9048673559478665
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2461209453624166,
    "macro_r2": 0.6796572503449998
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3360772654228352,
    "r2": 0.46826859316157676
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.21372770360833493,
    "r2": 0.4977918329815898
  },
  "MLM": {
    "mean_absolute_error": 0.3578487288641999,
    "r2": 0.5228502619881628
  },
  "HLM": {
    "mean_absolute_error": 0.33441213505979733,
    "r2": 0.49423291118806245
  },
  "

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.23936657175396683,
    "r2": 0.7357898923761756
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.17470872390403855,
    "r2": 0.6336310064210433
  },
  "MLM": {
    "mean_absolute_error": 0.2741909687200981,
    "r2": 0.6294886394326302
  },
  "HLM": {
    "mean_absolute_error": 0.266847701113405,
    "r2": 0.52591974747608
  },
  "LogD": {
    "mean_absolute_error": 0.2257889774082495,
    "r2": 0.9405456052427525
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.23618058857995158,
    "macro_r2": 0.6930749781897363
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3985827886183202,
    "r2": 0.35381222087719144
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.19643224773741708,
    "r2": 0.5692178072998397
  },
  "MLM": {
    "mean_absolute_error": 0.38419072464981757,
    "r2": 0.2390758313166078
  },
  "HLM": {
    "mean_absolute_error": 0.35399984197532197,
    "r2": 0.3213320053374542
  },
  "L

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.23873198642278523,
    "r2": 0.7220947085170473
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1681929594764098,
    "r2": 0.6701224029138255
  },
  "MLM": {
    "mean_absolute_error": 0.2875885808570273,
    "r2": 0.5621061489877364
  },
  "HLM": {
    "mean_absolute_error": 0.2545136414043143,
    "r2": 0.5607239295233242
  },
  "LogD": {
    "mean_absolute_error": 0.2791234872453416,
    "r2": 0.9195677156953541
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.24563013108117565,
    "macro_r2": 0.6869229811274575
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3466090227340552,
    "r2": 0.38885276933383683
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.22923648777433067,
    "r2": 0.2642125136987167
  },
  "MLM": {
    "mean_absolute_error": 0.32128864770514737,
    "r2": 0.479287750573418
  },
  "HLM": {
    "mean_absolute_error": 0.32152775040593756,
    "r2": 0.4646923445418015
  },
  "

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2049501458182054,
    "r2": 0.7585603046925538
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.13932446158954812,
    "r2": 0.7315145823252982
  },
  "MLM": {
    "mean_absolute_error": 0.2791512257430599,
    "r2": 0.5498500714616518
  },
  "HLM": {
    "mean_absolute_error": 0.2791168777241887,
    "r2": 0.5456720301206758
  },
  "LogD": {
    "mean_absolute_error": 0.28398838515082997,
    "r2": 0.9196906461972821
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2373062192051664,
    "macro_r2": 0.7010575269594923
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.29527388987570835,
    "r2": 0.3205603413159307
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1933578214250906,
    "r2": 0.5561999529093957
  },
  "MLM": {
    "mean_absolute_error": 0.330339871319223,
    "r2": 0.5875092958078285
  },
  "HLM": {
    "mean_absolute_error": 0.2689108963354894,
    "r2": 0.5294454536041988
  },
  "Log

0,1
split_0-epoch,▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
split_0-train_loss_epoch,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,▇▃▄▃▃▃▂▃▁▂▂▅▂▂█▁▁▂▁▁▃▁▁▆
split_0-val/mae,▄█▇▄▆▂▃▄▂▂▂▂▂▁▁▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▁▄▂▆▇▇▇█▇▆███▇██████████████████████████
split_0-val_loss,▃▆█▃▃▃▆▁▄▂▃▂▂▂▂▄▃▂▂▂▂▁▁▃▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
split_1-train_loss_epoch,█▆▇▆▆▅▅▅▅▄▃▃▃▃▃▃▃▂▂▂▂▂▃▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁
split_1-train_loss_step,▆▄▃▄▄▄▃▂▆▂▂▁▂▂█▂▂▂▁▁▃▁▁▂
split_1-val/mae,▄▇▅█▅▃▅▂▃▂▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.11152
split_0-train_loss_step,0.46332
split_0-val/mae,0.34401
split_0-val/r2,0.75845
split_0-val_loss,0.22467
split_1-epoch,199.0
split_1-train_loss_epoch,0.09389
split_1-train_loss_step,0.13279
split_1-val/mae,0.35022


## Cleaning up + run 2

In [17]:
output_dir = Path("../output/asap/rnd_splits/chemprop/run_1/cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

In [18]:
clean_data(input_paths, save_dirs, output_dir, remove_worst_pct = 0.2)

In [19]:
input_paths = [Path(f'../output/asap/rnd_splits/chemprop/run_1/cleaned/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_2/split_{k}') for k in range(5)]
RUN_IDX = "2_clean_worst_pct_0.2"

In [20]:
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../output/asap/rnd_splits/chemprop/run_1/cleaned/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.24346469708077964,
    "r2": 0.7131768486305325
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15842682701434382,
    "r2": 0.6773955567462315
  },
  "MLM": {
    "mean_absolute_error": 0.3058049375590547,
    "r2": 0.46527251829799354
  },
  "HLM": {
    "mean_absolute_error": 0.267993828477452,
    "r2": 0.569569199686726
  },
  "LogD": {
    "mean_absolute_error": 0.3013011177852054,
    "r2": 0.8886175239581221
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.25539828158336714,
    "macro_r2": 0.6628063294639212
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.4006456045916631,
    "r2": 0.3450002614749901
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2434898010047162,
    "r2": 0.34817773525262485
  },
  "MLM": {
    "mean_absolute_error": 0.33359431687717434,
    "r2": 0.5150008190102775
  },
  "HLM": {
    "mean_absolute_error": 0.3319844486754105,
    "r2": 0.2717838677660136
  },
  "L

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2454548955976915,
    "r2": 0.6907670629360063
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16194189071765344,
    "r2": 0.6588450402124902
  },
  "MLM": {
    "mean_absolute_error": 0.29570088901290087,
    "r2": 0.4939481534996508
  },
  "HLM": {
    "mean_absolute_error": 0.2815622016162154,
    "r2": 0.46986338244226167
  },
  "LogD": {
    "mean_absolute_error": 0.28059066100981817,
    "r2": 0.9071803260675587
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2530501075908559,
    "macro_r2": 0.6441207930315935
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.35114818942470877,
    "r2": 0.4864082272532795
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.21829884748399783,
    "r2": 0.3995759875826008
  },
  "MLM": {
    "mean_absolute_error": 0.3774881402469697,
    "r2": 0.5323176519858472
  },
  "HLM": {
    "mean_absolute_error": 0.30583415203503134,
    "r2": 0.526827869020662
  },
  

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.23118985424088492,
    "r2": 0.7313856654611756
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16329314750233623,
    "r2": 0.6755545910157368
  },
  "MLM": {
    "mean_absolute_error": 0.2876429816178054,
    "r2": 0.5933415909727404
  },
  "HLM": {
    "mean_absolute_error": 0.2680435869908554,
    "r2": 0.516722104815347
  },
  "LogD": {
    "mean_absolute_error": 0.22792288968064603,
    "r2": 0.9342549713844119
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2356184920065056,
    "macro_r2": 0.6902517847298825
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.39503960558916795,
    "r2": 0.42926683055567094
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.18596303590285448,
    "r2": 0.6159194712627734
  },
  "MLM": {
    "mean_absolute_error": 0.3927535490065106,
    "r2": 0.18374885901038274
  },
  "HLM": {
    "mean_absolute_error": 0.34404719537536965,
    "r2": 0.3951809129436581
  },
 

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.24340159706790385,
    "r2": 0.6756498243255233
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16227727556321905,
    "r2": 0.6831073685549913
  },
  "MLM": {
    "mean_absolute_error": 0.2924313931602176,
    "r2": 0.5529260657521982
  },
  "HLM": {
    "mean_absolute_error": 0.27514881356809756,
    "r2": 0.5293073533901498
  },
  "LogD": {
    "mean_absolute_error": 0.24593859137107546,
    "r2": 0.9300150242132186
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.24383953414610268,
    "macro_r2": 0.6742011272472163
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3269344534760019,
    "r2": 0.45515596165625916
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.22588045580928393,
    "r2": 0.2997300518555833
  },
  "MLM": {
    "mean_absolute_error": 0.36924385144528776,
    "r2": 0.3560280653733774
  },
  "HLM": {
    "mean_absolute_error": 0.3691283084644681,
    "r2": 0.37389176175586847
  },

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2287156700593006,
    "r2": 0.6923940862089433
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15664005456949467,
    "r2": 0.6753168596390544
  },
  "MLM": {
    "mean_absolute_error": 0.2946000325031204,
    "r2": 0.4975313640715525
  },
  "HLM": {
    "mean_absolute_error": 0.2700407740982986,
    "r2": 0.5452323337784695
  },
  "LogD": {
    "mean_absolute_error": 0.24246397003427494,
    "r2": 0.9346670834396402
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.23849210025289785,
    "macro_r2": 0.6690283454275321
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2965531525227481,
    "r2": 0.5934587103305118
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.18132861378242054,
    "r2": 0.6173007017801453
  },
  "MLM": {
    "mean_absolute_error": 0.32030707243915624,
    "r2": 0.5523443437524773
  },
  "HLM": {
    "mean_absolute_error": 0.23201874473591116,
    "r2": 0.6056849768586514
  },
  

0,1
split_0-epoch,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▇▇▇▇▇▇▇▇██
split_0-train_loss_epoch,█▇▆▆▄▅▃▃▃▃▂▂▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,▄▃▇▂▃▅▂▂█▂▂▁▂▁▂▂▁▃▁▁▂▁▁▂
split_0-val/mae,█▇▃▃▂▃▄▂▃▄▄▃▂▂▂▄▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▆▃▆▁▄▇▅▇▅▇▅▇▇▇██▇▇█▇████████████████████
split_0-val_loss,▄█▇▅▂▂▃▂▃▄▄▁▂▂▆▂▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
split_1-train_loss_epoch,█▃▄▃▃▃▃▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-train_loss_step,▃▃█▂▂▂▂▂▃▂▁▂▁▁▂▁▁▁▁▁▂▁▁▁
split_1-val/mae,▂▃▆▃▃█▃▂▂▅▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.0836
split_0-train_loss_step,0.15101
split_0-val/mae,0.3661
split_0-val/r2,0.72699
split_0-val_loss,0.25393
split_1-epoch,199.0
split_1-train_loss_epoch,0.08088
split_1-train_loss_step,0.11246
split_1-val/mae,0.35316


## Cleaning up stero impure + run 1

In [21]:
def clean_data(input_paths, save_dirs, output_dir, remove_worst_pct):
    smiles_to_remove = defaultdict(set)

    for input_path, save_dir in zip(input_paths, save_dirs):
        input_df = pd.read_csv(input_path)
        input_val_df = input_df[input_df["split"] == "val"]
        output_df = pd.read_csv(save_dir / "predictions.csv")
        output_val_df = output_df[input_df["split"] == "val"]

        for t in TARGET_COLUMNS:
            # Sort by absolute error
            notna_mask = input_val_df[t].notna()
            input_val_df = input_val_df[notna_mask]
            output_val_df = output_val_df[notna_mask]

            mae = np.abs(input_val_df[t] - output_val_df[f"pred_{t}"])
            sorted_idx = np.argsort(mae)[::-1]
            smiles_to_remove[t].update(
                input_val_df.iloc[sorted_idx[:int(remove_worst_pct * len(sorted_idx))]]["cxsmiles_std"].tolist()
            )

    for input_path in input_paths:
        input_df = pd.read_csv(input_path)
        for t in TARGET_COLUMNS:
            input_df.loc[
                input_df["cxsmiles_std"].isin(smiles_to_remove[t]) & \
                    ~input_df["smiles_ext"].isna() & \
                    (input_df["split"] == "train"),
                t
            ] = np.nan

        input_df.to_csv(output_dir / input_path.name, index=False)

In [22]:
output_dir = Path("../output/asap/rnd_splits/chemprop/run_0/cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

In [23]:
clean_data(input_paths, save_dirs, output_dir, remove_worst_pct = 0.2)

In [24]:
pd.read_csv(input_paths[0])[TARGET_COLUMNS].isna().sum()

LogHLM            203
LogMLM            186
LogD              129
LogKSOL           107
LogMDR1-MDCKII     50
dtype: int64

In [25]:
tmp = pd.read_csv(input_paths[0])
tmp = tmp[tmp["smiles_ext"].isna()]
tmp[TARGET_COLUMNS].isna().sum()

LogHLM            103
LogMLM             85
LogD               67
LogKSOL            57
LogMDR1-MDCKII     32
dtype: int64

In [26]:
pd.read_csv(output_dir / input_paths[0].name)[TARGET_COLUMNS].isna().sum()

LogHLM            206
LogMLM            187
LogD              134
LogKSOL           107
LogMDR1-MDCKII     52
dtype: int64

In [27]:
tmp = pd.read_csv(output_dir / input_paths[0].name)
tmp = tmp[tmp["smiles_ext"].isna()]
tmp[TARGET_COLUMNS].isna().sum()

LogHLM            103
LogMLM             85
LogD               67
LogKSOL            57
LogMDR1-MDCKII     32
dtype: int64

In [28]:
input_paths = [Path(f'../output/asap/rnd_splits/chemprop/run_0/cleaned/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_1/split_{k}') for k in range(5)]
RUN_IDX = "1_clean_worst_pct_0.2_stereo_impure"

In [29]:
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../output/asap/rnd_splits/chemprop/run_0/cleaned/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.21632927197585794,
    "r2": 0.7294461362807467
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15007996537792595,
    "r2": 0.7143767832557743
  },
  "MLM": {
    "mean_absolute_error": 0.2922816246366181,
    "r2": 0.512002868460343
  },
  "HLM": {
    "mean_absolute_error": 0.2826572382615165,
    "r2": 0.5328559838451612
  },
  "LogD": {
    "mean_absolute_error": 0.23924801416231686,
    "r2": 0.9212684263972968
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.23611922288284704,
    "macro_r2": 0.6819900396478644
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.34918324683856716,
    "r2": 0.40315730186642784
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.22938675015512244,
    "r2": 0.4006908298752314
  },
  "MLM": {
    "mean_absolute_error": 0.3491726612623635,
    "r2": 0.5152997396276542
  },
  "HLM": {
    "mean_absolute_error": 0.344541862135246,
    "r2": 0.21959466204864186
  },
  

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2254630497212826,
    "r2": 0.7137079744943886
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16145584634618565,
    "r2": 0.6643658046468527
  },
  "MLM": {
    "mean_absolute_error": 0.2941516681266022,
    "r2": 0.5120290491750867
  },
  "HLM": {
    "mean_absolute_error": 0.2662975315536405,
    "r2": 0.5180148370129751
  },
  "LogD": {
    "mean_absolute_error": 0.25007193498385766,
    "r2": 0.9273841682604402
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2394880061463137,
    "macro_r2": 0.6671003667179487
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.33741185782485306,
    "r2": 0.42437636812382307
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.20475108683600546,
    "r2": 0.45212774608877504
  },
  "MLM": {
    "mean_absolute_error": 0.3809211992688627,
    "r2": 0.5004446594165916
  },
  "HLM": {
    "mean_absolute_error": 0.3232393982649962,
    "r2": 0.5528326862998493
  },
  

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.23700609736264228,
    "r2": 0.7093467328806708
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1667431391334219,
    "r2": 0.6256158638489431
  },
  "MLM": {
    "mean_absolute_error": 0.30877770531057763,
    "r2": 0.5562916001824039
  },
  "HLM": {
    "mean_absolute_error": 0.2971938161616641,
    "r2": 0.458058791228806
  },
  "LogD": {
    "mean_absolute_error": 0.2623995648504456,
    "r2": 0.9133256811781736
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2544240645637503,
    "macro_r2": 0.6525277338637995
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.40301672406482647,
    "r2": 0.393031262101718
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.19122622948686493,
    "r2": 0.5621169252746985
  },
  "MLM": {
    "mean_absolute_error": 0.4019877973703594,
    "r2": 0.2552873850044083
  },
  "HLM": {
    "mean_absolute_error": 0.3537648122347687,
    "r2": 0.3892462230744399
  },
  "Log

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.2767953331915166,
    "r2": 0.6301204039984402
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.16028745761682114,
    "r2": 0.663433321432642
  },
  "MLM": {
    "mean_absolute_error": 0.29833642665877813,
    "r2": 0.5471326171089472
  },
  "HLM": {
    "mean_absolute_error": 0.2769237734504908,
    "r2": 0.5264360197369312
  },
  "LogD": {
    "mean_absolute_error": 0.25114376276804534,
    "r2": 0.9280584517132433
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2526973507371304,
    "macro_r2": 0.6590361627980408
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3253866818640561,
    "r2": 0.492893794248499
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.21150653203036016,
    "r2": 0.37354413875784864
  },
  "MLM": {
    "mean_absolute_error": 0.3318158526468605,
    "r2": 0.45550338997885087
  },
  "HLM": {
    "mean_absolute_error": 0.31968172401595774,
    "r2": 0.46536534357509096
  },
  

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.24210467201511093,
    "r2": 0.7016332287647102
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15116976418184577,
    "r2": 0.6932880332035559
  },
  "MLM": {
    "mean_absolute_error": 0.314101288664142,
    "r2": 0.4758054160296268
  },
  "HLM": {
    "mean_absolute_error": 0.3148615278888134,
    "r2": 0.45912712842205194
  },
  "LogD": {
    "mean_absolute_error": 0.23585673976796012,
    "r2": 0.9387410778123851
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.25161879850357444,
    "macro_r2": 0.6537189768464661
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.3258917271420149,
    "r2": 0.447921992942513
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.20576545878646824,
    "r2": 0.5563936798379261
  },
  "MLM": {
    "mean_absolute_error": 0.31862421782253275,
    "r2": 0.5426390957304861
  },
  "HLM": {
    "mean_absolute_error": 0.2678625225794306,
    "r2": 0.551097278305885
  },
  "L

0,1
split_0-epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇███
split_0-train_loss_epoch,█▅▅▅▄▄▄▄▃▄▂▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,▇▅█▃▂▃▂▄▆▃▂█▂▂▂▁▁▄▁▂▃▁▁█
split_0-val/mae,▂▂▂▄▄▃█▄▄▂▂▂▂▁▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▇█▄▇▁████████▇██████████████████████████
split_0-val_loss,▁▂█▂▃▅▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇███
split_1-train_loss_epoch,█▆▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-train_loss_step,█▆▄▇▃▄▃▃▅▃▃▇▂▂▂▂▂▃▂▁▂▁▂▃
split_1-val/mae,▃▂█▄▄▄▂▂▂▃▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.09074
split_0-train_loss_step,0.57636
split_0-val/mae,0.33734
split_0-val/r2,0.76842
split_0-val_loss,0.21539
split_1-epoch,199.0
split_1-train_loss_epoch,0.08518
split_1-train_loss_step,0.15682
split_1-val/mae,0.35515


## Removing all stereo impure + run 0

In [30]:
input_paths = [Path(f'../data/asap/datasets/rnd_splits/split_{k}.csv') for k in range(5)]
output_dir = Path("../data/asap/datasets/rnd_splits/stereo_pure")
output_dir.mkdir(parents=True, exist_ok=True)

for input_path in input_paths:
    input_df = pd.read_csv(input_path)
    input_df = pd.concat([
        input_df[(input_df["split"] == "train") & input_df["smiles_ext"].isna()],
        input_df[input_df["split"] == "val"]
    ])
    input_df.to_csv(output_dir / input_path.name, index=False)

In [31]:
input_paths = [Path(f'../data/asap/datasets/rnd_splits/stereo_pure/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_0/stereo_pure/split_{k}') for k in range(5)]
RUN_IDX = "0_stereo_pure"
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../data/asap/datasets/rnd_splits/stereo_pure/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (3) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.12030690768355291,
    "r2": 0.9563927467311923
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.09372683182009414,
    "r2": 0.8877512309931737
  },
  "MLM": {
    "mean_absolute_error": 0.12718344824401792,
    "r2": 0.9416204741891571
  },
  "HLM": {
    "mean_absolute_error": 0.12545981667105635,
    "r2": 0.9356275576482681
  },
  "LogD": {
    "mean_absolute_error": 0.15688534669692702,
    "r2": 0.9730673835140098
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.12471247022312966,
    "macro_r2": 0.9388918786151603
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.5680448231825244,
    "r2": -0.09786599411764163
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.29273192821805477,
    "r2": 0.2424324464193509
  },
  "MLM": {
    "mean_absolute_error": 0.4911846075775767,
    "r2": 0.1361321479552149
  },
  "HLM": {
    "mean_absolute_error": 0.5211672928871471,
    "r2": -0.4134754428766063
  }

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (3) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.11660677789413042,
    "r2": 0.9591897634784995
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.10178770163604664,
    "r2": 0.8740749461495877
  },
  "MLM": {
    "mean_absolute_error": 0.11841434787636623,
    "r2": 0.9445401677974787
  },
  "HLM": {
    "mean_absolute_error": 0.13853918386000208,
    "r2": 0.922922121334044
  },
  "LogD": {
    "mean_absolute_error": 0.13979969267042414,
    "r2": 0.9809356262718117
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.12302954078739389,
    "macro_r2": 0.9363325250062843
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.4357920451919993,
    "r2": 0.2025251509514966
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2650045787792353,
    "r2": 0.14335606208567642
  },
  "MLM": {
    "mean_absolute_error": 0.5269354235940544,
    "r2": 0.020800870876167665
  },
  "HLM": {
    "mean_absolute_error": 0.5451256776020096,
    "r2": -0.24332393318536716
  }

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (3) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.14152796655820626,
    "r2": 0.9371535655117621
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.09380656610819096,
    "r2": 0.888514819396159
  },
  "MLM": {
    "mean_absolute_error": 0.18963686476001324,
    "r2": 0.8670893259528928
  },
  "HLM": {
    "mean_absolute_error": 0.15079814920057377,
    "r2": 0.9056831186521972
  },
  "LogD": {
    "mean_absolute_error": 0.21696616546975245,
    "r2": 0.9610252910927546
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.15854714241934734,
    "macro_r2": 0.9118932241211531
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.49318758265430235,
    "r2": 0.0911451892790337
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2699714709493397,
    "r2": 0.2192983815940759
  },
  "MLM": {
    "mean_absolute_error": 0.45964326788790866,
    "r2": -0.09534373750856018
  },
  "HLM": {
    "mean_absolute_error": 0.403706195131317,
    "r2": -0.03093494549489484
  }

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (3) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.1125530165610916,
    "r2": 0.9562336711686011
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.0848403883746084,
    "r2": 0.90891759236915
  },
  "MLM": {
    "mean_absolute_error": 0.0933274110996453,
    "r2": 0.9656599027784813
  },
  "HLM": {
    "mean_absolute_error": 0.10275760771433629,
    "r2": 0.951066384191527
  },
  "LogD": {
    "mean_absolute_error": 0.11379170615709464,
    "r2": 0.9873025748477414
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.10145402598135525,
    "macro_r2": 0.9538360250711001
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.4858513129826885,
    "r2": -0.1264214246580686
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.23480826473811126,
    "r2": 0.22987918666071872
  },
  "MLM": {
    "mean_absolute_error": 0.4350341699256539,
    "r2": 0.11532076174268113
  },
  "HLM": {
    "mean_absolute_error": 0.49753429991076276,
    "r2": -0.058802434348966814
  },


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (3) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.11477006558459803,
    "r2": 0.9576309482033944
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.0875738935878281,
    "r2": 0.9043522224232208
  },
  "MLM": {
    "mean_absolute_error": 0.1051815400347513,
    "r2": 0.9575025534695657
  },
  "HLM": {
    "mean_absolute_error": 0.09288331901201524,
    "r2": 0.9642531960825176
  },
  "LogD": {
    "mean_absolute_error": 0.14471215070102567,
    "r2": 0.9780445088373652
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.10902419378404367,
    "macro_r2": 0.9523566858032128
  }
}

Val metrics:
{
  "KSOL": {
    "mean_absolute_error": 0.4783119779500371,
    "r2": 0.1757673292914611
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.25400166066229396,
    "r2": 0.3244032566585551
  },
  "MLM": {
    "mean_absolute_error": 0.4151491230803711,
    "r2": 0.1434760611657272
  },
  "HLM": {
    "mean_absolute_error": 0.35107380090087575,
    "r2": 0.13784652405377185
  },
 

0,1
split_0-epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████
split_0-train_loss_epoch,█▅▄▄▄▃▃▃▂▂▂▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,█▄▂▂▂▁▁▁▁▁▁▁
split_0-val/mae,█▃▄▃▂▂▃▁▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▂▂▁▆▅▇▆▇▆▆▇▇█▇██████████████████████████
split_0-val_loss,█▃▂▂▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
split_1-train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-train_loss_step,█▅▃▂▂▂▁▁▂▂▁▁
split_1-val/mae,█▄▃▃▃▂▂▂▁▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.04666
split_0-train_loss_step,0.04502
split_0-val/mae,0.48808
split_0-val/r2,0.59129
split_0-val_loss,0.38015
split_1-epoch,199.0
split_1-train_loss_epoch,0.04955
split_1-train_loss_step,0.04722
split_1-val/mae,0.4988


## Cleaning up unstable predictions for stero impure + run 1

In [8]:
input_df = pd.read_csv("../data/asap/datasets/rnd_splits/split_0.csv")
input_df

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,,376.0,1.0,,8.5,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1 |&1:3|,300,|&1:3|,,,2.576341,0.977724,train
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,,375.0,-0.3,,0.9,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1 |&1:11|,249,|&1:11|,,,2.575188,0.278754,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,CC(C)NC[C@H](O)COC1=CC=CC2=CC=CC=C12 |&1:5|,25.5,,,63.0,,CC(C)NC[C@H](O)COc1cccc2ccccc12,CC(C)NC[C@H](O)COc1cccc2ccccc12 |&1:5|,22,|&1:5|,1.423246,1.806180,,,val
400,O=C(O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl,216.0,,,386.0,,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,380,,2.336460,2.587711,,,val
401,NCC1=CC(Cl)=CC(C(=O)NC2=CC=C3CNCC3=C2)=C1,,,2.0,,,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,303,,,,,,train
402,COC(=O)NC1=NC2=CC=C(C(=O)C3=CC=CC=C3)C=C2N1,,,2.9,,,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,166,,,,,,train


In [9]:
input_df["mol_idx"].nunique()

404

In [10]:
from rdkit import Chem
from rdkit.Chem import EnumerateStereoisomers
opts = EnumerateStereoisomers.StereoEnumerationOptions()
opts.onlyStereoGroups = True

In [11]:
def get_all_isomers(smiles: str) -> list[str]:
    mol = Chem.MolFromSmiles(smiles)
    enumerated = EnumerateStereoisomers.EnumerateStereoisomers(mol, opts)

    return [Chem.MolToSmiles(isomer) for isomer in enumerated]

def calculate_num_isomers(input_df: pd.DataFrame):
    input_df = input_df.copy()
    input_df["n_isomers"] = input_df["cxsmiles_std"].apply(lambda x: len(get_all_isomers(x)))
    
    return input_df

In [12]:
input_df = calculate_num_isomers(input_df)
input_df

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split,n_isomers
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val,1
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train,2
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train,2
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,,376.0,1.0,,8.5,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1,NC(=O)[C@H]1CCCN(C(=O)Cc2cccc3occc23)C1 |&1:3|,300,|&1:3|,,,2.576341,0.977724,train,2
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,,375.0,-0.3,,0.9,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1,Cc1cc(CC(=O)N2CCC[C@H](C(N)=O)C2)ccn1 |&1:11|,249,|&1:11|,,,2.575188,0.278754,train,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,CC(C)NC[C@H](O)COC1=CC=CC2=CC=CC=C12 |&1:5|,25.5,,,63.0,,CC(C)NC[C@H](O)COc1cccc2ccccc12,CC(C)NC[C@H](O)COc1cccc2ccccc12 |&1:5|,22,|&1:5|,1.423246,1.806180,,,val,2
400,O=C(O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl,216.0,,,386.0,,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,380,,2.336460,2.587711,,,val,1
401,NCC1=CC(Cl)=CC(C(=O)NC2=CC=C3CNCC3=C2)=C1,,,2.0,,,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,303,,,,,,train,1
402,COC(=O)NC1=NC2=CC=C(C(=O)C3=CC=CC=C3)C=C2N1,,,2.9,,,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,166,,,,,,train,1


In [13]:
input_df[(input_df["n_isomers"] > 1) & (input_df["smiles_ext"].isna())]

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split,n_isomers


In [14]:
tmp = input_df[~input_df["smiles_ext"].isna()]
tmp[(tmp["n_isomers"] > 1) & (tmp["smiles_ext"].str.contains("a"))]

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split,n_isomers


In [15]:
tmp["n_isomers"].value_counts()

n_isomers
2    170
1     20
4      9
Name: count, dtype: int64

In [16]:
tmp[tmp["smiles_ext"].str.contains("a")]

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split,n_isomers
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val,1
51,O=C(CC1=CN=CC2=CC=CC=C12)N1CC[C@@H](OC2=CC=C(F...,,324.0,,,21.7,O=C(Cc1cncc2ccccc12)N1CC[C@@H](Oc2ccc(F)cc2)C1,O=C(Cc1cncc2ccccc12)N1CC[C@@H](Oc2ccc(F)cc2)C1...,326,|a:16|,,,2.511883,1.356026,train,1
175,CC1=CC(C2=NOC(C(F)(F)F)=N2)=CC=C1OCCCC1=CC(C(=...,,12.0,4.4,13.0,2.09,Cc1cc(-c2noc(C(F)(F)F)n2)ccc1OCCCc1cc(C(=O)N2C...,Cc1cc(-c2noc(C(F)(F)F)n2)ccc1OCCCc1cc(C(=O)N2C...,231,|a:28|,,1.146128,1.113943,0.489958,train,1
193,CC1=CC(C2=NOC(C(F)(F)F)=N2)=CC=C1OCCCC1=CC(C(=...,,13.0,4.4,12.3,2.47,Cc1cc(-c2noc(C(F)(F)F)n2)ccc1OCCCc1cc(C(=O)N2C...,Cc1cc(-c2noc(C(F)(F)F)n2)ccc1OCCCc1cc(C(=O)N2C...,232,|a:28|,,1.123852,1.146128,0.540329,train,1
195,O=C1NC(=O)[C@@H](CC2=CC=C3OCOC3=C2)C2=CC=CC=C1...,32.4,263.0,2.4,82.7,6.56,O=C1NC(=O)[C@@H](Cc2ccc3c(c2)OCO3)c2ccccc21,O=C1NC(=O)[C@@H](Cc2ccc3c(c2)OCO3)c2ccccc21 |a:5|,391,|a:5|,1.523746,1.922725,2.421604,0.878522,val,1
225,NCC1=CC=CC(NC(=O)[C@@H](NC(=O)OCC2=CC=CC=C2)C2...,11.0,23.0,3.2,1.0,1.53,NCc1cccc(NC(=O)[C@@H](NC(=O)OCc2ccccc2)c2ccc(O...,NCc1cccc(NC(=O)[C@@H](NC(=O)OCc2ccccc2)c2ccc(O...,308,|a:10|,1.079181,0.30103,1.380211,0.403121,train,1
241,C[C@H]1CN(C2=CN=CC3=CC=CC=C23)C(=O)[C@@]12CN(C...,19.0,,1.2,57.0,1.0,C[C@H]1CN(c2cncc3ccccc23)C(=O)[C@@]12CN(Cc1ccn...,C[C@H]1CN(c2cncc3ccccc23)C(=O)[C@@]12CN(Cc1ccn...,214,"|a:1,16|",1.30103,1.763428,,0.30103,train,1
242,CNC(=O)CN1C[C@@]2(C(=O)N(C3=CN=CC4=CC=CC=C34)C...,70.0,390.0,2.1,75.0,0.5,CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H...,CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H...,115,"|a:7,22|",1.851258,1.880814,2.592177,0.176091,train,1
243,CNC(=O)CN1C[C@@]2(C(=O)N(C3=CN=CC4=CC=CC=C34)C...,39.0,397.0,1.8,77.0,0.6,CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H...,CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H...,120,"|a:7,22|",1.60206,1.892095,2.599883,0.20412,train,1
244,C[C@H]1CN(C2=CN=CC3=CC=CC=C23)C(=O)[C@@]12CN(C...,237.0,376.0,1.8,237.0,1.33,C[C@H]1CN(c2cncc3ccccc23)C(=O)[C@@]12CN(CCN1CC...,C[C@H]1CN(c2cncc3ccccc23)C(=O)[C@@]12CN(CCN1CC...,206,"|a:1,16|",2.376577,2.376577,2.576341,0.367356,train,1


In [17]:
def enumerate_isomers(input_df: pd.DataFrame):
    input_df = input_df.copy()
    # we don't want to find isomers for compounds with absolute stereochemistry
    # so exlude them and just leave the original compound as the only isomer
    input_df["cxsmiles_std_isomer"] = input_df["cxsmiles_std"]
    input_df.loc[input_df["smiles_ext"].isna(), "smiles_ext"] = ""
    stereo_impure_mask = ~input_df["smiles_ext"].isna() & ~input_df["smiles_ext"].str.contains("a")
    input_df.loc[stereo_impure_mask, "cxsmiles_std_isomer"] = \
        input_df.loc[stereo_impure_mask, "cxsmiles_std"].apply(lambda x: get_all_isomers(x))
    
    input_df = input_df.explode("cxsmiles_std_isomer")
    
    return input_df

In [18]:
enumerate_isomers(input_df)

Unnamed: 0,smiles,HLM,KSOL,LogD,MLM,MDR1-MDCKII,smiles_std,cxsmiles_std,mol_idx,smiles_ext,LogHLM,LogMLM,LogKSOL,LogMDR1-MDCKII,split,n_isomers,cxsmiles_std_isomer
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,,2.0,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|,191,|a:16|,,,,0.477121,val,1,COc1cccc(Cl)c1NC(=O)N1CCC[C@H](C(N)=O)C1 |a:16|
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train,2,O=C(NCC(F)F)[C@@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)...
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,,0.2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...,335,|&1:7|,,,2.523746,0.079181,train,2,O=C(NCC(F)F)[C@H](Nc1cc2c(cc1Br)CNC2)c1cc(Cl)c...
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train,2,O=C(NCC(F)F)[C@@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc...
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,,0.5,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...,336,|&1:7|,,,,0.176091,train,2,O=C(NCC(F)F)[C@H](Nc1ccc2c(c1)CNC2)c1cc(Br)cc2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,O=C(O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl,216.0,,,386.0,,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,380,,2.33646,2.587711,,,val,1,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl
401,NCC1=CC(Cl)=CC(C(=O)NC2=CC=C3CNCC3=C2)=C1,,,2.0,,,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1,303,,,,,,train,1,NCc1cc(Cl)cc(C(=O)Nc2ccc3c(c2)CNC3)c1
402,COC(=O)NC1=NC2=CC=C(C(=O)C3=CC=CC=C3)C=C2N1,,,2.9,,,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1,166,,,,,,train,1,COC(=O)Nc1nc2ccc(C(=O)c3ccccc3)cc2[nH]1
403,CC1=NC=CN1C[C@H]1CCC2=C(C1=O)C1=CC=CC=C1N2C |&...,,127.0,,,,Cc1nccn1C[C@H]1CCc2c(c3ccccc3n2C)C1=O,Cc1nccn1C[C@H]1CCc2c(c3ccccc3n2C)C1=O |&1:7|,268,|&1:7|,,,2.107210,,train,2,Cc1nccn1C[C@@H]1CCc2c(c3ccccc3n2C)C1=O


In [19]:
PROPERTY2STD = {
    "LogHLM": 217.0507262281013,
    "LogMLM": 318.4791749144294,
    "LogD": 1.2280921175245259,
    "LogKSOL": 157.6780176767439,
    "LogMDR1-MDCKII": 6.433744322214307
}

def clean_data(input_paths, output_dir):
    mols_to_remove = defaultdict(set)

    for input_path in input_paths:
        print(f"Training and predicting on {input_path}")
        input_df = pd.read_csv(input_path)
        input_df["cxsmiles_std_isomer"] = input_df["cxsmiles_std"]
        input_df = pd.concat([
            input_df[input_df["split"] == "train"],
            enumerate_isomers(input_df[input_df["split"] == "val"])
        ]).reset_index(drop=True)
        train_dset, val_dset, pred_dset = prepare_data(input_df, smiles_column_for_pred="cxsmiles_std_isomer")

        save_dir = output_dir / f"{input_path.stem}"
        model = train_model(MODEL_CONFIG, train_dset, val_dset, NUM_WORKERS, save_dir, run_idx=None, enable_logger=False)
        preds = predict(model, pred_dset, NUM_WORKERS)

        output_df = input_df.copy()
        output_df[["pred_" + t for t in TARGET_COLUMNS]] = preds
        save_dir.mkdir(parents=True, exist_ok=True)
        output_df.to_csv(save_dir / "predictions.csv", index=False)

        output_val_df = output_df[input_df["split"] == "val"].reset_index(drop=True)
        for t in TARGET_COLUMNS:
            if t in ["LogHLM", "LogMLM", "LogKSOL", "LogMDR1-MDCKII"]:
                output_val_df[f"diff_{t}"] = output_val_df.groupby("mol_idx")[f"pred_{t}"].transform(
                    lambda x: np.power(10, x.max()) - np.power(10, x.min())
                )
            else:
                output_val_df[f"diff_{t}"] = output_val_df.groupby("mol_idx")[f"pred_{t}"].transform(
                    lambda x: x.max() - x.min()
                )
            if t == "LogHLM":
                output_val_df[f"noisy_{t}"] = output_val_df[f"diff_{t}"] > 0.2 * PROPERTY2STD[t]
            elif t == "LogMLM":
                output_val_df[f"noisy_{t}"] = output_val_df[f"diff_{t}"] > 0.2 * PROPERTY2STD[t]
            elif t == "LogD":
                output_val_df[f"noisy_{t}"] = output_val_df[f"diff_{t}"] > 0.2  # absolute error
            elif t == "LogKSOL":
                output_val_df[f"noisy_{t}"] = output_val_df[f"diff_{t}"] > 0.2 * PROPERTY2STD[t]
            elif t == "LogMDR1-MDCKII":
                output_val_df[f"noisy_{t}"] = output_val_df[f"diff_{t}"] > 0.2 * PROPERTY2STD[t]
            
            mols_to_remove[t].update(
                output_val_df[output_val_df[f"noisy_{t}"]]["mol_idx"].tolist()
            )

    for t in TARGET_COLUMNS:
        print(f"[{input_path.stem}] Removing {len(mols_to_remove[t])} noisy mols out of each trainig set for {t}")
    
    for input_path in input_paths:
        input_df = pd.read_csv(input_path)
        for t in TARGET_COLUMNS:
            input_df.loc[
                input_df["mol_idx"].isin(mols_to_remove[t]) & \
                    (input_df["split"] == "train"),
                t
            ] = np.nan

        input_df.to_csv(output_dir / input_path.name, index=False)

In [20]:
input_paths = [Path(f'../data/asap/datasets/rnd_splits/split_{k}.csv') for k in range(5)]
output_dir = Path("../output/asap/rnd_splits/chemprop/run_0/cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

In [21]:
clean_data(input_paths, output_dir)

Training and predicting on ../data/asap/datasets/rnd_splits/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` a

Training and predicting on ../data/asap/datasets/rnd_splits/split_1.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | trai

Training and predicting on ../data/asap/datasets/rnd_splits/split_2.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | trai

Training and predicting on ../data/asap/datasets/rnd_splits/split_3.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | trai

Training and predicting on ../data/asap/datasets/rnd_splits/split_4.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | trai

[split_4] Removing 22 noisy mols out of each trainig set for LogHLM
[split_4] Removing 18 noisy mols out of each trainig set for LogMLM
[split_4] Removing 37 noisy mols out of each trainig set for LogD
[split_4] Removing 101 noisy mols out of each trainig set for LogKSOL
[split_4] Removing 23 noisy mols out of each trainig set for LogMDR1-MDCKII


In [22]:
pd.read_csv(input_paths[0])[TARGET_COLUMNS].isna().sum()

LogHLM            150
LogMLM            140
LogD               86
LogKSOL            74
LogMDR1-MDCKII     15
dtype: int64

In [23]:
pd.read_csv(output_dir / input_paths[0].name)[TARGET_COLUMNS].isna().sum()

LogHLM            155
LogMLM            143
LogD              101
LogKSOL           135
LogMDR1-MDCKII     37
dtype: int64

In [24]:
input_paths = [Path(f'../output/asap/rnd_splits/chemprop/run_0/cleaned/split_{k}.csv') for k in range(5)]
save_dirs = [Path(f'../output/asap/rnd_splits/chemprop/run_1/split_{k}') for k in range(5)]
RUN_IDX = "1_clean_noisy_stereo_impure"

In [25]:
train_and_eval(input_paths, save_dirs, RUN_IDX)

Training and predicting on ../output/asap/rnd_splits/chemprop/run_0/cleaned/split_0.csv


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvladvin111[0m ([33mvladvin-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 2.2 M  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 2.0 K  | trai

Train metrics:
{
  "LogD": {
    "mean_absolute_error": 0.41157330864470854,
    "r2": 0.7793256730035836
  },
  "MLM": {
    "mean_absolute_error": 0.2274635736890114,
    "r2": 0.7692358810553236
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15861471095255583,
    "r2": 0.7039743137709552
  },
  "KSOL": {
    "mean_absolute_error": 0.30302947491732396,
    "r2": 0.6639689553479851
  },
  "HLM": {
    "mean_absolute_error": 0.22111798235844082,
    "r2": 0.7415076219426853
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.2643598101124081,
    "macro_r2": 0.7316024890241065
  }
}

Val metrics:
{
  "LogD": {
    "mean_absolute_error": 0.61606073803589,
    "r2": 0.6217383236856941
  },
  "MLM": {
    "mean_absolute_error": 0.3510808179304693,
    "r2": 0.48925999458340796
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.2548134638389187,
    "r2": 0.38469622915485413
  },
  "KSOL": {
    "mean_absolute_error": 0.519244693145987,
    "r2": 0.08830698591379915
  },
  "

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "LogD": {
    "mean_absolute_error": 0.2630360685667858,
    "r2": 0.9162161709305012
  },
  "MLM": {
    "mean_absolute_error": 0.2071444701859941,
    "r2": 0.8206599253308432
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.15387229231635954,
    "r2": 0.7103173968846648
  },
  "KSOL": {
    "mean_absolute_error": 0.27647263583003473,
    "r2": 0.6947764461448322
  },
  "HLM": {
    "mean_absolute_error": 0.1892485309606936,
    "r2": 0.805741035686721
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.21795479957197356,
    "macro_r2": 0.7895421949955124
  }
}

Val metrics:
{
  "LogD": {
    "mean_absolute_error": 0.5447231742739678,
    "r2": 0.6625823624015863
  },
  "MLM": {
    "mean_absolute_error": 0.3500520323848197,
    "r2": 0.5204176868102546
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.20249838406466078,
    "r2": 0.32469892370304343
  },
  "KSOL": {
    "mean_absolute_error": 0.43212247833779316,
    "r2": 0.26626116679035916
  },
 

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "LogD": {
    "mean_absolute_error": 0.24629208330002403,
    "r2": 0.9213291204314001
  },
  "MLM": {
    "mean_absolute_error": 0.2226628849942504,
    "r2": 0.7921801794411683
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1594922086738758,
    "r2": 0.6966986002336822
  },
  "KSOL": {
    "mean_absolute_error": 0.26707413667271906,
    "r2": 0.7085725358388657
  },
  "HLM": {
    "mean_absolute_error": 0.20348018369443777,
    "r2": 0.7789510627362493
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.21980029946706142,
    "macro_r2": 0.7795462997362732
  }
}

Val metrics:
{
  "LogD": {
    "mean_absolute_error": 0.5609929378962878,
    "r2": 0.670569639483211
  },
  "MLM": {
    "mean_absolute_error": 0.4419827322587787,
    "r2": 0.05836532509549319
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1941678700516716,
    "r2": 0.5386003487941138
  },
  "KSOL": {
    "mean_absolute_error": 0.4325733212981746,
    "r2": 0.28242950451161664
  },
  

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "LogD": {
    "mean_absolute_error": 0.33838355030188577,
    "r2": 0.8864517752152922
  },
  "MLM": {
    "mean_absolute_error": 0.2367176323636466,
    "r2": 0.7737339550332083
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1537083369722694,
    "r2": 0.7443691982824767
  },
  "KSOL": {
    "mean_absolute_error": 0.2864583648744494,
    "r2": 0.7008398852032374
  },
  "HLM": {
    "mean_absolute_error": 0.21636523087618673,
    "r2": 0.752404458574051
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.24632662307768757,
    "macro_r2": 0.7715598544616531
  }
}

Val metrics:
{
  "LogD": {
    "mean_absolute_error": 0.5168429049299313,
    "r2": 0.6808673873374208
  },
  "MLM": {
    "mean_absolute_error": 0.39349699742893035,
    "r2": 0.2770419730072251
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.22926762735587358,
    "r2": 0.28365381248197274
  },
  "KSOL": {
    "mean_absolute_error": 0.3908191409532371,
    "r2": 0.39676522672831727
  },
 

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/opt/conda/envs/admet/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (6) is smaller than the logging interval Trainer(log_every_

Train metrics:
{
  "LogD": {
    "mean_absolute_error": 0.3109755268944389,
    "r2": 0.9034395116859321
  },
  "MLM": {
    "mean_absolute_error": 0.19381306028427364,
    "r2": 0.8542786751941044
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1429759592707131,
    "r2": 0.7493253676984301
  },
  "KSOL": {
    "mean_absolute_error": 0.23492245945313914,
    "r2": 0.7530365422636985
  },
  "HLM": {
    "mean_absolute_error": 0.2132007834081363,
    "r2": 0.782922044336043
  },
  "aggregated": {
    "macro_mean_absolute_error": 0.21917755786214022,
    "macro_r2": 0.8086004282356416
  }
}

Val metrics:
{
  "LogD": {
    "mean_absolute_error": 0.521813886816303,
    "r2": 0.6654304881392514
  },
  "MLM": {
    "mean_absolute_error": 0.33753299478560805,
    "r2": 0.43068289468435184
  },
  "MDR1-MDCKII": {
    "mean_absolute_error": 0.1839600434547671,
    "r2": 0.6132509883864026
  },
  "KSOL": {
    "mean_absolute_error": 0.3648529207485866,
    "r2": 0.23961803119276104
  },
  "

0,1
split_0-epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
split_0-train_loss_epoch,█▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▃▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁
split_0-train_loss_step,▆▄█▄▃▅▃▃▁▃▂▅▃▂▅▂▂▂▂▂▄▂▂▄
split_0-val/mae,▆▄▄▃▂▃▄█▃▂▃▆▄▃▂▂▁▁▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_0-val/r2,▄▆▆▆▇▄▆▁▇███▇███████████████████████████
split_0-val_loss,▅▄▃█▅▆▆▃▇▂▄▂▃▂▂▂▂▂▂▄▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
split_1-epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇█
split_1-train_loss_epoch,█▇▇▆▇▇▇▇▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▃▃▂▂▂▂▂▁▂▂▁▁▂▁▁▁▁
split_1-train_loss_step,▇▄▄▆▃▃▂▃▅▂▂▁▂▂█▂▁▃▁▂▃▁▁▂
split_1-val/mae,▄▇█▄▇▂▄▄▂▂▂▂▁▁▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
split_0-epoch,199.0
split_0-train_loss_epoch,0.13923
split_0-train_loss_step,0.37768
split_0-val/mae,0.4097
split_0-val/r2,0.68362
split_0-val_loss,0.29427
split_1-epoch,199.0
split_1-train_loss_epoch,0.10067
split_1-train_loss_step,0.14334
split_1-val/mae,0.36942
