In [1]:
from ToxicMl.trainer import GenericClassificationTrainer
from ToxicMl.metrics import Accuracy, F1, Recall, Precision
from ToxicMl.dataset import HivDataset
from torch_geometric.loader import ImbalancedSampler


from pathlib import Path

EPOCHS = 8

root = Path("dataset/ogbg_molhiv_custom")
dataset = HivDataset(root)
split_idx = dataset.get_idx_split() 
train = dataset[split_idx["train"]]
validation = dataset[split_idx["valid"]]
test = dataset[split_idx["test"]]

train_metrics = [F1(), Accuracy(), Recall(), Precision()]
validation_metrics = [F1(), Accuracy(), Recall(), Precision()]

In [2]:
from ToxicMl.MLmodels.attention import ChemAttentionBlock
import torch
import torch.nn.functional as F

from torch.nn import Linear, ReLU
from torch_geometric.data import Data
from torch_geometric.nn.pool import global_max_pool

from torch.nn import BatchNorm1d

class ChemAttentionDescriptors(torch.nn.Module):
    def __init__(self, embedder_depth, in_channels, hidden_channels, out_channels, in_descriptors):
        super().__init__()
        self.in_descriptors = in_descriptors
        self.encoder = torch.nn.ModuleList([
            ChemAttentionBlock(in_channels=in_channels, out_channels=hidden_channels)
            if i == 0
            else ChemAttentionBlock(in_channels=hidden_channels, out_channels=hidden_channels)
            for i in range(embedder_depth)])
        self.norm1 = BatchNorm1d(hidden_channels+in_descriptors)
        self.linear1 = Linear(hidden_channels+in_descriptors, 64)
        self.relu1 = ReLU(inplace=True)
        self.linear2 = Linear(64, 32)
        self.relu2 = ReLU(inplace=True)
        self.linear3 = Linear(32, out_channels)
        self.relu3 = ReLU(inplace=True)
        

    def forward(self, data: Data):
        for module in self.encoder:
            data = module(data)
        x = global_max_pool(data.x, data.batch)


        d = data.descriptors.view(-1, self.in_descriptors)
        x = torch.cat([x,d], dim=1)
        x = self.norm1(x)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.relu3(x)
        return F.softmax(x)
    

In [3]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(3, 133, 16, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 3-16, sampling, custom dataset, descriptors")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcus-tibor[0m ([33mcus-tibor-none[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:09<00:00, 106.75it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 116.10it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:09<00:00, 103.99it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 213.94it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:10<00:00, 101.41it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 203.58it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:11<00:00, 89.72it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 182.57it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:11<00:00, 86.15it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 170.45it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 73.64it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 159.50it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 78.20it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 153.80it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:12<00:00, 83

0,1
train/Accuracy,▁▇▇██▇██
train/F1,▁▇▇▇█▇██
train/Precision,▁▄▆▇▇▆▇█
train/Recall,▁█▇██▇██
train/loss,▆▁▂▄▄▆█▇
validation/Accuracy,▂▇█▅▆▄▆▁
validation/F1,▂▇█▅▆▄▆▁
validation/Precision,▁██▁▁█▁▁
validation/Recall,▂▇█▅▆▄▆▁
validation/loss,▅▄█▃▁▃▆▃

0,1
train/Accuracy,0.789
train/F1,0.7707
train/Precision,0.70959
train/Recall,0.84333
train/loss,0.77187
validation/Accuracy,0.8337
validation/F1,0.11628
validation/Precision,0.55556
validation/Recall,0.06494
validation/loss,0.73039


In [4]:
model = ChemAttentionDescriptors(3, 133, 32, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 3-32, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:11<00:00, 89.94it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 167.24it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 76.41it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 148.07it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 77.29it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 171.98it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:12<00:00, 79.89it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 151.33it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:12<00:00, 81.18it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 137.54it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 73.09it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 132.21it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 76.97it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 149.59it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 77.45

0,1
train/Accuracy,▁▇▇▇████
train/F1,▁▇▇▇████
train/Precision,▁▇▇▇██▇█
train/Recall,▁▇█████▇
train/loss,▅▄▂▃▂█▅▁
validation/Accuracy,▂█▁▄▆▃▃▂
validation/F1,▄█▁▇▇▃▃▃
validation/Precision,█▁▅█▅▃▅▆
validation/Recall,▄█▁▆▇▃▃▃
validation/loss,▅▁▅▂█▂▅▁

0,1
train/Accuracy,0.8114
train/F1,0.80064
train/Precision,0.76171
train/Recall,0.84377
train/loss,0.31576
validation/Accuracy,0.8468
validation/F1,0.14634
validation/Precision,0.66667
validation/Recall,0.08219
validation/loss,0.39953


In [5]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(3, 133, 64, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 3-64, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 67.80it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 147.93it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 70.32it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 168.98it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:16<00:00, 62.51it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 135.44it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:16<00:00, 63.70it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 146.50it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 65.53it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 126.31it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 69.16it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 156.58it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:16<00:00, 62.81it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 155.94it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 68.68

0,1
train/Accuracy,▁▇▇█████
train/F1,▁▇▇█████
train/Precision,▁▇██████
train/Recall,▁▇▇█████
train/loss,▁█▆▅▃▂▆▄
validation/Accuracy,▃▅█▇█▇▅▁
validation/F1,▂▄█▃█▆▅▁
validation/Precision,▆▅▅▁▅▅▆█
validation/Recall,▂▄█▄█▆▅▁
validation/loss,▅▁▃▃▄▅█▂

0,1
train/Accuracy,0.8015
train/F1,0.78812
train/Precision,0.74023
train/Recall,0.84264
train/loss,0.61363
validation/Accuracy,0.8014
validation/F1,0.11292
validation/Precision,0.64198
validation/Recall,0.0619
validation/loss,0.43952


In [6]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(5, 133, 16, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 5-16, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:13<00:00, 75.64it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 141.40it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 67.10it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 148.96it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 67.20it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 145.36it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 68.37it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 144.13it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 67.84it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 152.25it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 68.89it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 152.90it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 68.97it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 127.53it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:14<00:00, 68.95

0,1
train/Accuracy,▁▇███▇██
train/F1,▁▇███▇██
train/Precision,▁▆█▇▇▇█▇
train/Recall,▁▆▇▇█▆▇▇
train/loss,▂▃▁▄▃▃█▁
validation/Accuracy,█▁▅▇█▄▅▇
validation/F1,█▁▃▇▇▃▃█
validation/Precision,▃█▁▃▁▅▁▅
validation/Recall,█▁▃▇▇▃▃█
validation/loss,▇▂▆▃█▂▂▁

0,1
train/Accuracy,0.8107
train/F1,0.80118
train/Precision,0.75972
train/Recall,0.84742
train/loss,0.37504
validation/Accuracy,0.8519
validation/F1,0.13617
validation/Precision,0.59259
validation/Recall,0.07692
validation/loss,0.36669


In [7]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(5, 133, 32, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 5-32, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:15<00:00, 66.45it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 137.46it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:16<00:00, 61.06it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 137.47it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:17<00:00, 60.51it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 143.42it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:16<00:00, 62.50it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 136.22it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:17<00:00, 59.62it/s]
val epoch: 100%|██████████| 129/129.0 [00:00<00:00, 129.97it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:19<00:00, 51.71it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 123.46it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:20<00:00, 49.66it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 107.61it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:19<00:00, 52.97

0,1
train/Accuracy,▁▇█▇▇█▇█
train/F1,▁▇█▇▇█▇█
train/Precision,▁▇█▇▇███
train/Recall,▁▇█▇▇▇▇█
train/loss,▂▁█▆▄▄▁▂
validation/Accuracy,▁█▆▆▇▇▇▇
validation/F1,▁█▇▅▇▆▆▇
validation/Precision,█▁▅▂▂▂▁▃
validation/Recall,▁█▇▅▇▆▆▇
validation/loss,█▁▅▄▆█▇█

0,1
train/Accuracy,0.8103
train/F1,0.80045
train/Precision,0.76272
train/Recall,0.8421
train/loss,0.42448
validation/Accuracy,0.8335
validation/F1,0.13181
validation/Precision,0.64198
validation/Recall,0.07345
validation/loss,0.4771


In [8]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(5, 133, 64, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 5-64, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:24<00:00, 42.84it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 110.12it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:22<00:00, 46.13it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 97.62it/s] 
train epoch: 100%|██████████| 1029/1029.0 [00:22<00:00, 45.53it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 105.03it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:22<00:00, 44.86it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 109.31it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:21<00:00, 47.38it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 115.88it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:21<00:00, 47.20it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 106.05it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:23<00:00, 43.99it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 113.24it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:21<00:00, 48.53

0,1
train/Accuracy,▁▇██████
train/F1,▁▇██████
train/Precision,▁▇██████
train/Recall,▁▇██▇█▇█
train/loss,▅▁█▂▄▆▁▄
validation/Accuracy,▁▅▇▃█▁▆▄
validation/F1,▁▄▅▅▇▂█▃
validation/Precision,▄▂▁█▃▅▆▄
validation/Recall,▁▄▆▅█▂█▃
validation/loss,█▃▄▄▄▁▄▅

0,1
train/Accuracy,0.7982
train/F1,0.78597
train/Precision,0.74595
train/Recall,0.83053
train/loss,0.53728
validation/Accuracy,0.8288
validation/F1,0.12
validation/Precision,0.59259
validation/Recall,0.06676
validation/loss,0.47794


In [9]:
from ToxicMl.MLmodels.attention import ChemAttention
model = ChemAttentionDescriptors(5, 133, 128, 2, 202)
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.09)
loss_fn = torch.nn.CrossEntropyLoss()
sampler = ImbalancedSampler(train) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = GenericClassificationTrainer(
    model,
    optimizer,
    loss_fn,
    lr_scheduler,
    train_metrics,
    validation_metrics,
    train,
    validation,
    test,
    device,
    sampler,
    EPOCHS,
    None,
    32
)

trainer.train("HIV ChemAttention 5-128, sampling, custom dataset, descriptors")



  return F.softmax(x)
train epoch: 100%|██████████| 1029/1029.0 [00:28<00:00, 36.23it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 77.20it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:30<00:00, 33.83it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 85.39it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:30<00:00, 34.25it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 88.63it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:30<00:00, 33.43it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 72.29it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:31<00:00, 32.35it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 77.50it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:30<00:00, 34.10it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 69.46it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:30<00:00, 33.96it/s]
val epoch: 100%|██████████| 129/129.0 [00:01<00:00, 85.96it/s]
train epoch: 100%|██████████| 1029/1029.0 [00:29<00:00, 34.98it/s]
v

0,1
train/Accuracy,▁▇█▇████
train/F1,▁▇█▇████
train/Precision,▁▇█▇█▇▇█
train/Recall,▁▇▇▇███▇
train/loss,▇▅▄▁▁█▅█
validation/Accuracy,▂█▄▁▂▅█▅
validation/F1,▁▇▅▆▃▅█▄
validation/Precision,▁▄▄█▃▄▅▃
validation/Recall,▁▇▄▅▃▅█▄
validation/loss,▄▂▄▆▃▂▁█

0,1
train/Accuracy,0.7923
train/F1,0.78059
train/Precision,0.74252
train/Recall,0.82278
train/loss,0.60205
validation/Accuracy,0.8094
validation/F1,0.10706
validation/Precision,0.58025
validation/Recall,0.05897
validation/loss,0.62091
