### Try with AugLiChem

In [1]:
import sys
from tqdm import tqdm
sys.path.append(sys.path[0][:-8])

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(8)

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

from rdkit import Chem

In [2]:
from auglichem.molecule.data import MoleculeDatasetWrapper
from auglichem.molecule import RandomAtomMask, RandomBondDelete, Compose
from auglichem.molecule.models import AttentiveFP

from auglichem.molecule.models.AttentiveLayers import Fingerprint
from auglichem.molecule.models.getFeatures import save_smiles_dicts, get_smiles_array

from auglichem.molecule.models import AttentiveFP as AFP

In [3]:
task_name = 'ClinTox'
tasks = ['CT_TOX']

random_seed = 88

batch_size = 100
epochs = 800
p_dropout = 0.5
fingerprint_dim = 200

radius = 3
T = 3
weight_decay = 3 # also known as l2_regularization_lambda
learning_rate = 3.5
per_task_output_units_num = 2 # for classification model with 2 classes
output_units_num = len(tasks) * per_task_output_units_num

In [4]:
transform = Compose([
    RandomAtomMask(1.),
    RandomBondDelete([0., 0.2])
])
dataset = MoleculeDatasetWrapper('BACE', transform=transform, aug_time=1, batch_size=batch_size)
print(dataset.batch_size)

Using: ./data_download/bace.csv
DATASET: BACE


1513it [00:00, 5144.63it/s]

100





### Torch Geometric AFP Implementation

In [5]:
train_loader, val_loader, test_loader = dataset.get_data_loaders()

1512
About to generate scaffolds
Generating scaffold 0/1512
Generating scaffold 1000/1512
About to sort in scaffold sets


In [6]:
model = AFP(num_layers=radius, num_timesteps=T, dropout=p_dropout, hidden_channels=fingerprint_dim, out_channels=output_units_num, edge_dim=2, in_channels=2)
#their_model = model = Fingerprint(radius, T, 2, 2,
            #fingerprint_dim, output_units_num, p_dropout)

In [7]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)
for idx, (name, param) in enumerate(model.named_parameters()):
    if param.requires_grad:
        print(idx, name, param.data.shape)
#print(model)

1168602
0 lin1.weight torch.Size([200, 2])
1 lin1.bias torch.Size([200])
2 atom_convs.0.att_l torch.Size([1, 200])
3 atom_convs.0.att_r torch.Size([1, 200])
4 atom_convs.0.bias torch.Size([200])
5 atom_convs.0.lin1.weight torch.Size([200, 202])
6 atom_convs.0.lin2.weight torch.Size([200, 200])
7 atom_convs.1.att_l torch.Size([1, 1, 200])
8 atom_convs.1.att_r torch.Size([1, 1, 200])
9 atom_convs.1.bias torch.Size([200])
10 atom_convs.1.lin_l.weight torch.Size([200, 200])
11 atom_convs.2.att_l torch.Size([1, 1, 200])
12 atom_convs.2.att_r torch.Size([1, 1, 200])
13 atom_convs.2.bias torch.Size([200])
14 atom_convs.2.lin_l.weight torch.Size([200, 200])
15 atom_grus.0.weight_ih torch.Size([600, 200])
16 atom_grus.0.weight_hh torch.Size([600, 200])
17 atom_grus.0.bias_ih torch.Size([600])
18 atom_grus.0.bias_hh torch.Size([600])
19 atom_grus.1.weight_ih torch.Size([600, 200])
20 atom_grus.1.weight_hh torch.Size([600, 200])
21 atom_grus.1.bias_ih torch.Size([600])
22 atom_grus.1.bias_hh torc

In [8]:
def evaluate(model, test_loader):
    with torch.no_grad():
        model.eval()
        data = next(iter(test_loader))
        
        # Get data
        x = data.x
        edge_index = data.edge_index
        edge_attr = data.edge_attr
        
        # Predict
        pred = model(x.float(), edge_index, edge_attr, data.batch)
        
        loss = criterion(pred, data.y.flatten())

    score = roc_auc_score(data.y, pred.detach()[:,1])
    print("TEST LOSS: {0:.3f}, ROC-AUC: {1:.3f}".format(loss.detach(), score))
    
def validate(model, val_loader):
    with torch.no_grad():
        model.eval()
        data = next(iter(val_loader))
        
        # Get data
        x = data.x
        edge_index = data.edge_index
        edge_attr = data.edge_attr
        
        # Predict
        pred = model(x.float(), edge_index, edge_attr, data.batch)
        
        loss = criterion(pred, data.y.flatten())

    score = roc_auc_score(data.y, pred.detach()[:,1])
    print("VALIDATION LOSS: {0:.3f}, ROC-AUC: {1:.3f}".format(loss.detach(), score))

In [9]:
optimizer = optim.Adam(model.parameters(), 10**-learning_rate, weight_decay=10**-weight_decay)
criterion = nn.CrossEntropyLoss()

In [11]:
for epoch in range(epochs):
    for bn, data in tqdm(enumerate(train_loader)):
        optimizer.zero_grad()

        # Get data
        x = data.x
        edge_index = data.edge_index
        edge_attr = data.edge_attr
        
        # Predict
        pred = model(x.float(), edge_index, edge_attr, data.batch)
        
        # Update
        loss = criterion(pred, data.y.flatten())
        loss.backward()
        optimizer.step()
    print("EPOCH:\t{}".format(epoch))
    validate(model, val_loader)
    evaluate(model, test_loader)

24it [00:17,  1.38it/s]


EPOCH:	0
VALIDATION LOSS: 0.687, ROC-AUC: 0.605
TEST LOSS: 0.628, ROC-AUC: 0.756


24it [00:17,  1.41it/s]


EPOCH:	1
VALIDATION LOSS: 0.765, ROC-AUC: 0.608
TEST LOSS: 0.763, ROC-AUC: 0.747


24it [00:15,  1.52it/s]


EPOCH:	2
VALIDATION LOSS: 0.729, ROC-AUC: 0.606
TEST LOSS: 0.710, ROC-AUC: 0.756


24it [00:15,  1.52it/s]


EPOCH:	3
VALIDATION LOSS: 0.686, ROC-AUC: 0.602
TEST LOSS: 0.633, ROC-AUC: 0.761


24it [00:15,  1.55it/s]


EPOCH:	4
VALIDATION LOSS: 0.693, ROC-AUC: 0.602
TEST LOSS: 0.638, ROC-AUC: 0.762


24it [00:15,  1.54it/s]


EPOCH:	5
VALIDATION LOSS: 0.694, ROC-AUC: 0.605
TEST LOSS: 0.645, ROC-AUC: 0.761


24it [00:15,  1.57it/s]


EPOCH:	6
VALIDATION LOSS: 0.718, ROC-AUC: 0.607
TEST LOSS: 0.710, ROC-AUC: 0.754


24it [00:16,  1.49it/s]


EPOCH:	7
VALIDATION LOSS: 0.726, ROC-AUC: 0.605
TEST LOSS: 0.706, ROC-AUC: 0.760


22it [00:15,  1.39it/s]


KeyboardInterrupt: 