### Set Path (Won't be needed once `setup.py` is finished)

In [1]:
import sys
# Append sys.path[0] with its last 8 characters removed to the import search path.
# NOTE(review): presumably this strips an 8-character folder name (e.g. the
# notebook's own directory) so the repository root becomes importable — confirm,
# since it silently breaks if the notebook is moved or the folder is renamed.
sys.path.append(sys.path[0][:-8])

In [2]:
import torch
import os
from tqdm import tqdm

from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')                                                                                                                                                       

from rdkit.Chem import Draw
from matplotlib import pyplot as plt

from sklearn.metrics import roc_auc_score as ras
from sklearn.metrics import mean_squared_error as mse

### Auglichem imports

In [3]:
from auglichem.molecule import Compose, RandomAtomMask, RandomBondDelete
from auglichem.molecule.data import MoleculeDatasetWrapper
from auglichem.molecule.models import GCN, AttentiveFP, GINE, DeepGCN

### Set up dataset

In [4]:
# Print the class-level documentation of the dataset wrapper.
help(MoleculeDatasetWrapper)

Help on class MoleculeDatasetWrapper in module auglichem.molecule.data._molecule_dataset:

class MoleculeDatasetWrapper(MoleculeDataset)
 |  MoleculeDatasetWrapper(*args, **kwds)
 |  
 |  Dataset base class for creating graph datasets.
 |  See `here <https://pytorch-geometric.readthedocs.io/en/latest/notes/
 |  create_dataset.html>`__ for the accompanying tutorial.
 |  
 |  Args:
 |      root (string, optional): Root directory where the dataset should be
 |          saved. (optional: :obj:`None`)
 |      transform (callable, optional): A function/transform that takes in an
 |          :obj:`torch_geometric.data.Data` object and returns a transformed
 |          version. The data object will be transformed before every access.
 |          (default: :obj:`None`)
 |      pre_transform (callable, optional): A function/transform that takes in
 |          an :obj:`torch_geometric.data.Data` object and returns a
 |          transformed version. The data object will be transformed before
 |   

In [5]:
# Print the constructor documentation: dataset name, transform, split strategy,
# batch size, split sizes and the other keyword arguments used further below.
help(MoleculeDatasetWrapper.__init__)

Help on function __init__ in module auglichem.molecule.data._molecule_dataset:

__init__(self, dataset, transform=None, split='scaffold', batch_size=64, num_workers=0, valid_size=0.1, test_size=0.1, aug_time=0, data_path=None, target=None, seed=None)
    Input:
    ---
    dataset (str): One of the datasets available from MoleculeNet
                   (http://moleculenet.ai/datasets-1)
    transform (Compose, OneOf, RandomAtomMask, RandomBondDelete object): transormations
                   to apply to the data at call time.
    split (str, optional default=scaffold): random or scaffold. The splitting strategy
                                            used for train/test/validation set creation.
    batch_size (int, optional default=64): Batch size used in training
    num_workers (int, optional default=0): Number of workers used in loading data
    valid_size (float in [0,1], optional default=0.1): 
    test_size (float in [0,1],  optional default=0.1): 
    aug_time (int, optional

In [6]:
# Print the documentation of one of the augmentation transforms.
help(RandomAtomMask)

Help on class RandomAtomMask in module auglichem.molecule._transforms:

class RandomAtomMask(BaseTransform)
 |  RandomAtomMask(p: float = 1.0)
 |  
 |  Method resolution order:
 |      RandomAtomMask
 |      BaseTransform
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, p: float = 1.0)
 |      @param p: the probability of the transform being applied; default value is 1.0
 |  
 |  apply_transform(self, mol_graph: torch_geometric.data.data.Data, seed: NoneType) -> torch_geometric.data.data.Data
 |      Transform that randomly mask atoms given a certain ratio
 |      @param mol_graph: PyG Data to be augmented
 |      @param seed: 
 |      @returns: Augmented PyG Data
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from BaseTransform:
 |  
 |  __call__(self, mol_graph: torch_geometric.data.data.Data, seed=None) -> torch_geometric.data.data.Data
 |      @param mol_graph: PyG Data to be augmented
 |      @par

In [7]:
# Augmentation pipeline: atom masking and bond deletion, combined with Compose.
# NOTE(review): both transforms receive a two-element list here, while
# help(RandomAtomMask) above documents `p` as a single float — confirm that the
# installed auglichem version accepts a list for this argument.
transform = Compose(
    [RandomAtomMask([0.1, 0.3]), RandomBondDelete([0.1, 0.3])]
)

# FreeSolv dataset wrapper; the data file lives under ./data_download and
# batches hold up to 128 molecules. The split strategy is left at its default.
dataset = MoleculeDatasetWrapper(
    "FreeSolv",
    data_path="./data_download",
    transform=transform,
    batch_size=128,
)

# One loader per split, in train / validation / test order.
train_loader, valid_loader, test_loader = dataset.get_data_loaders()

Using: ./data_download/FreeSolv/SAMPL.csv
DATASET: FreeSolv


642it [00:00, 17711.10it/s]

Generating scaffolds...
Generating scaffold 0/641
About to sort in scaffold sets



  train_loader, valid_loader, test_loader = dataset.get_data_loaders()


### Initialize model with task from data

In [8]:
# Get model
# The task ('classification' or 'regression') is read from the dataset object
# rather than hard-coded, so the model follows whatever dataset was loaded above.
model = GCN(task=dataset.task)

# Uncomment the following line to use GPU. When doing so, also switch to the
# `model(data.cuda())` calls that are commented out in the training loop and
# in evaluate(), so that the batches are moved to the GPU as well.
#model.cuda()

### Initialize training loop

In [9]:
# Choose the loss that matches the dataset's task.
if dataset.task == 'classification':
    criterion = torch.nn.CrossEntropyLoss()
elif dataset.task == 'regression':
    criterion = torch.nn.MSELoss()
else:
    # Fail here instead of leaving `criterion` undefined (or silently reusing a
    # stale one from an earlier run of this cell with a different dataset).
    raise ValueError("Unsupported task: {!r}".format(dataset.task))

# Adam over all model parameters, with a small weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

### Train the model

In [10]:
# evaluate() switches the model to eval mode and never switches it back, so
# put the model in training mode explicitly. Without this, re-running this cell
# after an evaluation cell would train with eval-mode layer behaviour.
model.train()

# The task does not change between batches, so look it up once and validate it
# up front rather than hitting an undefined (or stale) `loss` inside the loop.
task = train_loader.dataset.task
if task not in ("classification", "regression"):
    raise ValueError("Unsupported task: {!r}".format(task))

# Two passes over the training split.
for epoch in range(2):
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and display progress against the total number of batches.
    for data in tqdm(train_loader):

        optimizer.zero_grad()

        # The model returns a pair; only the second element (the prediction)
        # is used for the loss.
        _, pred = model(data)

        # data -> GPU
        #_, pred = model(data.cuda())

        if task == "classification":
            loss = criterion(pred, data.y.flatten())
        else:
            # Regression: compare the first prediction column with the
            # flattened labels.
            loss = criterion(pred[:,0], data.y.flatten())

        loss.backward()
        optimizer.step()

4it [00:00,  4.13it/s]
4it [00:00,  4.23it/s]


### Test the model

In [11]:
def evaluate(model, test_loader, validation=False):
    """Score ``model`` on every batch of ``test_loader`` and print the result.

    Predictions and labels are collected over all batches and the metric is
    computed once on the full set. The metric is ROC-AUC for classification
    (using column 1 of the prediction as the score) and RMSE for regression.

    Args:
        model: Callable returning a pair whose second element is the
            prediction tensor for a batch. It is switched to eval mode and
            left in eval mode when the function returns.
        test_loader: Iterable of batches exposing ``.y`` labels; its
            ``dataset.task`` attribute selects the metric.
        validation (bool): Only changes the label in the printed line
            ("VALIDATION" instead of "TEST").

    Raises:
        ValueError: If the loader yields no batches.

    Returns:
        None. The metric is printed. Nothing is printed for a task other than
        'classification' or 'regression'.
    """
    task = test_loader.dataset.task
    set_str = "VALIDATION" if validation else "TEST"

    batch_preds = []
    batch_labels = []

    model.eval()
    with torch.no_grad():
        for data in test_loader:
            _, pred = model(data)

            # data -> GPU
            #_, pred = model(data.cuda())

            # Hold on to all predictions and labels (moved to CPU for sklearn)
            if task == 'classification':
                batch_preds.append(pred[:, 1].detach().cpu())
            elif task == 'regression':
                batch_preds.append(pred.detach().cpu())

            batch_labels.append(data.y.detach().cpu())

    if not batch_labels:
        raise ValueError(
            "{0} loader produced no batches; nothing to evaluate".format(set_str)
        )

    # Score the whole split, not just the final batch.
    labels = torch.cat(batch_labels).numpy()

    if task == 'classification':
        metric = ras(labels, torch.cat(batch_preds).numpy())
        print("{0} ROC: {1:.3f}".format(set_str, metric))
    elif task == 'regression':
        # Take the square root ourselves: the `squared=False` keyword of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        # NOTE(review): identical to squared=False for a single regression
        # target; confirm if multi-target labels are ever used.
        metric = mse(labels, torch.cat(batch_preds).numpy()) ** 0.5
        print("{0} RMSE: {1:.3f}".format(set_str, metric))


In [12]:
# Score the trained model on the validation split, then on the test split.
evaluate(model, valid_loader, validation=True)
evaluate(model, test_loader, validation=False)

VALIDATION RMSE: 13.071
TEST RMSE: 11.050


### Model saving/loading example

In [13]:
# Save model
# Only the state_dict is written, not the model object itself, so a GCN has to
# be constructed first before these weights can be loaded back.
os.makedirs("./saved_models/", exist_ok=True)
torch.save(model.state_dict(), "./saved_models/example_gcn")

In [14]:
# Instantiate new model and evaluate
# This replaces the trained `model` with a newly constructed one that has not
# been trained, so the score printed here serves as a baseline to compare with
# the score obtained after loading the saved weights in the next cell.
model = GCN(task=dataset.task)

# For GPU, uncomment the following line
#model.cuda()

evaluate(model, test_loader)

TEST RMSE: 6.691


In [15]:
# Load saved model and evaluate
# map_location="cpu" lets a checkpoint that was written from a GPU model load on
# a CPU-only machine; load_state_dict then copies the values into the model's
# own parameters, wherever those live.
model.load_state_dict(torch.load("./saved_models/example_gcn", map_location="cpu"))

# With the trained weights restored, this is expected to print the same test
# score as before the model was saved.
evaluate(model, test_loader)

TEST RMSE: 11.050
