This notebook trains and evaluates a prototype D-MPNN on QM9 (regression)

The code is based on run_training.py

D-MPNN settings:
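- Message passing: bond-centred, directed messages (`atom_messages = False`, `undirected = False`), hidden size 300, depth 3, no bias
- Feed-forward network: 2 layers, hidden size 300
- Shared: ReLU activation, dropout 0.0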

Questions:

- Why do only two weight matrices have a bias switch?
- Why are lists used rather than numpy arrays for data objects?
- Loss function: are we dividing by the number of molecules twice?

To investigate:

- Why does the property use_input_features not work? Need to look at how args/parsing works.
- Is anything moved onto CUDA outside of message passing, or only there?

In [14]:
import csv
import os
import sys
import numpy as np
import torch
import pickle
import copy

from logging import Logger
from typing import List
from tqdm import trange

from torch.optim.lr_scheduler import ExponentialLR
from torch_geometric.datasets import QM9

In [15]:
# cd to chempropBayes
# NOTE(review): this is a machine-specific absolute path; point it at your own checkout.
# Presumably required so that the `chemprop` imports in the next cell resolve
# from the repository — confirm.
%cd /Users/georgelamb/Documents/GitHub/chempropBayes

/Users/georgelamb/Documents/GitHub/chempropBayes


In [16]:
# import from chempropBayes
from chemprop.train.evaluate import evaluate, evaluate_predictions
from chemprop.train.predict import predict
from chemprop.train.train import train
from chemprop.args import TrainArgs
from chemprop.data import StandardScaler, MoleculeDataLoader
from chemprop.data.utils import get_class_sizes, get_data, get_task_names, split_data
from chemprop.models import MoleculeModel
from chemprop.nn_utils import param_count
from chemprop.utils import build_optimizer, build_lr_scheduler, get_loss_func, get_metric_func, load_checkpoint,\
    makedirs, save_checkpoint, save_smiles_splits

In [18]:
# Create the chemprop argument container and override the two fields
# needed up front: the task type and the location of the QM9 csv.
base_settings = dict(
    dataset_type='regression',
    data_path='/Users/georgelamb/Documents/GitHub/chempropBayes/data/QM9.csv',
)
args = TrainArgs()
args.from_dict(base_settings)

# no logger / writer objects are used; None is passed wherever they are requested
logger, writer = None, None

In [19]:
# Fix the PyTorch RNG seed (used for the random initial weights).
# The trailing semicolon suppresses the cell's output.
PYTORCH_SEED = 0
args.pytorch_seed = PYTORCH_SEED
torch.manual_seed(args.pytorch_seed);

In [20]:
# Load the QM9 csv into a MoleculeDataset
# (a list of molecules with their associated features and targets).

# csv columns used as regression targets
qm9_target_columns = ['mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'cv', 'u0', 'u298', 'h298', 'g298']
args.target_columns = qm9_target_columns
# task names are the target columns themselves (all twelve, not only 'mu')
args.task_names = args.target_columns

# additional features: a generator and a path cannot both be specified
args.features_path = None
args.features_generator = None

# number of data points to load
args.max_data_size = 20000

data = get_data(args=args, path=args.data_path)

# record dataset-derived sizes on args
args.num_tasks = data.num_tasks()  # number of prediction tasks (number of targets)
args.features_size = data.features_size()  # size of the features array per molecule (>0 or None)


16209it [00:00, 52220.24it/s]
100%|██████████| 20000/20000 [00:00<00:00, 620463.76it/s]
100%|██████████| 20000/20000 [00:01<00:00, 16269.21it/s]


In [21]:
# Partition the dataset into train / validation / test MoleculeDatasets.

# available split types:
# 'random', 'scaffold_balanced', 'predetermined', 'crossval', 'index_predetermined'
args.split_type = 'random'
args.split_sizes = (0.8, 0.1, 0.1)  # train/val/test proportions
args.seed = 0  # random seed for splitting

train_data, val_data, test_data = split_data(
    data=data,
    split_type=args.split_type,
    sizes=args.split_sizes,
    seed=args.seed,
    args=args,
    logger=logger,
)

# record the number of training molecules on args
args.train_data_size = len(train_data)


In [22]:
# Standardise the training targets (zero mean, unit variance).
#
# `scaler` is always bound after this cell: it is the fitted StandardScaler for
# regression and None otherwise, because later cells pass `scaler` to
# evaluate/predict unconditionally.
#
# NOTE: this cell overwrites the targets stored in `train_data`. Re-running it
# without first re-running the split cell fits the scaler on targets that have
# already been standardised.

args.dataset_type = 'regression'
if args.dataset_type == 'regression':
    train_smiles, train_targets = train_data.smiles(), train_data.targets() # return training targets
    scaler = StandardScaler().fit(train_targets) # fit transform to training targets
    scaled_targets = scaler.transform(train_targets).tolist() # apply transform
    train_data.set_targets(scaled_targets) # replace targets in MoleculeDataset with standardised targets
else:
    # targets are left untouched, so there is nothing to invert at prediction time
    scaler = None


In [23]:
# Choose the training loss and the evaluation metric.

# loss function is picked from args (based on args.dataset_type; MSE for regression)
loss_func = get_loss_func(args)

# metric: stored on args, then resolved to a callable
args.metric = 'mae'
metric_func = get_metric_func(metric=args.metric)


In [24]:
# Keep the test-set SMILES and targets for scoring the final predictions.

test_smiles = test_data.smiles()
test_targets = test_data.targets()


In [25]:
# Create the train / validation / test data loaders.

# loader-related arguments
args.cache_cutoff = 10000 # max number of molecules for caching
args.batch_size = 50 # default batch size is 50
args.num_workers = 8 # number of workers for parallel data loading
args.class_balance = False # equal number of pos and neg in each batch (for classification)
args.seed = 0 # seed for data loader shuffle

# Cache graph featurisations (i.e. save mol_graphs in SMILES_TO_GRAPH) only when
# the whole dataset fits under the cutoff; otherwise load in parallel instead.
cache = len(data) <= args.cache_cutoff
num_workers = 0 if cache else args.num_workers

# MoleculeDataLoader: an iterable over a MoleculeDataset which calls MoleculeSampler;
# it returns a MoleculeDataset with BatchMolGraph computed using the batch_graph method.
# Only the training loader shuffles and takes the class-balance and seed settings.
train_data_loader = MoleculeDataLoader(
    dataset=train_data,
    batch_size=args.batch_size,
    num_workers=num_workers,
    cache=cache,
    class_balance=args.class_balance,
    shuffle=True,
    seed=args.seed,
)

# validation and test loaders share the same plain configuration
val_data_loader, test_data_loader = (
    MoleculeDataLoader(
        dataset=held_out,
        batch_size=args.batch_size,
        num_workers=num_workers,
        cache=cache,
    )
    for held_out in (val_data, test_data)
)


In [26]:
# Debug probe: prints 'test' only when args.use_input_features is truthy.
uses_input_features = bool(args.use_input_features)
if uses_input_features:
    print('test')

In [27]:
# Build the model: a message passing network followed by feed-forward layers.

# Settings are collected in one mapping and applied to args in the listed order.
model_settings = {
    # --- additional features ---
    'features_only': False,  # if True only the 'additional' features feed the FFN

    # --- message passing ---
    'atom_messages': False,  # False means bond-centred i.e. D-MPNN
    'undirected': False,  # if messages are bond-centred we can choose directed or undirected
    'hidden_size': 300,  # hidden layer size
    'depth': 3,  # number of message passing steps
    'bias': False,  # only affects W_i and W_h
    'device': torch.device('cpu'),

    # --- FFN ---
    'ffn_hidden_size': 300,
    'ffn_num_layers': 2,

    # --- both message passing and FFN ---
    'dropout': 0.0,  # dropout probability
    'activation': 'ReLU',
}
# left disabled, as in the original cell:
#args.use_input_features = (args.features_generator is not None or args.features_path is not None)

for setting_name, setting_value in model_settings.items():
    setattr(args, setting_name, setting_value)

model = MoleculeModel(args)


In [None]:
# Build the optimiser and the learning-rate scheduler.

# --- optimiser ---
args.init_lr = 1e-4
optimizer = build_optimizer(model, args)

# --- lr scheduler (Noam) ---
# warmup: linear increase from init_lr to max_lr over warmup_steps
#         (warmup_steps = warmup_epochs * steps_per_epoch,
#          steps_per_epoch = train_data_size // batch_size)
# then:   exponential decay from max_lr to final_lr over the remaining steps
# total epochs = args.epochs * args.num_lrs
args.max_lr, args.final_lr = 1e-3, 1e-4
args.warmup_epochs = 2.0
args.epochs = 30
args.num_lrs = 1 # number of learning rates

scheduler = build_lr_scheduler(optimizer, args)


In [None]:
# Run training, keeping an in-memory copy of the model from the epoch with the
# best average validation score.

# NOTE: save_dir is assigned but not used in this cell; nothing is written to disk.
save_dir = '/Users/georgelamb/Documents/GitHub/chempropBayes/saved_models'
args.log_frequency = 320 # number of batches between each logging of the training loss
args.show_individual_scores = False
args.minimize_score = True

best_score = float('inf') if args.minimize_score else -float('inf')
best_epoch, n_iter = 0, 0
# Bound up front so it exists even if no epoch improves on the initial score
# (comparisons against a NaN average are always False).
best_model = None
for epoch in range(args.epochs):

    # train for one epoch
    n_iter = train(
        model=model,
        data_loader=train_data_loader,
        loss_func=loss_func,
        optimizer=optimizer,
        scheduler=scheduler,
        args=args,
        n_iter=n_iter,
        logger=logger,
        writer=writer
    )

    # evaluation on val set for one epoch
    val_scores = evaluate(
        model=model,
        data_loader=val_data_loader,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        scaler=scaler,
        logger=logger
    )

    # average validation score
    avg_val_score = np.nanmean(val_scores)
    print(f'Validation AVG {args.metric} = {avg_val_score:.6f}')

    # show individual validation scores
    if args.show_individual_scores:
        for task_name, val_score in zip(args.task_names, val_scores):
            print(f'Validation {task_name} {args.metric} = {val_score:.6f}')

    # keep a copy of the model if the validation score improved
    # (lower is better when minimize_score is set, higher otherwise)
    if args.minimize_score:
        improved = avg_val_score < best_score
    else:
        improved = avg_val_score > best_score
    if improved:
        best_score, best_epoch = avg_val_score, epoch
        best_model = copy.deepcopy(model)

# Fail here with a clear message rather than with a NameError in a later cell.
if best_model is None:
    raise RuntimeError(
        'No epoch produced a validation score that improved on the initial '
        'best score, so there is no best model to evaluate.'
    )


In [None]:
# Score the best model (by validation score) on the held-out test set.

print(f'Best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')

# predictions from the best model; the scaler fitted on the training targets is passed through
test_preds = predict(model=best_model, data_loader=test_data_loader, scaler=scaler)

# per-task scores against the test targets
test_scores = evaluate_predictions(
    preds=test_preds,
    targets=test_targets,
    num_tasks=args.num_tasks,
    metric_func=metric_func,
    dataset_type=args.dataset_type,
    logger=logger,
)

# mean over tasks, ignoring NaN entries
avg_test_score = np.nanmean(test_scores)
print(f'Test {args.metric} = {avg_test_score:.6f}')

# optional per-task breakdown
if args.show_individual_scores:
    for task_name, test_score in zip(args.task_names, test_scores):
        print(f'Test {task_name} {args.metric} = {test_score:.6f}')
