In [2]:
import csv
import os
import sys
import numpy as np
import torch
import pickle
import copy
import gpytorch

from logging import Logger
from typing import List
from tqdm import trange

from torch.optim.lr_scheduler import ExponentialLR
from torch_geometric.datasets import QM9

# cd to chempropBayes
%cd /Users/georgelamb/Documents/GitHub/chempropBayes

# import from chempropBayes
from chemprop.train.evaluate import evaluate, evaluate_predictions
from chemprop.train.predict import predict
from chemprop.train.train import train
from chemprop.args import TrainArgs
from chemprop.data import StandardScaler, MoleculeDataLoader
from chemprop.data.utils import get_class_sizes, get_data, get_task_names, split_data
from chemprop.models import MoleculeModel
from chemprop.nn_utils import param_count
from chemprop.utils import build_optimizer, build_lr_scheduler, get_loss_func, get_metric_func, load_checkpoint,\
    makedirs, save_checkpoint, save_smiles_splits

/Users/georgelamb/Documents/GitHub/chempropBayes


### Args

In [3]:
### args

# instantiate args class and load from dict
args = TrainArgs()
args.from_dict({
    'dataset_type': 'regression',
    'data_path': '/Users/georgelamb/Documents/GitHub/chempropBayes/data/QM9.csv'
})

# location for model checkpoints to be saved
args.save_dir = '/Users/georgelamb/Documents/GitHub/chempropBayes/log'

### args (non-model)

# seed for splitting and loading data
args.seed = 0

# data
args.max_data_size = 10000
args.features_path = None
args.features_generator = None

# splitting data
args.split_type = 'random'
args.split_sizes = (0.8, 0.1, 0.1)

# evaluation metric
args.metric = 'mae'

### args (model)

# seed for random initial weights
args.pytorch_seed = 0

# message passing
args.atom_messages = False
args.undirected = False
args.bias = False
args.hidden_size = 100
args.depth = 2

# FFN
args.ffn_hidden_size = args.hidden_size
args.ffn_num_layers = 2

# shared
args.activation = 'ReLU'

# batch size
args.batch_size = 50


In [4]:
args.gp = True

### Data

In [5]:
logger = None
torch.manual_seed(args.pytorch_seed)
args.task_names = args.target_columns or get_task_names(args.data_path)
data = get_data(path=args.data_path, args=args, logger=logger)
args.num_tasks = data.num_tasks()
args.features_size = data.features_size()

# split data
train_data, val_data, test_data = split_data(
    data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

if args.features_scaling:
    features_scaler = train_data.normalize_features(replace_nan_token=0)
    val_data.normalize_features(features_scaler)
    test_data.normalize_features(features_scaler)
else:
    features_scaler = None

args.train_data_size = len(train_data)

if args.dataset_type == 'regression':
    train_smiles, train_targets = train_data.smiles(), train_data.targets()
    scaler = StandardScaler().fit(train_targets)
    scaled_targets = scaler.transform(train_targets).tolist()
    train_data.set_targets(scaled_targets)
else:
    scaler = None

#loss_func = get_loss_func(args)
metric_func = get_metric_func(metric=args.metric)

# Automatically determine whether to cache
if len(data) <= args.cache_cutoff:
    cache = True
    num_workers = 0
else:
    cache = False
    num_workers = args.num_workers

# Create data loaders
train_data_loader = MoleculeDataLoader(
    dataset=train_data,
    batch_size=args.batch_size,
    num_workers=num_workers,
    cache=cache,
    class_balance=args.class_balance,
    shuffle=True,
    seed=args.seed
)
val_data_loader = MoleculeDataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        num_workers=num_workers,
        cache=cache
)

9114it [00:00, 43769.76it/s]
100%|██████████| 10000/10000 [00:00<00:00, 678327.75it/s]
100%|██████████| 10000/10000 [00:00<00:00, 16695.26it/s]


### GP

In [6]:
class MultitaskGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points, num_dim):
        
        # We have to mark the CholeskyVariationalDistribution as batch
        # so that we learn a variational distribution for each task
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            num_inducing_points = inducing_points.size(-2),
            batch_shape = torch.Size([num_dim])
        )

        
        # We have to wrap the VariationalStrategy in a MultitaskVariationalStrategy
        # so that the output will be a MultitaskMultivariateNormal rather than a batch output
        variational_strategy = gpytorch.variational.MultitaskVariationalStrategy(
            gpytorch.variational.VariationalStrategy(
                self, 
                inducing_points, 
                variational_distribution, 
                learn_inducing_locations=True
            ), num_tasks=num_dim
        )

        super().__init__(variational_strategy)

        # The mean and covariance modules should be marked as batch
        # so we learn a different set of hyperparameters
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_dim]))
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_dim])),
            batch_shape=torch.Size([num_dim])
        )

    def forward(self, x):
        # The forward function should be written as if we were dealing with each output
        # dimension in batch
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


In [7]:
class DKLModel(gpytorch.Module):
    
    def __init__(self, feature_extractor, gp_layer):
        
        super(DKLModel, self).__init__()
        
        self.feature_extractor = feature_extractor
        self.gp_layer = gp_layer

    def forward(self, *input):
        
        features = self.feature_extractor(*input)
        res = self.gp_layer(features)
        
        return res


### Instantiate model

In [8]:
x = torch.tensor([[0,1],[5,6]])
x = x.repeat(3,1,1)
x

tensor([[[0, 1],
         [5, 6]],

        [[0, 1],
         [5, 6]],

        [[0, 1],
         [5, 6]]])

In [11]:
inducing_points = []
for batch in train_data_loader:
    mol_batch = batch.batch_graph()
    inducing_points.extend(feature_extractor(mol_batch))
inducing_points = torch.stack(inducing_points)[:2000]
inducing_points = inducing_points.repeat(10,1,1)

# need to turn this into output x m x input
inducing_points.shape

torch.Size([10, 2000, 100])

In [10]:
feature_extractor = MoleculeModel(args, featurizer=True)

In [12]:
# The shape of the inducing points should be (2 x m x 2) - so that we learn different inducing points per output
torch.manual_seed(0)
inducing_points = torch.rand(12, 100, args.hidden_size)
gp_layer = MultitaskGPModel(inducing_points, 12)

In [13]:
model = DKLModel(feature_extractor, gp_layer)

In [14]:
# We're going to use a multitask likeihood with this model
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=12)

### Train GP

In [28]:
model.train()
likelihood.train()

optimizer = torch.optim.Adam([
    {'params': model.feature_extractor.parameters()},
    {'params': model.gp_layer.hyperparameters()},
    {'params': model.gp_layer.variational_parameters()},
    {'params': likelihood.parameters()},
], lr=0.001)

mll = gpytorch.mlls.VariationalELBO(likelihood, model.gp_layer, num_data=args.train_data_size)

for i in range(10):
    model.train()
    likelihood.train()
    
    loss_epoch = 0
    for batch in train_data_loader:
        
        mol_batch, target_batch = batch.batch_graph(), batch.targets()
        
        optimizer.zero_grad()
        output = model(mol_batch)
        loss = -mll(output, torch.tensor(target_batch))
        loss_epoch += loss.item()
        loss.backward()
        optimizer.step()
    print(loss_epoch)
    
    val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                args=args,
                num_tasks=args.num_tasks,
                metric_func=metric_func,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger
            )
    avg_val_score = np.nanmean(val_scores)
    print(f'Validation {args.metric} = {avg_val_score:.6f}')

3045.948022842407
Validation mae = 30.413390
3012.2813262939453
Validation mae = 30.415102
2981.9948749542236
Validation mae = 30.413786
2954.9243297576904
Validation mae = 30.412523
2930.8706817626953
Validation mae = 30.411033
2909.574275970459
Validation mae = 30.412925
2890.752908706665
Validation mae = 30.413056
2874.130563735962
Validation mae = 30.409716
2859.4554080963135
Validation mae = 30.410089
2846.4586391448975
Validation mae = 30.408022


In [31]:
likelihood(model(mol_batch)).variance[0]

tensor([1.2905, 1.2743, 1.2754, 1.2665, 1.2683, 1.2724, 1.2689, 1.2703, 1.2766,
        1.2766, 1.2766, 1.2766], grad_fn=<SelectBackward>)

In [32]:
model(mol_batch).variance[0]

tensor([0.1287, 0.1282, 0.1281, 0.1279, 0.1280, 0.1280, 0.1280, 0.1281, 0.1282,
        0.1282, 0.1282, 0.1282], grad_fn=<SelectBackward>)

In [81]:
likelihood.noise

tensor([0.6932], grad_fn=<AddBackward0>)

In [92]:
gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=12, rank=0, batch_shape=torch.Size([12])).noise

tensor([[0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932],
        [0.6932]], grad_fn=<AddBackward0>)

In [91]:
likelihood.log_noise

AttributeError: 'MultitaskGaussianLikelihood' object has no attribute 'log_noise'