In [1]:
import csv
import os
import sys
import numpy as np
import torch
import pickle
import copy
import pandas as pd

from logging import Logger
from typing import List
from tqdm import trange

from torch.optim.lr_scheduler import ExponentialLR
from torch_geometric.datasets import QM9

In [2]:
# cd to chempropBayes
%cd /Users/georgelamb/Documents/GitHub/chempropBayes

/Users/georgelamb/Documents/GitHub/chempropBayes


In [3]:
# import from chempropBayes
from chemprop.train.evaluate import evaluate, evaluate_predictions
from chemprop.train.predict import predict
from chemprop.train.train import train
from chemprop.train.run_training import run_training
from chemprop.args import TrainArgs, HyperoptArgs
from chemprop.data import StandardScaler, MoleculeDataLoader
from chemprop.data.utils import get_class_sizes, get_data, get_task_names, split_data
from chemprop.models import MoleculeModel
from chemprop.nn_utils import param_count
from chemprop.utils import build_optimizer, build_lr_scheduler, get_loss_func, get_metric_func, load_checkpoint,\
    makedirs, save_checkpoint, save_smiles_splits

from hyperparameter_optimization import grid_search

# import SWAG functions / classes
from chemprop.bayes import SWAG

In [4]:
# instantiate args class and load from dict
# Paths are resolved against the current working directory, which the %cd cell
# near the top of the notebook sets to the chempropBayes checkout. That keeps the
# machine-specific absolute path in a single place.
REPO_ROOT = os.getcwd()

args = TrainArgs()
args.from_dict({
    'dataset_type': 'regression',
    'data_path': os.path.join(REPO_ROOT, 'data', 'QM9.csv')
})

# location for model checkpoints to be saved
args.save_dir = os.path.join(REPO_ROOT, 'log')

In [5]:
### args (non-model)
non_model_settings = {
    # seed for splitting and loading data
    'seed': 0,
    # data
    'max_data_size': 50000,
    'features_path': None,
    'features_generator': None,
    # splitting data
    'split_type': 'random',
    'split_sizes': (0.8, 0.1, 0.1),
    # evaluation metric
    'metric': 'mae',
    # epochs and logging
    'epochs': 50,
    'log_frequency': 800,
}

### args (model)
model_settings = {
    # seed for random initial weights
    'pytorch_seed': 0,
    # message passing
    'atom_messages': False,
    'undirected': False,
    'bias': False,
    'hidden_size': 500,
    'depth': 5,
}

# copy every setting onto the args object, one attribute per dictionary entry
for settings in (non_model_settings, model_settings):
    for field, value in settings.items():
        setattr(args, field, value)

# FFN (its width follows the message-passing hidden size set just above)
args.ffn_hidden_size = args.hidden_size
args.ffn_num_layers = 3

# shared
args.activation = 'ReLU'


Debug start: the scratch cells below poke at the SWAG wrapper and its parameter bookkeeping.

In [6]:
# NOTE(review): presumably the number of regression targets; the model repr further
# down shows the last FFN layer with out_features=12 -- confirm against the dataset
args.num_tasks = 12
# build the model from the args configured in the cells above
model = MoleculeModel(args)

In [7]:
# NOTE(review): train_swag, train_data_loader and loss_func are not imported or
# defined in any cell visible here; this call fails with the NameError recorded
# below until they are brought into scope.
model = train_swag(model, train_data_loader, loss_func, args)

NameError: name 'train_swag' is not defined

In [9]:
# wrap the base model in the SWAG class imported from chemprop.bayes
# no_cov_mat=True: "SWAG DIAG" according to the swag_parameters docstring in this notebook
# NOTE(review): max_num_models presumably caps the stored deviation rows and
# var_clamp presumably bounds the variance from below -- confirm against chemprop.bayes.SWAG
swag_model = SWAG(
    model,
    no_cov_mat=True,
    max_num_models=12,
    var_clamp=1e-30
)

In [10]:
swag_model.sample()

In [13]:
# NOTE(review): depends on train_swag, train_data_loader and loss_func, none of
# which is defined in the cells visible here. The call rebinds `model` to its own
# return value, so re-running the cell starts from the previous result -- confirm
# that this is intended.
model = train_swag(model, train_data_loader, loss_func, args)

MoleculeModel(
  (encoder): MPN(
    (encoder): MPNEncoder(
      (dropout_layer): Dropout(p=0.0, inplace=False)
      (act_func): ReLU()
      (W_i): Linear(in_features=147, out_features=500, bias=False)
      (W_h): Linear(in_features=500, out_features=500, bias=False)
      (W_o): Linear(in_features=633, out_features=500, bias=True)
    )
  )
  (ffn): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_features=500, out_features=500, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.0, inplace=False)
    (4): Linear(in_features=500, out_features=500, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.0, inplace=False)
    (7): Linear(in_features=500, out_features=12, bias=True)
  )
)

In [12]:
swag_model.eval()

SWAG(
  (base): MoleculeModel(
    (encoder): MPN(
      (encoder): MPNEncoder(
        (dropout_layer): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=500, bias=False)
        (W_h): Linear(in_features=500, out_features=500, bias=False)
        (W_o): Linear(in_features=633, out_features=500, bias=True)
      )
    )
    (ffn): Sequential(
      (0): Dropout(p=0.0, inplace=False)
      (1): Linear(in_features=500, out_features=500, bias=True)
      (2): ReLU()
      (3): Dropout(p=0.0, inplace=False)
      (4): Linear(in_features=500, out_features=500, bias=True)
      (5): ReLU()
      (6): Dropout(p=0.0, inplace=False)
      (7): Linear(in_features=500, out_features=12, bias=True)
    )
  )
)

In [10]:
# dump every parameter of the base model, in the order parameters() yields them
for base_param in base_model.parameters():
    print(base_param)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.

In [11]:
# list the (module, parameter name) entries tracked by the SWAG wrapper
for tracked_entry in swag_model.params:
    print(tracked_entry)

(MPNEncoder(
  (dropout_layer): Dropout(p=0.0, inplace=False)
  (act_func): ReLU()
  (W_i): Linear(in_features=147, out_features=500, bias=False)
  (W_h): Linear(in_features=500, out_features=500, bias=False)
  (W_o): Linear(in_features=633, out_features=500, bias=True)
), 'cached_zero_vector')
(Linear(in_features=147, out_features=500, bias=False), 'weight')
(Linear(in_features=500, out_features=500, bias=False), 'weight')
(Linear(in_features=633, out_features=500, bias=True), 'weight')
(Linear(in_features=633, out_features=500, bias=True), 'bias')
(Linear(in_features=500, out_features=500, bias=True), 'weight')
(Linear(in_features=500, out_features=500, bias=True), 'bias')
(Linear(in_features=500, out_features=500, bias=True), 'weight')
(Linear(in_features=500, out_features=500, bias=True), 'bias')
(Linear(in_features=500, out_features=12, bias=True), 'weight')
(Linear(in_features=500, out_features=12, bias=True), 'bias')


In [12]:
# walk the SWAG-tracked entries and the base model's parameters side by side
# (zip pairs them purely by position) and print each running mean next to the
# corresponding base weights
for (swag_module, param_name), base_param in zip(swag_model.params, base_model.parameters()):
    running_mean = swag_module.__getattr__(param_name + "_mean")
    running_sq_mean = swag_module.__getattr__(param_name + "_sq_mean")  # fetched but not printed

    print(swag_module, param_name, sep='\n')

    # first moment
    print(running_mean, base_param.data, '----', sep='\n')

MPNEncoder(
  (dropout_layer): Dropout(p=0.0, inplace=False)
  (act_func): ReLU()
  (W_i): Linear(in_features=147, out_features=500, bias=False)
  (W_h): Linear(in_features=500, out_features=500, bias=False)
  (W_o): Linear(in_features=633, out_features=500, bias=True)
)
cached_zero_vector
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [36]:
# Scratch copy of the collect_model loop, run at cell level: `self` and `base_model`
# must already be bound in the notebook namespace for it to execute.
# NOTE(review): the RuntimeError recorded below (147 vs 500) is consistent with zip()
# pairing entries by position while self.params and base_model.parameters() are
# ordered differently, e.g. the Linear(147 -> 500) weight statistics meeting the
# length-500 cached_zero_vector. Confirm by printing both orderings.
for (module, name), base_param in zip(self.params, base_model.parameters()):
    # running statistics currently stored on the module for this parameter
    mean = module.__getattr__("%s_mean" % name)
    sq_mean = module.__getattr__("%s_sq_mean" % name)

    # first moment: running average over n_models + 1 collected models
    mean = mean * self.n_models.item() / (
        self.n_models.item() + 1.0
    ) + base_param.data / (self.n_models.item() + 1.0)

    # second moment: running average of the squared weights
    sq_mean = sq_mean * self.n_models.item() / (
        self.n_models.item() + 1.0
    ) + base_param.data ** 2 / (self.n_models.item() + 1.0)

    # square root of covariance matrix
    if self.no_cov_mat is False:
        cov_mat_sqrt = module.__getattr__("%s_cov_mat_sqrt" % name)

        # block covariance matrices, store deviation from current mean
        # (the deviation is flattened and appended as one new row)
        dev = (base_param.data - mean).view(-1, 1)
        cov_mat_sqrt = torch.cat((cov_mat_sqrt, dev.view(-1, 1).t()), dim=0)

        # drop the first (oldest) row if we have stored too many models
        if (self.n_models.item() + 1) > self.max_num_models:
            cov_mat_sqrt = cov_mat_sqrt[1:, :]
        module.__setattr__("%s_cov_mat_sqrt" % name, cov_mat_sqrt)

    # write the updated statistics back onto the module
    module.__setattr__("%s_mean" % name, mean)
    module.__setattr__("%s_sq_mean" % name, sq_mean)
# one more model has been folded into the statistics
self.n_models.add_(1)

RuntimeError: The size of tensor a (147) must match the size of tensor b (500) at non-singleton dimension 1

In [191]:
# run swag_parameters on each submodule of the wrapped base model; apply() does the
# looping over submodules, so no explicit loop is needed here
def _register_swag_buffers(submodule):
    """Forward one submodule to swag_parameters with the wrapper's settings."""
    swag_parameters(
        module=submodule,
        params=swag_model.params,
        no_cov_mat=swag_model.no_cov_mat,
    )


swag_model.base.apply(_register_swag_buffers);

In [13]:
swag_model.params[0][0]

Linear(in_features=147, out_features=500, bias=False)

In [14]:
swag_model.base.encoder.encoder.W_i

Linear(in_features=147, out_features=500, bias=False)

In [12]:
swag_model.sample()

In [152]:
def swag_parameters(module, params, no_cov_mat=True):
    """
    Work-in-progress stub: walk the parameters registered directly on ``module``.

    module: submodule of base model
    params: list of params (not modified by this version)
    no_cov_mat: True means SWAG DIAG (not used by this version)
    """
    # module._parameters maps each parameter name of this layer (e.g. 'weight',
    # 'bias' or 'cached_zero_vector') to its parameter, or to None
    for param_name, param in list(module._parameters.items()):
        if param is not None:
            # raw tensor behind the parameter; read here but not used yet
            data = param.data

    print('done one layer thing')

In [188]:
def swag_parameters(module, params, no_cov_mat=True):
    """
    Replace the parameters registered directly on ``module`` with SWAG buffers.

    module: submodule of base model
    params: list that receives one ``(module, name)`` tuple per converted parameter
    no_cov_mat: True means SWAG DIAG (no ``<name>_cov_mat_sqrt`` buffer is registered)
    """
    own_params = module._parameters

    # iterate over a snapshot of the names (e.g. 'weight', 'bias' or
    # 'cached_zero_vector'), because entries are deleted inside the loop
    for name in tuple(own_params):
        param = own_params[name]

        # a registered slot that holds no tensor has nothing to track
        if param is None:
            continue

        # raw tensor behind the parameter; the new buffers copy its size
        data = param.data

        # from here on the parameter no longer lives in module._parameters
        del own_params[name]

        # running first and second moments, both starting at zero
        for suffix in ("_mean", "_sq_mean"):
            module.register_buffer(name + suffix, data.new(data.size()).zero_())

        # full SWAG additionally keeps D (sqrt of covariance matrix): it starts with
        # zero rows and one column per element of the parameter
        if no_cov_mat is False:
            module.register_buffer(
                name + "_cov_mat_sqrt", data.new_empty((0, data.numel())).zero_()
            )

        # remember which (module, name) pairs are tracked by SWAG
        params.append((module, name))
        
        
        
        



In [71]:
def swag_parameters(module, params, no_cov_mat=True):
    """
    Turn every parameter owned directly by ``module`` into zeroed SWAG buffers.

    module: submodule of base model
    params: list of params; gains a ``(module, name)`` tuple per converted parameter
    no_cov_mat: True means SWAG DIAG
    """
    registry = module._parameters

    # decide up front which names to convert: entries holding None are left alone
    tracked_names = [key for key in list(registry.keys()) if registry[key] is not None]

    for name in tracked_names:
        # take the parameter out of the module and keep its underlying tensor
        data = registry.pop(name).data

        # buffers for first moment and second moment, initialised at zero
        mean_key = "{}_mean".format(name)
        sq_mean_key = "{}_sq_mean".format(name)
        module.register_buffer(mean_key, data.new(data.size()).zero_())
        module.register_buffer(sq_mean_key, data.new(data.size()).zero_())

        # if full SWAG, also register D (sqrt of covariance matrix) with zero rows
        if no_cov_mat is False:
            empty_deviations = data.new_empty((0, data.numel())).zero_()
            module.register_buffer("{}_cov_mat_sqrt".format(name), empty_deviations)

        # record the converted parameter in the list of SWAG parameters
        params.append((module, name))

In [None]:
def collect_model(self, base_model):
    """
    Fold the current weights of ``base_model`` into the running SWAG statistics.

    For each ``(module, name)`` entry of ``self.params`` the buffers
    ``<name>_mean`` and ``<name>_sq_mean`` on ``module`` are replaced by the
    running averages over ``n_models + 1`` collected models. When
    ``self.no_cov_mat`` is False, the deviation of the new weights from the
    updated mean is appended as a row to ``<name>_cov_mat_sqrt``, keeping at
    most ``self.max_num_models`` rows. ``self.n_models`` is incremented last.

    Entries are paired with ``base_model`` parameters by qualified name (the
    module's dotted path inside ``self.base`` plus the parameter name), not by
    position. Zipping ``self.params`` against ``base_model.parameters()``
    mis-pairs tensors whenever the two sequences are ordered differently. That
    shows up as a shape-mismatch RuntimeError or, for same-shaped tensors, as
    silently mixed statistics.

    :param base_model: model whose module layout matches ``self.base`` and
        whose current parameter values are collected.
    :raises KeyError: if a tracked module is not part of ``self.base`` or a
        tracked parameter has no counterpart in ``base_model``.
    """
    # dotted path of every submodule of the SWAG copy, keyed by object identity
    module_paths = {id(submodule): path for path, submodule in self.base.named_modules()}
    base_params = dict(base_model.named_parameters())

    # number of models already folded in; constant for the whole update
    n_collected = self.n_models.item()
    n_total = n_collected + 1.0

    for module, name in self.params:
        module_path = module_paths.get(id(module))
        if module_path is None:
            raise KeyError("module owning %r is not a submodule of self.base" % name)

        # the root module has an empty path, so its parameters carry no prefix
        qualified_name = "%s.%s" % (module_path, name) if module_path else name
        if qualified_name not in base_params:
            raise KeyError("base_model has no parameter named %r" % qualified_name)
        weights = base_params[qualified_name].data

        mean = getattr(module, "%s_mean" % name)
        sq_mean = getattr(module, "%s_sq_mean" % name)

        # first moment
        mean = mean * n_collected / n_total + weights / n_total

        # second moment
        sq_mean = sq_mean * n_collected / n_total + weights ** 2 / n_total

        # square root of covariance matrix
        if self.no_cov_mat is False:
            cov_mat_sqrt = getattr(module, "%s_cov_mat_sqrt" % name)

            # block covariance matrices: store the deviation from the updated
            # mean as one new row
            dev = (weights - mean).view(1, -1)
            cov_mat_sqrt = torch.cat((cov_mat_sqrt, dev), dim=0)

            # drop the oldest row if we have stored too many models
            if (n_collected + 1) > self.max_num_models:
                cov_mat_sqrt = cov_mat_sqrt[1:, :]
            setattr(module, "%s_cov_mat_sqrt" % name, cov_mat_sqrt)

        setattr(module, "%s_mean" % name, mean)
        setattr(module, "%s_sq_mean" % name, sq_mean)

    self.n_models.add_(1)