In [None]:
#hide
#skip
! [ -e /content ] && pip install -Uqq mrl-pypi  # upgrade mrl on colab

In [None]:
# default_exp model_zoo

# Model Zoo

> Standard Pretrained Models

In [None]:
#hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
# export
from mrl.imports import *
from mrl.core import *
from mrl.chem import *
from mrl.torch_imports import *
from mrl.torch_core import *
from mrl.layers import *
from mrl.dataloaders import *
from mrl.vocab import *
from mrl.g_models.all import *
from mrl.train.agent import *

from torch.utils.model_zoo import load_url
from torch.hub import download_url_to_file

  return f(*args, **kwds)


## Model Zoo

This module contains a set of standard models with pre-trained weights.

In [None]:
# export 

S3_PREFIX = 'https://dmai-mrl.s3.amazonaws.com/mrl_public'

def model_from_url(weight_filename):
    '''
    model_from_url - loads the file `weight_filename` 
    found under `S3_PREFIX` using `load_url`, with 
    tensors mapped to CPU
    
    Inputs:
    
    - `weight_filename str`: filename to grab from S3
    
    Returns whatever `load_url` returns for that file
    '''
    return load_url('{}/{}'.format(S3_PREFIX, weight_filename),
                    map_location='cpu')

In [None]:
# export

class PretrainedGenerativeAgent(GenerativeAgent):
    '''
    PretrainedGenerativeAgent - base `GenerativeAgent` 
    variant for pretrained models. The weights named by 
    `weight_filename` are fetched with `model_from_url` 
    and loaded into `model` before the agent is built
    
    Inputs:

    - `weight_filename str`: filename to grab from S3

    - `model nn.Module`: model

    - `vocab Vocab`: vocabulary

    - `loss_function Callable`: loss function for supervised training. Should
    function as `loss = loss_function(model_output, y)`

    - `dataset Base_Dataset`: dataset

    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`

    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`

    - `clip float`: gradient clipping

    - `name str`: agent name
    
    '''
    def __init__(self,
                 weight_filename,
                 model,
                 vocab,
                 loss_function,
                 dataset,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='pretrained_model'
                ):
        
        self.weight_filename = weight_filename
        # reuse the module-level helper so the S3 location and the CPU
        # mapping are defined in exactly one place
        model.load_state_dict(model_from_url(weight_filename))
        # path later read by `reload_weights`; presumably where `load_url`
        # stores the download by default - confirm against the torch.hub docs
        self.weight_filepath = f"{torch.hub.get_dir()}/checkpoints/{self.weight_filename}"
        
        super().__init__(model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )
        
    def reload_weights(self):
        '''
        reload_weights - reloads the pretrained weights from 
        `self.weight_filepath` into `self.model` and, when it 
        is an `nn.Module`, into `self.base_model`
        '''
        # Map to CPU like the initial load in `__init__`, so a checkpoint
        # holding CUDA tensors can still be read on a CPU-only machine.
        # The file is read once and the same state dict feeds both models.
        state_dict = torch.load(self.weight_filepath, map_location='cpu')
        self.model.load_state_dict(state_dict)

        if isinstance(self.base_model, nn.Module):
            self.base_model.load_state_dict(state_dict)


## LSTM LM

Models based on the `LSTM_LM` class

In [None]:
# export

def lstm_lm_small(vocab, drop_scale=1.):
    '''
    lstm_lm_small - builds the small `LSTM_LM` 
    configuration (embedding size 256, hidden size 
    1024, 3 layers)
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Supplies the vocabulary 
    size (`len(vocab.itos)`) and the `bos` token index
    
    - `drop_scale float`: multiplier applied to the base 
    dropout value of 0.3
    
    Returns the constructed `LSTM_LM`
    '''
    vocab_size = len(vocab.itos)
    start_token = vocab.stoi['bos']
    
    # input dropout and LSTM dropout use the same scaled value
    dropout = 0.3*drop_scale
    
    return LSTM_LM(vocab_size,   # d_vocab
                   256,          # d_embedding
                   1024,         # d_hidden
                   3,            # n_layers
                   dropout,      # input_dropout
                   dropout,      # lstm_dropout
                   start_token,  # bos_idx
                   False,        # bidir
                   True)         # tie_weights

def lstm_lm_large(vocab, drop_scale=1.):
    '''
    lstm_lm_large - builds the large `LSTM_LM` 
    configuration (embedding size 400, hidden size 
    1552, 5 layers)
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Supplies the vocabulary 
    size (`len(vocab.itos)`) and the `bos` token index
    
    - `drop_scale float`: multiplier applied to the base 
    dropout value of 0.3
    
    Returns the constructed `LSTM_LM`
    '''
    vocab_size = len(vocab.itos)
    start_token = vocab.stoi['bos']
    
    # input dropout and LSTM dropout use the same scaled value
    dropout = 0.3*drop_scale
    
    return LSTM_LM(vocab_size,   # d_vocab
                   400,          # d_embedding
                   1552,         # d_hidden
                   5,            # n_layers
                   dropout,      # input_dropout
                   dropout,      # lstm_dropout
                   start_token,  # bos_idx
                   False,        # bidir
                   True)         # tie_weights

In [None]:
# Smoke test: each LSTM_LM factory must return an `nn.Module`.
# Models are built one at a time, small first, then large.
vocab = CharacterVocab(SMILES_CHAR_VOCAB)
for model in (factory(vocab) for factory in (lstm_lm_small, lstm_lm_large)):
    assert isinstance(model, nn.Module)

In [None]:
# export

class LSTM_LM_Small_ZINC(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_ZINC - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the ZINC library. Weights come from 
    `lstmlm_small_zinc.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_zinc'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_zinc.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)


In [None]:
# slow

agent = LSTM_LM_Small_ZINC()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

# also exercise reloading the cached weights
agent.reload_weights()

In [None]:
# export

class LSTM_LM_Small_Chembl(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Chembl - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the Chembl library. Weights come from 
    `lstmlm_small_chembl.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_chembl'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_chembl.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_Chembl()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_ZINC_NC(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_ZINC_NC - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the ZINC library with no chirality markers 
    (ie no '@' symbols). The vocab applies `remove_stereo` 
    as both `prefunc` and `postfunc`. Weights come from 
    `lstmlm_small_zinc_nc.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_zinc_nc'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB,
                               prefunc=remove_stereo,
                               postfunc=remove_stereo)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_zinc_nc.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)
        

In [None]:
# slow

agent = LSTM_LM_Small_ZINC_NC()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_Chembl_NC(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Chembl_NC - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the Chembl library with no chirality markers 
    (ie no '@' symbols). The vocab applies `remove_stereo` 
    as both `prefunc` and `postfunc`. Weights come from 
    `lstmlm_small_chembl_nc.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_chembl_nc'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB,
                               prefunc=remove_stereo,
                               postfunc=remove_stereo)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_chembl_nc.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_Chembl_NC()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export
        
class LSTM_LM_Small_ZINC_Selfies(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_ZINC_Selfies - pretrained agent built on 
    the small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the ZINC library using SELFIES-type 
    tokenization. The vocab is a `FuncVocab` that splits 
    with `split_selfie` and uses `smile_to_selfie` / 
    `selfie_to_smile` as `prefunc` / `postfunc`. Weights 
    come from `lstmlm_small_zinc_selfies.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_zinc_selfies'
                ):
        
        vocab = FuncVocab(SELFIES_VOCAB,
                          split_selfie,
                          prefunc=smile_to_selfie,
                          postfunc=selfie_to_smile)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_zinc_selfies.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_ZINC_Selfies()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export
        
class LSTM_LM_Small_Chembl_Selfies(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Chembl_Selfies - pretrained agent built on 
    the small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the Chembl library using SELFIES-type 
    tokenization. The vocab is a `FuncVocab` that splits 
    with `split_selfie` and uses `smile_to_selfie` / 
    `selfie_to_smile` as `prefunc` / `postfunc`. Weights 
    come from `lstmlm_small_chembl_selfies.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_chembl_selfies'
                ):
        
        vocab = FuncVocab(SELFIES_VOCAB,
                          split_selfie,
                          prefunc=smile_to_selfie,
                          postfunc=selfie_to_smile)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_chembl_selfies.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)
        

In [None]:
# slow

agent = LSTM_LM_Small_Chembl_Selfies()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_Rgroup(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Rgroup - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    R-groups. R-groups are SMILES with the format `*R`. 
    Weights come from `lstmlm_small_rgroup.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_rgroup'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_rgroup.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_Rgroup()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_Linkers(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Linkers - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    linkers. Linkers are SMILES with the format `*R*`. 
    The vocab applies `remove_stereo` as `prefunc`. 
    Weights come from `lstmlm_small_linkers.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_linkers'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB, prefunc=remove_stereo)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_linkers.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)
        

In [None]:
# slow

agent = LSTM_LM_Small_Linkers()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_Linkers_Mapped(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Linkers_Mapped - pretrained agent built 
    on the small `LSTM_LM` model (`lstm_lm_small`), trained 
    on linkers with mapping tokens for compatibility with 
    `LinkerBlockTemplate`. Linkers are SMILES with the 
    format `*R*`. The vocab is a `CharacterReplaceVocab` 
    that replaces `[2*:1]` with `X` and `[2*:2]` with `Y`, 
    and applies `remove_stereo` as `prefunc`. Weights come 
    from `lstmlm_small_linkers_mapped.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_linkers_mapped'
                ):
        
        # the two mapped attachment tokens are swapped for single characters
        mapping_tokens = {'[2*:1]':'X', '[2*:2]':'Y'}
        vocab = CharacterReplaceVocab(SMILES_CHAR_VOCAB,
                                      replace_dict=mapping_tokens,
                                      prefunc=remove_stereo)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_linkers_mapped.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_Linkers_Mapped()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_Swissprot(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_Swissprot - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on the 
    Swissprot protein dataset. This model was trained on 
    protein sequences of 650 amino acids or fewer. Weights 
    come from `lstmlm_small_swissprot.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_swissprot'
                ):
        
        vocab = CharacterVocab(AMINO_ACID_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_swissprot.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_Swissprot()

preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
# The other sampling cells convert lists with plural helpers (`to_mols`,
# `to_dnas`); `to_protein` is singular, so it is applied to each sampled
# sequence individually. Passing the whole list at once would leave the
# None-filter below iterating over whatever `to_protein` returns for a list,
# which presumably makes the validity check meaningless - TODO confirm
# against `mrl.chem`.
proteins = [to_protein(i) for i in smiles]
proteins = [i for i in proteins if i is not None]
assert len(proteins)>80

In [None]:
# export

class LSTM_LM_Small_PI1M(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_PI1M - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    a chunk of the PI1M polymer library. Weights come 
    from `lstmlm_small_pi1m.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_pi1m'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_pi1m.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['*C*'], vocab),  # dataset holding the single string '*C*'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_PI1M()

# sample from the pretrained model, decode, and keep only the
# decoded strings that convert to a non-None molecule
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [mol for mol in to_mols(smiles) if mol is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_HGenome(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_HGenome - pretrained agent built on the 
    small `LSTM_LM` model (`lstm_lm_small`), trained on 
    400bp chunks of the human genome. Weights come from 
    `lstmlm_small_hgenome.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_hgenome'
                ):
        
        vocab = CharacterVocab(NUCLEIC_ACID_VOCAB)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        super().__init__('lstmlm_small_hgenome.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_HGenome()

# sample from the pretrained model, decode, and keep only the
# decoded strings that `to_dnas` converts to a non-None result
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = [dna for dna in to_dnas(smiles) if dna is not None]
assert len(mols) > 80

In [None]:
# export

class LSTM_LM_Small_HGenome_3Mer(PretrainedGenerativeAgent):
    '''
    LSTM_LM_Small_HGenome_3Mer - pretrained agent built on 
    the small `LSTM_LM` model (`lstm_lm_small`), trained on 
    400bp chunks of the human genome with 3-mer 
    tokenization (`KmerVocab` over `DNA_TRIMERS`). Weights 
    come from `lstmlm_small_hgenome_3mer.pt`
    
    Inputs:
    
    - `drop_scale float`: scale applied to the model dropout values
    
    - `base_update float`: update fraction for the baseline model, applied 
    as `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for the baseline model
    
    - `base_model bool`: pass False to skip creating the baseline model
    
    - `opt_kwargs dict`: keyword arguments passed to `optim.Adam`
    
    - `clip float`: gradient clipping
    
    - `name str`: agent name
    '''
    def __init__(self,
                 drop_scale=1.,
                 base_update=0.97,
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name='lstmlm_small_hgenome_3mer'
                ):
        
        vocab = KmerVocab(DNA_TRIMERS, 3)
        
        # settings forwarded unchanged to `PretrainedGenerativeAgent`
        agent_kwargs = dict(base_update=base_update,
                            base_update_iter=base_update_iter,
                            base_model=base_model,
                            opt_kwargs=opt_kwargs,
                            clip=clip,
                            name=name)
        
        # positional order: weight file, model, vocab, loss function, dataset
        # NOTE(review): the dataset item 'C' is shorter than one 3-mer -
        # confirm `KmerVocab` handles it if this dataset is ever iterated
        super().__init__('lstmlm_small_hgenome_3mer.pt',
                         lstm_lm_small(vocab, drop_scale=drop_scale),
                         vocab,
                         CrossEntropy(),
                         Text_Dataset(['C'], vocab),  # dataset holding the single string 'C'
                         **agent_kwargs)

In [None]:
# slow

agent = LSTM_LM_Small_HGenome_3Mer()

# sample from the pretrained model, decode, and keep only the
# decoded strings that `to_dnas` converts to a non-None result
preds, _ = agent.model.sample_no_grad(100, 33)
smiles = agent.reconstruct(preds)
mols = [dna for dna in to_dnas(smiles) if dna is not None]
assert len(mols) > 80

## Conditional LSTM LM

Models based on the `Conditional_LSTM_LM` class

In [None]:
# export

def cond_lstm_small(vocab, encoder, drop_scale=1.):
    '''
    cond_lstm_small - small conditional LSTM_LM model
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Must expose `itos` and a 
    `stoi` mapping that contains the `'bos'` token
    
    - `encoder nn.Module`: encoder module
    
    - `drop_scale float`: scale dropout values
    
    Returns:
    
    - `Conditional_LSTM_LM`: newly constructed model. No weights 
    are loaded by this function
    '''
    
    d_vocab = len(vocab.itos)
    bos_idx = vocab.stoi['bos']
    
    # architecture sizes
    d_embedding = 256
    d_hidden = 1024
    d_latent = 512
    n_layers = 3
    
    # latent conditioning flags
    norm_latent = True
    condition_hidden = True
    condition_output = False
    
    # base dropout of 0.3, scaled by `drop_scale`
    input_dropout = 0.3*drop_scale
    lstm_dropout = 0.3*drop_scale
    
    # all arguments are passed positionally - the order below matters
    model = Conditional_LSTM_LM(encoder, 
                                d_vocab, 
                                d_embedding, 
                                d_hidden, 
                                d_latent, 
                                n_layers,
                                input_dropout, 
                                lstm_dropout, 
                                norm_latent, 
                                condition_hidden, 
                                condition_output, 
                                bos_idx)
    
    return model

def cond_lstm_large(vocab, encoder, drop_scale=1.):
    '''
    cond_lstm_large - large conditional LSTM_LM model
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Must expose `itos` and a 
    `stoi` mapping that contains the `'bos'` token
    
    - `encoder nn.Module`: encoder module
    
    - `drop_scale float`: scale dropout values
    
    Returns:
    
    - `Conditional_LSTM_LM`: newly constructed model. No weights 
    are loaded by this function
    '''

    d_vocab = len(vocab.itos)
    bos_idx = vocab.stoi['bos']
    
    # architecture sizes
    d_embedding = 400
    d_hidden = 1552
    d_latent = 512
    n_layers = 5
    
    # latent conditioning flags
    norm_latent = True
    condition_hidden = True
    condition_output = False
    
    # base dropout of 0.3, scaled by `drop_scale`
    input_dropout = 0.3*drop_scale
    lstm_dropout = 0.3*drop_scale
    
    # all arguments are passed positionally - the order below matters
    model = Conditional_LSTM_LM(encoder, 
                                d_vocab, 
                                d_embedding, 
                                d_hidden, 
                                d_latent, 
                                n_layers,
                                input_dropout, 
                                lstm_dropout, 
                                norm_latent, 
                                condition_hidden, 
                                condition_output, 
                                bos_idx)
    
    return model

def mlp_cond_lstm_small(vocab, drop_scale=1.):
    '''
    mlp_cond_lstm_small - small conditional 
    LSTM_LM model with MLP encoder
    
    Inputs:
    
    - `vocab Vocab`: vocab to use
        
    - `drop_scale float`: scale dropout values. Applied to the 
    encoder dropouts and forwarded to `cond_lstm_small`
    
    Returns:
    
    - model produced by `cond_lstm_small` around an `MLP_Encoder`
    '''
    hidden_dims = [1024, 512]
    # one dropout value per encoder layer: 0.1 scaled by `drop_scale`
    layer_drops = [0.1*drop_scale for _ in hidden_dims]
    
    mlp_encoder = MLP_Encoder(2048, hidden_dims, 512, layer_drops)
    
    return cond_lstm_small(vocab, mlp_encoder, drop_scale=drop_scale)

def mlp_cond_lstm_large(vocab, drop_scale=1.):
    '''
    mlp_cond_lstm_large - large conditional 
    LSTM_LM model with MLP encoder
    
    Inputs:
    
    - `vocab Vocab`: vocab to use
        
    - `drop_scale float`: scale dropout values. Applied to the 
    encoder dropouts and forwarded to `cond_lstm_large`
    
    Returns:
    
    - model produced by `cond_lstm_large` around an `MLP_Encoder` 
    with four hidden sizes
    '''
    encoder_dims = [1024, 512, 512, 512]
    
    # one dropout value per encoder layer: 0.2 scaled by `drop_scale`.
    # These scaled values (not fixed 0.2 constants) must reach the encoder
    # so that `drop_scale` affects the whole model
    enc_drops = [0.2*drop_scale for _ in encoder_dims]
    
    encoder = MLP_Encoder(2048, encoder_dims, 512, enc_drops)
    
    # the large variant is built with the large decoder configuration
    return cond_lstm_large(vocab, encoder, drop_scale=drop_scale)

In [None]:
# Smoke test: both MLP-conditioned factories must return an `nn.Module`
vocab = CharacterVocab(SMILES_CHAR_VOCAB)
model = mlp_cond_lstm_small(vocab)
assert isinstance(model, nn.Module)
# `model` is rebound on purpose; only the type of each result is checked
model = mlp_cond_lstm_large(vocab)
assert isinstance(model, nn.Module)

In [None]:
# export

class FP_Cond_LSTM_LM_Small_ZINC(PretrainedGenerativeAgent):
    '''
    FP_Cond_LSTM_LM_Small_ZINC - small 
    `Conditional_LSTM_LM` model trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using a chunk of the ZINC database
    
    The model is built with `mlp_cond_lstm_small` over a 
    `CharacterVocab` and the weight file 
    `fp_cond_lstmlm_small_zinc.pt` is handed to 
    `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_cond_lstmlm_small_zinc'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        model = mlp_cond_lstm_small(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_cond_lstmlm_small_zinc.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_Cond_LSTM_LM_Small_ZINC()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80

In [None]:
# export

class FP_Cond_LSTM_LM_Small_ZINC_Selfies(PretrainedGenerativeAgent):
    '''
    FP_Cond_LSTM_LM_Small_ZINC_Selfies - small 
    `Conditional_LSTM_LM` model trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using a chunk of the ZINC database with 
    SELFIES encoding
    
    The model is built with `mlp_cond_lstm_small` over a 
    `FuncVocab` of `SELFIES_VOCAB` and the weight file 
    `fp_cond_lstmlm_small_zinc_selfies.pt` is handed to 
    `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_cond_lstmlm_small_zinc_selfies'
                ):
        
        # `smile_to_selfie` / `selfie_to_smile` are given to the vocab as its
        # pre- and post-processing functions; presumably they convert between
        # SMILES and SELFIES around tokenization - TODO confirm in `FuncVocab`
        vocab = FuncVocab(SELFIES_VOCAB, split_selfie, 
                    prefunc=smile_to_selfie, postfunc=selfie_to_smile)
        model = mlp_cond_lstm_small(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_cond_lstmlm_small_zinc_selfies.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_Cond_LSTM_LM_Small_ZINC_Selfies()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80

In [None]:
# export

class FP_Cond_LSTM_LM_Small_Chembl(PretrainedGenerativeAgent):
    '''
    FP_Cond_LSTM_LM_Small_Chembl - small 
    `Conditional_LSTM_LM` model trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using the Chembl database
    
    The model is built with `mlp_cond_lstm_small` over a 
    `CharacterVocab` and the weight file 
    `fp_cond_lstmlm_small_chembl.pt` is handed to 
    `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_cond_lstmlm_small_chembl'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        model = mlp_cond_lstm_small(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_cond_lstmlm_small_chembl.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_Cond_LSTM_LM_Small_Chembl()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80

## VAE

Models based on `VAE`

In [None]:
# export

def mlp_vae(vocab, drop_scale=1.):
    '''
    mlp_vae - VAE with MLP encoder
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Must expose `itos` and a 
    `stoi` mapping that contains the `'bos'` token
        
    - `drop_scale float`: scale dropout values
    
    Returns:
    
    - `MLP_VAE`: newly constructed model. No weights are 
    loaded by this function
    '''
    
    vocab_size = len(vocab.itos)
    bos_token_idx = vocab.stoi['bos']
    
    # MLP encoder: input width, hidden sizes and one dropout per hidden size
    enc_input_dim = 2048
    enc_hidden_dims = [1024, 512]
    enc_dropouts = [0.2*drop_scale for _ in enc_hidden_dims]
    
    # sizes for the embedding, LSTM hidden state, LSTM depth and latent vector
    embedding_dim, hidden_dim, num_layers, latent_dim = 256, 1024, 3, 512
    
    # keyword options forwarded unchanged to `MLP_VAE`
    vae_options = dict(input_dropout=0.3*drop_scale,
                       lstm_dropout=0.3*drop_scale,
                       condition_hidden=True,
                       condition_output=True,
                       bos_idx=bos_token_idx)

    return MLP_VAE(vocab_size,
                   embedding_dim,
                   enc_input_dim,
                   enc_hidden_dims,
                   enc_dropouts,
                   hidden_dim,
                   num_layers,
                   latent_dim,
                   **vae_options)
    
def conv_vae(vocab, drop_scale=1.):
    '''
    conv_vae - VAE with convolutional encoder
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Must expose `itos` and a 
    `stoi` mapping that contains the `'bos'` token
        
    - `drop_scale float`: scale dropout values
    
    Returns:
    
    - `Conv_VAE`: newly constructed model. No weights are 
    loaded by this function
    '''
    
    d_vocab = len(vocab.itos)
    bos_idx = vocab.stoi['bos']
    
    d_embedding = 256
    
    # convolutional encoder: one filter count, kernel size, stride and
    # dropout value per convolution
    conv_filters = [256, 512, 512]
    kernel_sizes = [7, 7, 7]
    strides = [2, 2, 2]
    conv_drops = [0.2*drop_scale, 0.2*drop_scale, 0.2*drop_scale]
    
    d_hidden = 1024
    n_layers = 3
    d_latent = 512
    condition_hidden = True
    condition_output = True
    
    # base dropout of 0.3, scaled by `drop_scale`
    input_dropout = 0.3*drop_scale
    lstm_dropout = 0.3*drop_scale
    
    model = Conv_VAE(
                    d_vocab,
                    d_embedding,
                    conv_filters,
                    kernel_sizes,
                    strides,
                    conv_drops,
                    d_hidden,
                    n_layers,
                    d_latent,
                    input_dropout=input_dropout,
                    lstm_dropout=lstm_dropout,
                    condition_hidden=condition_hidden,
                    condition_output=condition_output,
                    bos_idx=bos_idx)
    
    return model
    
def lstm_vae(vocab, drop_scale=1.):
    '''
    lstm_vae - VAE with LSTM encoder
    
    Inputs:
    
    - `vocab Vocab`: vocab to use. Must expose `itos` and a 
    `stoi` mapping that contains the `'bos'` token
        
    - `drop_scale float`: scale dropout values
    
    Returns:
    
    - `LSTM_VAE`: newly constructed model. No weights are 
    loaded by this function
    '''
    
    d_vocab = len(vocab.itos)
    bos_idx = vocab.stoi['bos']
    
    d_embedding = 256
    d_hidden = 1024
    n_layers = 3
    d_latent = 512
    condition_hidden = True
    condition_output = True
    
    # base dropout of 0.3, scaled by `drop_scale`
    input_dropout = 0.3*drop_scale
    lstm_dropout = 0.3*drop_scale
    
    model = LSTM_VAE(
                    d_vocab,
                    d_embedding,
                    d_hidden,
                    n_layers,
                    d_latent,
                    input_dropout=input_dropout,
                    lstm_dropout=lstm_dropout,
                    condition_hidden=condition_hidden,
                    condition_output=condition_output,
                    bos_idx=bos_idx,
                )

    return model

In [None]:
# Smoke test: each VAE factory must return an `nn.Module`
vocab = CharacterVocab(SMILES_CHAR_VOCAB)
# `model` is rebound on purpose; only the type of each result is checked
model = mlp_vae(vocab)
assert isinstance(model, nn.Module)
model = conv_vae(vocab)
assert isinstance(model, nn.Module)
model = lstm_vae(vocab)
assert isinstance(model, nn.Module)

In [None]:
# export

class FP_VAE_ZINC(PretrainedGenerativeAgent):
    '''
    FP_VAE_ZINC - MLP-to-LSTM VAE trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using the ZINC database
    
    The model is built with `mlp_vae` over a `CharacterVocab` 
    and the weight file `fp_vae_zinc.pt` is handed to 
    `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_vae_zinc'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        model = mlp_vae(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_vae_zinc.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_VAE_ZINC()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80

In [None]:
# export

class FP_VAE_Chembl(PretrainedGenerativeAgent):
    '''
    FP_VAE_Chembl - MLP-to-LSTM VAE trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using the Chembl database
    
    The model is built with `mlp_vae` over a `CharacterVocab` 
    and the weight file `fp_vae_chembl.pt` is handed to 
    `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_vae_chembl'
                ):
        
        vocab = CharacterVocab(SMILES_CHAR_VOCAB)
        model = mlp_vae(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_vae_chembl.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_VAE_Chembl()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80

In [None]:
# export

class FP_VAE_ZINC_Selfies(PretrainedGenerativeAgent):
    '''
    FP_VAE_ZINC_Selfies - MLP-to-LSTM VAE trained to 
    reconstruct SMILES from an ECFP6 fingerprint 
    using the ZINC database with SELFIES encoding
    
    The model is built with `mlp_vae` over a `FuncVocab` of 
    `SELFIES_VOCAB` and the weight file `fp_vae_zinc_selfies.pt` 
    is handed to `PretrainedGenerativeAgent`
    
    Inputs:
    
    - `drop_scale float`: dropout scale
    
    - `base_update float`: update fraction for the baseline model. Updates
    the base model following `base_model = base_update*base_model + (1-base_update)*model`
    
    - `base_update_iter int`: update frequency for baseline model

    - `base_model bool`: if False, baseline model will not be created

    - `opt_kwargs dict`: dictionary of keyword arguments passed to `optim.Adam`. 
    A copy is forwarded, so the caller's dictionary is not shared with the agent

    - `clip float`: gradient clipping

    - `name str`: agent name
    '''
    def __init__(self, 
                 drop_scale=1.,
                 base_update=0.97, 
                 base_update_iter=5,
                 base_model=True,
                 opt_kwargs={},
                 clip=1.,
                 name = 'fp_vae_zinc_selfies'
                ):
        
        # `smile_to_selfie` / `selfie_to_smile` are given to the vocab as its
        # pre- and post-processing functions; presumably they convert between
        # SMILES and SELFIES around tokenization - TODO confirm in `FuncVocab`
        vocab = FuncVocab(SELFIES_VOCAB, split_selfie, 
                    prefunc=smile_to_selfie, postfunc=selfie_to_smile)
        
        model = mlp_vae(vocab, drop_scale=drop_scale)
        weight_filename = 'fp_vae_zinc_selfies.pt'
        
        loss_function = CrossEntropy()
        
        # fingerprint function used by the dataset to featurize inputs
        fp_function = partial(failsafe_fp, fp_function=ECFP6)
        
        # one-item dataset; presumably a placeholder so the agent can be
        # constructed before real data is supplied - TODO confirm
        dataset = Vec_To_Text_Dataset(['C'], vocab, fp_function)
        
        # the `{}` default in the signature is a single object shared by every
        # call, so forward a fresh copy instead of the object itself
        opt_kwargs = dict(opt_kwargs or {})
        
        super().__init__(weight_filename,
                         model,
                         vocab, 
                         loss_function,
                         dataset,
                         base_update=base_update,
                         base_update_iter=base_update_iter,
                         base_model=base_model,
                         opt_kwargs=opt_kwargs,
                         clip=clip,
                         name=name
                         )

In [None]:
# slow

# Smoke test: build the pretrained agent with its default arguments
agent = FP_VAE_ZINC_Selfies()

# Draw samples from the agent's model; the two arguments are presumably
# (batch size, sequence length) - TODO confirm against `sample_no_grad`
preds, _ = agent.model.sample_no_grad(100, 100)
smiles = agent.reconstruct(preds)
mols = to_mols(smiles)
# keep only the entries that are not None
mols = [i for i in mols if i is not None]
# require more than 80 non-None results
assert len(mols)>80