In [None]:
# default_exp vocab

# Vocab

> Functions related to converting strings into tensors

In [None]:
#hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
# export
from mrl.imports import *

## Tokenization

Tokenization defines how we break text strings (i.e. SMILES strings) down into subunits that are fed to the model. The standard process goes as follows:
1. A tokenization process breaks a string down into tokens
2. Tokens are mapped to integers
3. The token integers are sent to the model

This brings up the problem of how best to tokenize smiles. The following methods are implemented out of the box:

### Character Tokenization

Character Tokenization is when we break down SMILES by character. This is implemented with the `tokenize_by_character` function.

```
tokenize_by_character('CC[NH]CC')
>> ['C', 'C', '[', 'N', 'H', ']', 'C', 'C']
```

This form of tokenization is quick and simple. One drawback of this approach is some characters might be overloaded. For example, `Br` is tokenized to `['B', 'r']`, leading to the `B` token meaning both boron (in the standard context) and Bromine (in the `Br` context). In practice, this isn't much of an issue. Language models are particularly adept at learning co-location of tokens.

### Character Tokenization with Replacement

Character tokenization with replacement is the same as character tokenization except we add a dictionary of multi-character tokens to be replaced with single-character tokens. This dictionary has the form `{multi_character_token : single_character_token}`. Before tokenizing by character, all instances of `multi_character_token` are replaced with `single_character_token`. Character Tokenization with Replacement is implemented with the `tokenize_with_replacements` function.

```
replacement_dict = {'Br' : 'R', 'Cl' : 'L'}
tokenize_with_replacements('[Cl]CC[Br]', replacement_dict)
>> ['[', 'L', ']', 'C', 'C', '[', 'R', ']']
```

### Regex Tokenization

Regex tokenization uses a regex string to decompose SMILES. This is mainly used to keep bracketed terms (e.g. `[O-]`) as single tokens. This method avoids character overloading by keeping all bracketed terms as individual tokens, but it tends to generate a large number of low-frequency tokens. Regex tokenization is implemented with the `regex_tokenize` function.

```
SMILE_REGEX = """(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|H|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"""
regex_tokenize('CCC[Br]', re.compile(SMILE_REGEX))
>>['C', 'C', 'C', '[Br]']
```



In [None]:
# export

# Single-character tokens for character-level SMILES tokenization.
# Multi-character atoms are covered by their individual characters
# (e.g. 'Br' is tokenized as 'B', 'r').
SMILES_CHAR_VOCAB = ['#', '(', ')', '+', '-', '/', '0',
                 '1', '2', '3', '4', '5', '6', '7',
                 '8', '9', '=', '@', 'B', 'C', 'F', 'H',
                 'I', 'N', 'O', 'P', 'S', '[', '\\',
                 ']', 'c', 'i', 'l', 'n', 'o', 'r', 's',
                 '*', ':', '.', 'a', 'K', 'e']


# Beginning of sentence, end of sentence, padding and unknown.
# `Vocab.__init__` hard-codes the same four tokens in the same order.
SPECIAL_TOKENS = ['bos', 'eos', 'pad', 'unk']

# Mapping tokens of the form `[{isotope}*:{map_num}]`
MAPPING_TOKENS = ['[1*:1]', '[2*:1]', '[1*:2]', '[2*:2]', '[1*:3]',
                  '[2*:3]', '[1*:4]', '[2*:4]', '[1*:5]', '[2*:5]']

# Replacement dictionary {multi_character_token : single_character_token}
# for the two-character halogens
HALOGEN_REPLACE = {'Br':'R',
                   'Cl':'L'}

# Replacement dictionary assigning each entry of MAPPING_TOKENS a single letter.
# NOTE(review): several of these letters (A, D, E, G, M, Q, T, V) also appear in
# AMINO_ACID_VOCAB - presumably this dictionary is meant for SMILES only; confirm.
MAPPING_REPLACE = {'[1*:1]':'A',
                   '[2*:1]':'D',
                   '[1*:2]':'E',
                   '[2*:2]':'G',
                   '[1*:3]':'J',
                   '[2*:3]':'M',
                   '[1*:4]':'Q',
                   '[2*:4]':'T',
                   '[1*:5]':'U', 
                   '[2*:5]':'V'}

# Twenty single-letter tokens; the same letters appear as alternatives in AA_MAPPING_REGEX
AMINO_ACID_VOCAB = ['A', 'C', 'D', 'E', 'F',
                     'G', 'H', 'I', 'K', 'L',
                     'M', 'N', 'P', 'Q', 'R',
                     'S', 'T', 'V', 'W', 'Y']

# Bracketed tokens, one per list entry, for use as the `itos` of a SELFIES vocabulary.
# Backslashes are escaped, so '[\\C]' is the five-character token `[\C]`.
SELFIES_VOCAB = ['[C]', '[Ring1]', '[=C]', '[Branch1_1]',
             '[N]', '[Branch1_2]', '[=O]', '[O]', '[Branch2_1]',
             '[=N]', '[Ring2]', '[C@Hexpl]', '[C@@Hexpl]', '[F]',
             '[S]', '[Branch1_3]', '[Branch2_2]', '[Branch2_3]', '[#C]',
             '[Expl=Ring1]', '[P]', '[Cl]', '[NHexpl]', '[Br]',
             '[/C]', '[C@expl]', '[C@@expl]', '[#N]', '[O-expl]',
             '[N+expl]', '[Expl=Ring2]', '[\\C]', '[=S]', '[I]',
             '[S@expl]', '[S@@expl]', '[=N+expl]', '[/N]', '[/Cl]',
             '[\\Cl]', '[/O]', '[/S]', '[Siexpl]', '[\\S]',
             '[=S@expl]', '[=S@@expl]', '[\\N]', '[/C@@Hexpl]', '[/C@Hexpl]',
             '[\\O]', '[\\C@Hexpl]', '[\\C@@Hexpl]', '[B]', '[/F]',
             '[/C@expl]', '[\\C@expl]', '[CHexpl]', '[\\F]', '[P@expl]',
             '[Cexpl]', '[/C@@expl]', '[\\C@@expl]', '[=P]', '[P@@expl]',
             '[/NH+expl]', '[/S-expl]', '[=NH+expl]', '[N-expl]', '[NH+expl]',
             '[NH2+expl]', '[NH3+expl]', '[S-expl]', '[\\NHexpl]', '[\\O-expl]', 
             '[\\S-expl]']

# includes tokens that appear <500 times in a dataset of 79 million compounds
# NOTE(review): this list contains repeated entries (e.g. '[N-expl]', '[/Br]',
# '[\\O-expl]', '[\\NHexpl]'). `Vocab` builds `stoi` with a dict comprehension over
# `itos`, so for a repeated token the last occurrence's index wins and the earlier
# slots are never produced by `numericalize`. The entries are left untouched here
# because removing them would shift every later token index.
SELFIES_EXPANDED_VOCAB = ['[C]', '[Ring1]', '[=C]',
             '[Branch1_1]', '[N]', '[Branch1_2]', '[=O]', '[O]', '[Branch2_1]',
             '[=N]', '[Ring2]', '[C@Hexpl]', '[C@@Hexpl]', '[F]', '[S]',
             '[Branch1_3]', '[Branch2_2]', '[Branch2_3]', '[#C]', '[Expl=Ring1]', '[P]',
             '[Cl]', '[NHexpl]', '[Br]', '[/C]', '[C@expl]', '[C@@expl]',
             '[#N]', '[O-expl]', '[N+expl]', '[Expl=Ring2]', '[\\C]', '[=S]',
             '[I]', '[S@expl]', '[S@@expl]', '[=N+expl]', '[/N]', '[/Cl]',
             '[\\Cl]', '[/O]', '[/S]', '[Siexpl]', '[\\S]', '[=S@expl]',
             '[=S@@expl]', '[\\N]', '[/C@@Hexpl]', '[/C@Hexpl]', '[\\O]', '[\\C@Hexpl]',
             '[\\C@@Hexpl]', '[B]', '[/F]', '[/C@expl]', '[\\C@expl]', '[CHexpl]',
             '[\\F]', '[P@expl]', '[Cexpl]', '[/C@@expl]', '[\\C@@expl]', '[=P]',
             '[P@@expl]', '[/Br]', '[=N-expl]', '[/N+expl]', '[S+expl]', '[\\NHexpl]',
             '[\\Br]', '[/NHexpl]', '[N@+expl]', '[/S@expl]', '[N@@+expl]', '[N-expl]',
             '[/S@@expl]', '[CH2expl]', '[=P@expl]', '[Oexpl]', '[Snexpl]', '[\\S@expl]',
             '[C-expl]', '[/B]', '[\\N+expl]', '[#N+expl]', '[=P@@expl]', 
             '[/NH+expl]', '[/S-expl]', '[=NH+expl]', '[N-expl]', '[NH+expl]',
             '[NH2+expl]', '[NH3+expl]', '[S-expl]', '[\\NHexpl]', '[\\O-expl]', 
             '[\\S-expl]', '[CH-expl]',
             '[\\O-expl]', '[Expl/Ring2]', '[/Oexpl]', '[B-expl]', '[S@@+expl]', '[=S+expl]',
             '[P+expl]', '[/O-expl]', '[PHexpl]', '[=S@+expl]', '[P@@Hexpl]', '[\\I]',
             '[Expl/Ring1]', '[Expl\\Ring2]', '[S@+expl]', '[/I]', '[Nexpl]', '[=B]',
             '[=O+expl]', '[O+expl]', '[CH2-expl]', '[B@-expl]', '[=S@@+expl]', '[B@@-expl]',
             '[\\B]', '[/S+expl]', '[SHexpl]', '[\\S@@expl]', '[\\P@@expl]', '[/P@expl]',
             '[=P@@Hexpl]', '[\\P@expl]', '[/P@@expl]', '[/Siexpl]', '[=17Oexpl]', '[=Nexpl]',
             '[I+expl]', '[=P@Hexpl]', '[\\Snexpl]', '[\\C-expl]', '[=SHexpl]', '[\\Siexpl]',
             '[SnH4+2expl]', '[Sn+expl]', '[=Snexpl]', '[=P+expl]', '[C+expl]', '[N@@H+expl]',
             '[Sn+3expl]', '[/C-expl]', '[/Cexpl]', '[BH3-expl]', '[\\CH-expl]', '[=Siexpl]',
             '[/CHexpl]', '[/Snexpl]', '[BH2-expl]', '[\\Cexpl]', '[\\P]', '[=PHexpl]',
             '[#N+expl]', '[#NH+expl]', '[#PHexpl]', '[#P]', '[#Pexpl]', '[#SHexpl]',
             '[#S]', '[#Sexpl]', '[/Br]', '[/CHexpl]', '[/Cexpl]', '[/N+expl]',
             '[/NHexpl]', '[/O-expl]', '[/PHexpl]', '[/P]', '[/SHexpl]', '[=CHexpl]', '[=Cexpl]',
             '[=N-expl]', '[=P+expl]', '[=P@@expl]', '[=P@expl]', '[=PHexpl]', '[=Pexpl]',
             '[=S-expl]', '[=SHexpl]', '[=Sexpl]', '[=Siexpl]', '[Expl#Ring1]', '[Expl#Ring2]',
             '[Expl/Ring1]', '[Expl/Ring2]', '[Expl\\Ring1]', '[Expl\\Ring2]', '[P+expl]', '[PHexpl]',
             '[Pexpl]', '[SHexpl]', '[Sexpl]', '[\\Br]', '[\\CHexpl]', '[\\Cexpl]',
             '[\\I]', '[\\N+expl]', '[\\SHexpl]', '[\\Siexpl]']


In [None]:
def pad_vocab(vocab, multiple=8, pad_token='extra'):
    '''
    pad_vocab - pads `vocab` so its length is a multiple of `multiple`

    Inputs:

        `vocab` - list, list of tokens

        `multiple` - int, the returned length is rounded up to a multiple of this value

        `pad_token` - str, filler token appended to reach the target length

    Returns:

        list - `vocab` itself when its length is already a multiple of `multiple`,
        otherwise a new list with `pad_token` appended as many times as needed
    '''
    # Modulo of a negative int is non-negative in Python, so this is the distance
    # up to the next multiple (0 when already aligned). Integer arithmetic keeps
    # the count usable as a list repeat factor.
    to_add = -len(vocab) % multiple
    if to_add:
        vocab = vocab + [pad_token]*to_add

    return vocab

These are regex patterns to decompose smiles into tokens

`SMILE_REGEX` is based off [this work](https://github.com/pschwllr/MolecularTransformer/blob/master/README.md). The pattern decomposes SMILES into individual characters, but keeps `Cl`, `Br`, and any term in brackets (ie `[O-]`) intact. 

`MAPPING_REGEX` is a derivative of `SMILE_REGEX` designed to work with the mapping framework used with the `Block` class. `MAPPING_REGEX` keeps `Cl`, `Br`, and any string of the form `[{isotope}*:{map_num}]` intact

In [None]:
# export

# Raw strings keep every backslash exactly as the regex engine should see it.
# Adjacent literals are joined by the parser, so wrapping the pattern over two
# source lines does not insert a newline or indentation into the pattern itself.

# Keeps any bracketed term (e.g. `[O-]`), `Br` and `Cl` intact; everything else is
# matched one character at a time (`%NN` ring closures are matched as one token).
SMILE_REGEX = (r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|H|\(|\)|\.|=|"
               r"#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])")

# Same as SMILE_REGEX, except only bracketed terms of the form
# `[{isotope}*:{map_num}]` are kept whole; other `[` and `]` are separate tokens.
MAPPING_REGEX = (r"(\[.\*:.]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|H|\[|\]|\(|\)|\.|=|"
                 r"#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])")

# Mapping tokens plus the single-letter tokens of AMINO_ACID_VOCAB
AA_MAPPING_REGEX = r"(\[.\*:.]|A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)"

In [None]:
# export

def tokenize_by_character(input):
    "Splits `input` into individual characters, keeping each 'unk' substring as a single token"
    # Split on the 'unk' marker instead of swapping in a placeholder character,
    # so a literal '_' in `input` is never mistaken for an unknown token.
    pieces = input.split('unk')
    tokens = list(pieces[0])
    for piece in pieces[1:]:
        # every boundary between two pieces is one 'unk' occurrence
        tokens.append('unk')
        tokens.extend(piece)
    return tokens

def tokenize_with_replacements(input, replacement_dict):
    "Applies each `{old : new}` substitution in `replacement_dict` to `input` in dict order, then splits the result into characters"
    replaced = input
    for old, new in replacement_dict.items():
        replaced = replaced.replace(old, new)
    return list(replaced)

def regex_tokenize(input, regex):
    'Returns, in order, every match that `regex.findall` produces for `input`'
    return list(regex.findall(input))

In [None]:
# Character tokenization splits 'Br' into 'B', 'r'
assert tokenize_by_character('CCC[Br]') == ['C', 'C', 'C', '[', 'B', 'r', ']']
# With HALOGEN_REPLACE, 'Br' becomes the single token 'R'
assert tokenize_with_replacements('CCC[Br]', HALOGEN_REPLACE) == ['C', 'C', 'C', '[', 'R', ']']
# SMILE_REGEX keeps the whole bracketed term as one token
assert regex_tokenize('CCC[Br]', re.compile(SMILE_REGEX)) == ['C', 'C', 'C', '[Br]']
# MAPPING_REGEX keeps only the mapping token whole; '[Br]' is split at the brackets
assert regex_tokenize('[1*:1]CCC[Br]', re.compile(MAPPING_REGEX)) == ['[1*:1]', 'C', 'C', 'C', '[', 'Br', ']']

## Vocabulary

The `Vocab` class handles tokenization. `Vocab.tokenize` breaks strings down into tokens. `Vocab.numericalize` maps tokens to integers. `Vocab.reconstruct` converts integers back into strings.

`Vocab` holds `itos`, a list of tokens, and `stoi`, a dictionary mapping tokens to integers. `Vocab` automatically adds four special tokens `['bos', 'eos', 'pad', 'unk']` indicating beginning of sentence, end of sentence, padding and unknown.

### Custom Vocabulary

To implement custom tokenization, subclass `Vocab` and update the `tokenize`, `numericalize` and `reconstruct` methods. Use the `test_reconstruction` function to verify your custom vocab can successfully reconstruct sequences.

In [None]:
# export

class Vocab():
    '''
    Vocab - base vocabulary class

    Holds `itos`, a list of tokens, and `stoi`, a dictionary mapping tokens to
    integers. The special tokens `['bos', 'eos', 'pad', 'unk']` are always placed
    at indices 0-3. Subclasses implement `_tokenize`.

    Inputs:

        `itos` - list, list of tokens in vocabulary

        `prefunc` - None, Callable, function applied to `input` before tokenization

        `postfunc` - None, Callable, function applied to `input` after reconstruction

    '''
    def __init__(self, itos, prefunc=None, postfunc=None):
        self.special_tokens = ['bos', 'eos', 'pad', 'unk']

        # special tokens come first; any copies of them inside `itos` are dropped
        self.itos = self.special_tokens + [i for i in itos if i not in self.special_tokens]
        self.stoi = {tok:i for i,tok in enumerate(self.itos)}
        # tokens seen by `numericalize` that are missing from `stoi`
        self.unks = set()
        self.prefunc = prefunc
        self.postfunc = postfunc

    def _tokenize(self, input):
        'Tokenize `input`'
        raise NotImplementedError

    def tokenize(self, input):
        'Applies `prefunc`, tokenizes `input` and wraps the tokens in `bos` ... `eos`'
        input = self.preprocess(input)
        toks = self._tokenize(input)
        toks = ['bos'] + toks + ['eos']
        return toks

    def join_tokens(self, tokens):
        'Concatenates `tokens` into a single string'
        return ''.join(tokens)

    def preprocess(self, input):
        'Applies `prefunc` to `input` if one was given'
        if self.prefunc is not None:
            input = self.prefunc(input)
        return input

    def postprocess(self, input):
        'Applies `postfunc` to `input` if one was given'
        if self.postfunc is not None:
            input = self.postfunc(input)
        return input

    def numericalize(self, input):
        'Numericalize `input` into integers. Unknown tokens map to `unk` and are recorded in `self.unks`'
        unk_idx = self.stoi['unk']
        output = []
        for tok in input:
            idx = self.stoi.get(tok)
            if idx is None:
                idx = unk_idx
                self.unks.add(tok)
            output.append(idx)
        return output

    def _reconstruct(self, input):
        'Reconstruct `input` into a list of tokens, stopping at the first `eos` and skipping `bos` and `pad`'
        output = []
        for item in input:
            item = self.itos[item]
            if item=='eos':
                break

            if item not in ('bos', 'pad'):
                output.append(item)

        return output

    def reconstruct(self, input):
        'Reconstruct `input` into a string and apply `postfunc`'
        tokens = self._reconstruct(input)
        output = self.join_tokens(tokens)
        output = self.postprocess(output)
        return output

    def reconstruct_trajectory(self, input):
        'Returns the joined string for every prefix of the reconstructed tokens (`postfunc` is not applied)'
        tokens = self._reconstruct(input)
        return [self.join_tokens(tokens[:i]) for i in range(1,len(tokens)+1)]

    def update_vocab(self):
        'Adds tokens in `self.unks` to vocabulary'
        # Set iteration order for strings is not stable between interpreter runs;
        # sorting gives the new tokens the same indices every time.
        unks = sorted(self.unks)
        self.itos += unks
        self.stoi = {tok:i for i,tok in enumerate(self.itos)}
        self.unks = set()

    def update_vocab_from_data(self, inputs):
        'Tokenizes `inputs` and updates the vocabulary with any unknown tokens'
        # numericalize is called for its side effect of filling `self.unks`
        for i in inputs:
            self.numericalize(self.tokenize(i))
        self.update_vocab()
        
class CharacterVocab(Vocab):
    '''
    CharacterVocab - vocabulary whose tokens are produced by `tokenize_by_character`

    Inputs:

        `itos` - list, list of tokens in vocabulary

        `prefunc` - None, Callable, function applied to `input` before tokenization

        `postfunc` - None, Callable, function applied to `input` after reconstruction
    '''
    def _tokenize(self, input):
        'Tokenize `input` by character'
        return tokenize_by_character(input)

class FuncVocab(Vocab):
    '''
    FuncVocab - vocabulary that delegates tokenization to a user-supplied `tok_func`

    Inputs:

        `itos` - list, list of tokens in vocabulary

        `tok_func` - Callable, tokenization function, called with the preprocessed input

        `prefunc` - None, Callable, function applied to `input` before tokenization

        `postfunc` - None, Callable, function applied to `input` after reconstruction
    '''

    def __init__(self, itos, tok_func, prefunc=None, postfunc=None):
        super().__init__(itos, prefunc=prefunc, postfunc=postfunc)
        self.tok_func = tok_func

    def _tokenize(self, input):
        'Tokenize `input` with `self.tok_func`'
        return self.tok_func(input)
    
    
class SelfiesVocab(FuncVocab):
    '''
    SelfiesVocab - converts smiles to selfies

    A `FuncVocab` preconfigured with `split_selfie` as the tokenization function,
    `smile_to_selfie` as `prefunc` and `selfie_to_smile` as `postfunc`.

    Inputs:

        `itos` - list, list of tokens in vocabulary
    '''
    def __init__(self, itos):
        super().__init__(itos,
                         tok_func=split_selfie,
                         prefunc=smile_to_selfie,
                         postfunc=selfie_to_smile)
        
    
class CharacterReplaceVocab(Vocab):
    '''
    CharacterReplaceVocab - tokenize by character with replacement

    Inputs:

        `itos` - list, list of tokens

        `replace_dict` - dict, replacement dictionary of the form {multi_character_token : single_character_token}. 
        ie replace_dict={'Br':'R', 'Cl':'L'}. The dictionary is copied, not modified.
        An entry `'unk' : '_'` is added to the copy when no `'unk'` key is present

        `prefunc` - None, Callable, function applied to `input` before tokenization

        `postfunc` - None, Callable, function applied to `input` after reconstruction

    '''
    def __init__(self, itos, replace_dict, prefunc=None, postfunc=None):
        itos = list(itos)
        # Work on a copy: adding the 'unk' entry must not leak into the caller's
        # dictionary (e.g. a shared module-level constant such as HALOGEN_REPLACE).
        self.replace_dict = dict(replace_dict)
        if 'unk' not in self.replace_dict:
            self.replace_dict['unk'] = '_'

        # single-character token -> original multi-character token
        self.reverse_dict = {v:k for k,v in self.replace_dict.items()}
        # every replacement character must be a token in the vocabulary
        for rep in self.reverse_dict:
            if rep not in itos:
                itos.append(rep)
        super().__init__(itos, prefunc, postfunc)

    def _tokenize(self, smile):
        'Applies `self.replace_dict` to `smile`, then tokenizes by character'
        toks = tokenize_with_replacements(smile, self.replace_dict)
        return toks

    def _reconstruct(self, input):
        'Reconstruct `input` into a list of tokens, mapping replacement characters back to their original tokens'
        # The base class handles the eos/bos/pad logic; only the reverse mapping is added here
        tokens = super()._reconstruct(input)
        return [self.reverse_dict.get(tok, tok) for tok in tokens]
    
class RegexVocab(Vocab):
    '''
    RegexVocab - vocabulary whose tokens are the matches of a regex `pattern`

    Inputs:

        `itos` - list, list of tokens

        `pattern` - str, regex string, compiled once and stored as `self.regex`

        `prefunc` - None, Callable, function applied to `input` before tokenization

        `postfunc` - None, Callable, function applied to `input` after reconstruction

    '''
    def __init__(self, itos, pattern, prefunc=None, postfunc=None):
        super().__init__(itos, prefunc=prefunc, postfunc=postfunc)

        self.pattern = pattern
        self.regex = re.compile(pattern)

    def _tokenize(self, smile):
        'Tokenize `smile` with the compiled pattern'
        return regex_tokenize(smile, self.regex)

In [None]:
# export

def test_reconstruction(vocab, inputs):
    "Returns all items in `inputs` that can't be correctly reconstructed using `vocab`"
    fails = []
    for item in inputs:
        recon = vocab.reconstruct(vocab.numericalize(vocab.tokenize(item)))
        if not item==recon:
            fails.append((item, recon))
            
    return fails

In [None]:
# Load the SMILES strings used by the reconstruction checks in the cells below;
# the CSV is read from a path relative to the notebook and must have a `smiles` column
df = pd.read_csv('files/smiles.csv')
smiles = df.smiles.values

In [None]:
# Every string in `smiles` should round-trip unchanged through a character-level vocab
vocab = CharacterVocab(SMILES_CHAR_VOCAB)
assert test_reconstruction(vocab, smiles)==[]

In [None]:
# Same check with the tokenizer passed in explicitly through FuncVocab
vocab = FuncVocab(SMILES_CHAR_VOCAB, tokenize_by_character)
assert test_reconstruction(vocab, smiles)==[]

In [None]:
# 'Br' is tokenized as the single replacement character 'R' and must be
# restored to 'Br' on reconstruction
vocab = CharacterReplaceVocab(SMILES_CHAR_VOCAB, HALOGEN_REPLACE)
assert vocab.tokenize('CC[Br]') == ['bos', 'C', 'C', '[', 'R', ']', 'eos']
assert test_reconstruction(vocab, smiles)==[]

In [None]:
# SMILE_REGEX yields whole bracketed terms such as '[Br]', which are not in
# SMILES_CHAR_VOCAB; add the tokens found in the data before the round-trip
# check, otherwise they would be numericalized as 'unk'
vocab = RegexVocab(SMILES_CHAR_VOCAB, SMILE_REGEX)
assert vocab.tokenize('CC[Br]') == ['bos', 'C', 'C', '[Br]', 'eos']
vocab.update_vocab_from_data(smiles)
assert test_reconstruction(vocab, smiles)==[]

In [None]:
# hide
from nbdev.export import notebook2script; notebook2script()