In [None]:
# default_exp glp.prediction

%reload_ext autoreload
%autoreload 2

# glp.clustering


In [None]:
import sys
sys.path.append('../')

In [3]:
#hide
#export

from itertools import islice
import pandas as pd
import numpy as np

from transformers import AutoModel, AutoTokenizer
from umap import UMAP
from fastai.text.all import *
import hdbscan

from justenough.nlp.core import *
from justenough.explain.core import *

### Loading Data

For this example we're going to use a small dataset of 2000 HIV tat proteins with labels for tissue of isolation and co-receptor binding.
We'll see if the clusters generated by topic modeling will correspond to these groups.

In [4]:
df = pd.read_csv('../tutorials/HIV_tat_example.csv').dropna(subset = ['sample_tissue'])
df['sequence'] = df['sequence'].str.strip('*')
df.head()

Unnamed: 0,accession,sample_tissue,coreceptor,sequence
0,M17449,PBMC,CXCR4,MEPVDPRLEPWKHPGSQPKTACTTCYCKKCCFHCQVCFTKKALGISYGRKKRRQRRRAPEDSQTHQVSLPKQPAPQFRGDPTGPKESKKKVERETETHPVD
1,M26727,PBMC,CCR5,MEPVDPRLEPWKHPGSQPKTASNNCYCKRCCLHCQVCFTKKGLGISYGRKKRRQRRRAPQDSKTHQVSLSKQPASQPRGDPTGPKESKKKVERETETDPED
2,M17451,PBMC,CCR5|CXCR4,MEPVDPRLEPWKHPGSQPKTACNNCYCKKCCYHCQVCFLTKGLGISYGRKKRRQRRGPPQGSQTHQVSLSKQPTSQPRGDPTGPKESKEKVERETETDPAVQ
3,K02007,PBMC,CCR5|CXCR4,MEPVDPNLEPWKHPGSQPRTACNNCYCKKCCFHCYACFTRKGLGISYGRKKRRQRRRAPQDSQTHQASLSKQPASQSRGDPTGPTESKKKVERETETDPFD
4,M62320,blood,,MEPVDPNLEPWKHPGSQPTTACSNCYCKVCCWHCQLCFLKKGLGISYGKKKRKPRRGPPQGSKDHQTLIPKQPLPQSQRVSAGQEESKKKVESKAKTDRFA


In [5]:
model_name = 'Rostlab/prot_bert'
device = 'cuda'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

In order to use the HuggingFace tokenizer we need to add spaces between all of the AAs in our sequence.

In [6]:
#export


def space_adder(seq):
    
    return ' '.join(seq)


class SpaceTransform(Transform):
    """Adds spaces between AAs for HuggingFace"""
    
    def encodes(self, x):
        if type(x) == str:
            x = [x]
        return L(space_adder(seq) for seq in x)
    
    def decodes(self, x):
        
        return [seq.replace(' ', '') for seq in x]
        
        
    

In [7]:
test_eq(space_adder('MIVLR'), 'M I V L R')

In [8]:

space_tfm = SpaceTransform()

pipe = Pipeline([space_tfm])

tst = ['MIVLR', 'AAR']
cor = ['M I V L R', 'A A R']

test_eq(pipe(tst), cor)

Next, we need to pass them through the Huggingface Tokenizer.

In [9]:
#export

class HFTokenizerWrapper(Transform):
    
    def __init__(self, tokenizer, tokens_only = True, 
                 truncation = True, max_length = 128,
                 padding = 'max_length', 
                 skip_special_tokens = True,
                 device = 'cuda'): 
        self.tokenizer = tokenizer
        self.tokens_only = tokens_only
        self.truncation = truncation
        self.max_length = max_length
        self.padding = padding
        self.skip_special_tokens = skip_special_tokens
        self.device = device
        
    def encodes(self, x):
        
        if type(x) == str:
            x = [x]
            
        tokenized = self.tokenizer(list(x), 
                                   return_tensors='pt', 
                                   padding=self.padding,
                                   truncation = self.truncation,
                                   max_length = self.max_length)
        tokenized = tokenized.to(self.device)
        
        
        
        if self.tokens_only:
            return tokenized['input_ids']
        else:
            return [fastuple(tokenized['input_ids'][i], tokenized['attention_mask'][i]) for i in range(len(x))]
        
        
    def decodes(self, x):
        
        return self.tokenizer.batch_decode(x, skip_special_tokens = self.skip_special_tokens)
        
        


In [10]:
space_tfm = SpaceTransform()
token_tfm = HFTokenizerWrapper(tokenizer, max_length=7, device = 'cpu')
pipe = Pipeline([space_tfm, token_tfm])

tst = ['MIVLR', 'AAR']
cor = [[2, 21, 11,  8, 5, 13, 3], 
       [2,  6,  6, 13, 3,  0, 0]]

test_eq(pipe(tst), tensor(cor))

In [11]:
test_eq(pipe.decode(tensor(cor)), tst)

In [12]:
token_tfm = HFTokenizerWrapper(tokenizer, max_length=6, tokens_only=False, device = 'cpu')
pipe = Pipeline([space_tfm, token_tfm])
pipe(tst)

[(tensor([ 2, 21, 11,  8,  5,  3]), tensor([1, 1, 1, 1, 1, 1])),
 (tensor([ 2,  6,  6, 13,  3,  0]), tensor([1, 1, 1, 1, 1, 0]))]

Now that we have the tokenized tensors, we can feed them into the model and get their encodings.

In [13]:
# export
from fastprogress.fastprogress import master_bar, progress_bar


class HFPoolingTransform(Transform):
    
    def __init__(self, model, batch_size = 32, progress = False):
        
        self.model = model
        self.batch_size = batch_size
        self.progress = progress
    
    def encodes(self, x):
        
        if type(x[0]) == fastuple:
            input_ids, attention = zip(*x)
            input_ids = torch.vstack(input_ids)
            attention = torch.vstack(attention).type(torch.bool)
        else:
            input_ids = x
            attention = x != 0
            
        with torch.no_grad():
            
            if self.batch_size is not None:
                #print(input_ids.shape)
                out = []
                if self.progress:
                    it = progress_bar(range(0, input_ids.shape[0], self.batch_size))
                else:
                    it = range(0, input_ids.shape[0], self.batch_size)
                for start in it:            
                    res = self.model(input_ids = input_ids[start:start+self.batch_size],
                                     attention_mask = attention[start:start+self.batch_size])
                    out.append(masked_concat_pool(res[0], attention[start:start+self.batch_size], input_ids.shape[1]-1))
                return torch.vstack(out)
            else:
                res = self.model(input_ids = input_ids,
                                 attention_mask = attention)
                return masked_concat_pool(res[0], attention, 
                                          input_ids.shape[1]-1)
        



In [14]:
token_tfm = HFTokenizerWrapper(tokenizer, max_length=6, tokens_only=True, device = 'cuda')
bert_pool_tfm = HFPoolingTransform(model)
pipe = Pipeline([space_tfm, token_tfm, bert_pool_tfm])

encoded = pipe(tst*100)

test_eq(encoded.shape, (200, 3072))

Now, to make our future lives easier, lets encapsulate all of this into a `Dataloders` subclass.

This one can take a dataframe and some parameters and intelligently construct the dataloaders. Allowing for pre-computing and easy setup with autoencoders.

In [15]:
class HFBertDataLoaders(DataLoaders):
    
    @staticmethod
    def from_df(frame, tokenizer, model, sequence_col = 'sequence', label_col = None, vocab=None,
                max_length = 128, device = 'cuda', bs = 32, precompute = True,
                splitter = None, num_workers = 0):
        
        if splitter is None:
            splitter = RandomSplitter()
            
            
        seq_tfms = [ColReader(sequence_col),
                    SpaceTransform(),
                    HFTokenizerWrapper(tokenizer, 
                                       max_length=max_length, 
                                       tokens_only=False, 
                                       device = device),
                    HFPoolingTransform(model,batch_size=bs)]
        if label_col is None:
            label_tfms = seq_tfms
        else:
            label_tfms = [ColReader(label_col), Categorize(vocab=vocab)]
            
        
        if precompute:
            
            seq_pipe = Pipeline(seq_tfms)
            seq_tls = seq_pipe(frame)
            
            if label_col is None:
                label_tls = seq_tls
            else:
                label_tls = TfmdLists(frame, label_tfms)
                
            tls = TfmdLists(zip(seq_tls, label_tls), [])
            train, test = splitter(tls)
            
            return DataLoaders.from_dsets(tls[train], tls[test], num_workers=0).to(device)
            
            
        else:
            
            train, test = splitter(frame)
            feat_tls = Datasets(frame, [seq_tfms, label_tfms],
                               splits = (train, test))
            
            dls = feat_tls.dataloaders(num_workers=0).to(device)
            
            return dls

This can be easily used to create a Dataloader from any dataframe.

Like so. If no label-column is provided, it assumes this is an autoencoding pre-training task.

In [16]:
dls = HFBertDataLoaders.from_df(df, tokenizer, model, 
                                label_col=None, precompute=True)
dls.one_batch()

(tensor([[ 0.0184,  0.1971, -0.1281,  ..., -0.0300,  0.0073,  0.0132],
         [ 0.0614,  0.2085, -0.1526,  ..., -0.0509, -0.0295, -0.0242],
         [ 0.1123,  0.1572, -0.1147,  ..., -0.0799, -0.0502, -0.0264],
         ...,
         [-0.1431,  0.0607, -0.1217,  ..., -0.0705, -0.0164,  0.0204],
         [ 0.0345,  0.2125, -0.1170,  ..., -0.0566, -0.0182,  0.0117],
         [ 0.0395,  0.2265, -0.1417,  ..., -0.0208, -0.0214, -0.0201]],
        device='cuda:0'),
 tensor([[ 0.0184,  0.1971, -0.1281,  ..., -0.0300,  0.0073,  0.0132],
         [ 0.0614,  0.2085, -0.1526,  ..., -0.0509, -0.0295, -0.0242],
         [ 0.1123,  0.1572, -0.1147,  ..., -0.0799, -0.0502, -0.0264],
         ...,
         [-0.1431,  0.0607, -0.1217,  ..., -0.0705, -0.0164,  0.0204],
         [ 0.0345,  0.2125, -0.1170,  ..., -0.0566, -0.0182,  0.0117],
         [ 0.0395,  0.2265, -0.1417,  ..., -0.0208, -0.0214, -0.0201]],
        device='cuda:0'))

In [17]:
# export

def create_bert_head(nf, n_out, lin_ftrs=None, ps=0.5, concat_pool=True, first_bn=True, bn_final=False,
                     lin_first=False, y_range=None):
    "Model head that takes `nf` features, runs through `lin_ftrs`, and out `n_out` classes."
    lin_ftrs = [nf, 512, n_out] if lin_ftrs is None else [nf] + lin_ftrs + [n_out]
    bns = [first_bn] + [True]*len(lin_ftrs[1:])
    ps = L(ps)
    if len(ps) == 1: ps = [ps[0]/2] * (len(lin_ftrs)-2) + ps
    actns = [nn.ReLU(inplace=True)] * (len(lin_ftrs)-2) + [None]
    #pool = AdaptiveConcatPool2d() if concat_pool else nn.AdaptiveAvgPool2d(1)
    layers = [Flatten()]
    if lin_first: layers.append(nn.Dropout(ps.pop(0)))
    for ni,no,bn,p,actn in zip(lin_ftrs[:-1], lin_ftrs[1:], bns, ps, actns):
        layers += LinBnDrop(ni, no, bn=bn, p=p, act=actn, lin_first=lin_first)
    if lin_first: layers.append(nn.Linear(lin_ftrs[-2], n_out))
    if bn_final: layers.append(nn.BatchNorm1d(lin_ftrs[-1], momentum=0.01))
    if y_range is not None: layers.append(SigmoidRange(*y_range))
    return nn.Sequential(*layers)

In [18]:
# export


class ProtBertHead(Module):
    
    def __init__(self, 
                 in_features = 3072, 
                 hidden_dim = 128,
                 out_features = 'autoencoder', 
                 encoder = None,
                 lin_ftrs = [1024], ps = 0.25):
        
        self.in_features = in_features
        self.hidden_dim = hidden_dim
        
        if out_features == 'autoencoder':
            self.out_features = in_features
        else:
            self.out_features = out_features
        
        if encoder is None:
            self.encoder = create_bert_head(in_features, hidden_dim, lin_ftrs = lin_ftrs, ps=ps)
        else:
            self.encoder = encoder
        
        self.decoder = create_bert_head(self.hidden_dim, self.out_features, lin_ftrs = lin_ftrs, ps = ps)
        
    def re_head(self, new_out_features, lin_ftrs = [1024], ps = 0.25):
        
        
        return ProtBertHead(in_features=self.in_features,
                             hidden_dim=self.hidden_dim,
                             out_features = new_out_features,
                             encoder = self.encoder,
                             lin_ftrs = lin_ftrs, ps = ps)
        
        
    def forward(self, x):
        
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        
        return decoded
        
                

This model can be loaded using a ProtBert Learner.

In [19]:
# export

#@delegates(Learner.__init__)
def protbert_classifier_learner(dls, in_features, 
                                model = None,
                                n_out=None,
                                lin_ftrs=None, 
                                max_len=128, hidden_dim = 128,
                                ps = 0.25,
                                y_range=None, **kwargs):
    "Create a `Learner` with a ProtBert classifier from `dls` and `arch`."
        
    if model is None:
        if n_out == 'autoencoder': n_out = in_features
        if n_out is None: n_out = get_c(dls)
        if n_out is None: n_out = in_features
            
        model = ProtBertHead(in_features = in_features,
                              out_features= n_out,
                              hidden_dim = hidden_dim,
                              lin_ftrs = lin_ftrs, ps = ps)
        
    learn = Learner(dls, model, **kwargs)
    
    return learn


Often it is useful to pre-train with an autodencoder. This model makes that easy.

From the top. Here is a whole set.



In [20]:
df = pd.read_csv('../tutorials/HIV_tat_example.csv').dropna(subset = ['sample_tissue'])
df['sequence'] = df['sequence'].str.strip('*')
df.head()

Unnamed: 0,accession,sample_tissue,coreceptor,sequence
0,M17449,PBMC,CXCR4,MEPVDPRLEPWKHPGSQPKTACTTCYCKKCCFHCQVCFTKKALGISYGRKKRRQRRRAPEDSQTHQVSLPKQPAPQFRGDPTGPKESKKKVERETETHPVD
1,M26727,PBMC,CCR5,MEPVDPRLEPWKHPGSQPKTASNNCYCKRCCLHCQVCFTKKGLGISYGRKKRRQRRRAPQDSKTHQVSLSKQPASQPRGDPTGPKESKKKVERETETDPED
2,M17451,PBMC,CCR5|CXCR4,MEPVDPRLEPWKHPGSQPKTACNNCYCKKCCYHCQVCFLTKGLGISYGRKKRRQRRGPPQGSQTHQVSLSKQPTSQPRGDPTGPKESKEKVERETETDPAVQ
3,K02007,PBMC,CCR5|CXCR4,MEPVDPNLEPWKHPGSQPRTACNNCYCKKCCFHCYACFTRKGLGISYGRKKRRQRRRAPQDSQTHQASLSKQPASQSRGDPTGPTESKKKVERETETDPFD
4,M62320,blood,,MEPVDPNLEPWKHPGSQPTTACSNCYCKVCCWHCQLCFLKKGLGISYGKKKRKPRRGPPQGSKDHQTLIPKQPLPQSQRVSAGQEESKKKVESKAKTDRFA


In [29]:
model_name = 'Rostlab/prot_bert_bfd'
device = 'cuda'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=361.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=81.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=86.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1684058277.0), HTML(value='')))




Autoencoder pre-training

In [30]:
auto_dls = HFBertDataLoaders.from_df(df, tokenizer, model, 
                                     label_col=None, precompute=True)




In [31]:
auto_learner = protbert_classifier_learner(dls, model.config.hidden_size*3, 
                                           n_out = 'autoencoder',
                                           loss_func = nn.MSELoss())
auto_learner.fit_one_cycle(50, lr_max = 0.01, cbs = [EarlyStoppingCallback(patience=1)])
auto_learner.freeze()

epoch,train_loss,valid_loss,time
0,0.2715,0.040663,00:00
1,0.188775,0.032776,00:00
2,0.144123,0.025192,00:00
3,0.106929,0.009672,00:00
4,0.075759,0.007529,00:00
5,0.05183,0.004573,00:00
6,0.034189,0.003384,00:00
7,0.022395,0.002489,00:00
8,0.014916,0.003718,00:00


No improvement since epoch 7: early stopping


In [32]:
label_dls = HFBertDataLoaders.from_df(df, tokenizer, model, 
                                      label_col='sample_tissue', precompute=True)

In [33]:
vocab = df['sample_tissue'].dropna().unique()
label_learner = protbert_classifier_learner(label_dls, None,
                                            model= auto_learner.model.re_head(len(vocab), lin_ftrs=[64, 32]),
                                            loss_func = nn.CrossEntropyLoss(),  metrics=[accuracy])

In [34]:
label_learner.fit_one_cycle(50, lr_max = 0.001, cbs = [EarlyStoppingCallback(patience=2)])

epoch,train_loss,valid_loss,accuracy,time
0,2.454245,2.484555,0.045564,00:00
1,2.386216,2.242213,0.436451,00:00
2,2.31904,2.123848,0.472422,00:00
3,2.240711,2.114523,0.460432,00:00
4,2.171865,2.051955,0.529976,00:00
5,2.108036,2.110823,0.570743,00:00
6,2.017675,1.950459,0.623501,00:00
7,1.945742,1.824279,0.611511,00:00
8,1.846583,1.847966,0.628297,00:00
9,1.737161,1.725049,0.645084,00:00


No improvement since epoch 12: early stopping


In [35]:
label_learner.unfreeze()
label_learner.fit_one_cycle(50, lr_max = 0.001, cbs = [EarlyStoppingCallback(patience=2)])

epoch,train_loss,valid_loss,accuracy,time
0,0.88965,1.413247,0.623501,00:00
1,0.858413,1.375695,0.613909,00:00
2,0.822202,1.377387,0.613909,00:00
3,0.791884,1.411415,0.630695,00:00


No improvement since epoch 1: early stopping
