In [1]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
import logging

import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

In [2]:
# Send timestamped records of level INFO and above through the root logger,
# then grab a logger named after this module for the notebook's own messages.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

In [3]:
# The convention in BERT is:
# (a) For sequence pairs:
#  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
#  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
# (b) For single sequences:
#  tokens:   [CLS] the dog is hairy . [SEP]
#  type_ids:   0   0   0   0  0     0   0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.

In [4]:
def prepare_bert(sentences, max_seq_length, tokenizer):
    """Convert raw sentences into fixed-length BERT input ids and attention masks.

    Every sentence is tokenized, truncated so that it still fits once the
    '[CLS]' and '[SEP]' markers are added, converted to vocabulary ids and
    zero-padded on the right up to ``max_seq_length``.

    Args:
        sentences: iterable of strings, one sentence per item.
        max_seq_length: length of every returned row. Must be at least 2 so
            that '[CLS]' and '[SEP]' always fit.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        A tuple ``(all_input_ids, all_input_mask)`` of two lists holding one
        row per sentence. In the mask, 1 marks a real token and 0 marks
        padding.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """
    if max_seq_length < 2:
        # A smaller value would make the truncation bound negative and yield
        # rows that are longer than max_seq_length and of unequal length.
        raise ValueError(
            'max_seq_length must be at least 2 to hold [CLS] and [SEP], got {}'.format(max_seq_length)
        )

    # Room left for word pieces once [CLS] and [SEP] are accounted for
    max_tokens = max_seq_length - 2

    all_input_ids = []
    all_input_mask = []
    for sentence in sentences:
        # tokenizer will also separate on punctuation
        # see https://github.com/google-research/bert#tokenization
        tokens = tokenizer.tokenize(sentence)[:max_tokens]

        # add [CLS] and [SEP], as expected in BERT
        tokens = ['[CLS]', *tokens, '[SEP]']
        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

        # Zero-pad up to the sequence length. The mask has 1 for real tokens
        # and 0 for padding tokens; only real tokens are attended to.
        n_real = len(input_ids)
        n_pad = max_seq_length - n_real
        all_input_ids.append(input_ids + [0] * n_pad)
        all_input_mask.append([1] * n_real + [0] * n_pad)

    return all_input_ids, all_input_mask


def main(sentences, layers='-1, -2, -3, -4', max_seq_length=512, bert_model='bert-large-uncased',
         do_lower_case=True, batch_size=32, no_cuda=False):
    """Run sentences through a pre-trained BERT model and log pooled sentence vectors.

    The selected encoder layers are concatenated along the feature dimension
    and two sentence-level poolings are logged per batch: a mask-aware mean
    over the real tokens, and (for comparison only) a plain average over all
    positions including padding.

    Args:
        sentences: iterable of strings to encode.
        layers: comma-separated indices of the encoder layers to concatenate.
        max_seq_length: number of positions every sentence is truncated or
            padded to.
        bert_model: name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        do_lower_case: forwarded to ``BertTokenizer.from_pretrained``.
        batch_size: number of sentences per forward pass.
        no_cuda: if True, run on CPU even when CUDA is available.

    Returns:
        None. Sizes and values of the intermediate tensors are written to
        ``logger`` at INFO level.
    """
    device = torch.device('cuda' if torch.cuda.is_available() and not no_cuda else 'cpu')

    # 'layers' indicates which layers we want to concatenate
    layer_idxs = [int(layer) for layer in layers.split(',')]

    # init tokenizer
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    # prepare_bert returns two parallel lists (token ids, attention masks),
    # each with one row of length max_seq_length per sentence
    bert_features = prepare_bert(sentences, max_seq_length, tokenizer)
    all_input_ids, all_input_mask = (torch.tensor(feat, dtype=torch.long) for feat in bert_features)

    # init model, move to device and switch to inference mode
    model = BertModel.from_pretrained(bert_model)
    model.to(device)
    model.eval()

    # prepare dataset and dataloader; sequential so output order matches input order
    eval_data = TensorDataset(all_input_ids, all_input_mask)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

    # Pure inference: no_grad avoids building an autograd graph for every batch
    with torch.no_grad():
        for input_ids, input_mask in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

            layers_to_concat = [all_encoder_layers[idx] for idx in layer_idxs]

            # Concatenate on the feature axis: [batch, seq_len, features]
            concat = torch.cat(layers_to_concat, dim=-1)

            logger.info(concat.size())
            logger.info(concat)

            # Mask-aware mean pooling.
            # Add a 3rd dimension to the mask so it broadcasts: [batch, seq_len, 1]
            float_mask = input_mask.to(torch.float).unsqueeze(2)
            # Zero out the vectors at padded positions
            masked = concat * float_mask
            # Sum over the sequence axis -> [batch, features]. Summing already
            # removes that axis, so no squeeze is needed (a bare squeeze would
            # also drop the batch axis for a batch of one sentence).
            summed = torch.sum(masked, dim=1)
            # Divide by the number of real tokens rather than max_seq_length, so
            # the result does not depend on how much padding was added.
            n_real_tokens = float_mask.sum(dim=1).clamp(min=1)
            masked_mean = torch.div(summed, n_real_tokens)

            logger.info(masked_mean.size())
            logger.info(masked_mean)

            # OR (but don't do this, as it will ignore masks)
            # Pooling by simple average pool over every position, padding included
            concat_permute = concat.permute(0, 2, 1)

            # Squeeze only the pooled axis so a batch of one keeps its batch axis
            avg_pooled = F.avg_pool1d(concat_permute, kernel_size=concat_permute.size(2)).squeeze(2)

            logger.info(avg_pooled.size())
            logger.info(avg_pooled)

In [5]:
if __name__ == "__main__":
    # Demo run: encode three short sentences, padded/truncated to 32 positions.
    proc_args = dict(
        sentences=[
            'I saw Bert today !',
            'Do you like bananas ?',
            'Some sentences are really horrendous to parse .',
        ],
        max_seq_length=32,
    )
    main(**proc_args)

03/26/2019 15:44:37 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\Bram\.pytorch_pretrained_bert\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/26/2019 15:44:38 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz from cache at C:\Users\Bram\.pytorch_pretrained_bert\214d4777e8e3eb234563136cd3a49f6bc34131de836848454373fa43f10adc5e.abfbb80ee795a608acbf35c7bf2d2d58574df3887cdd94b355fc67e03fddba05
03/26/2019 15:44:38 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file C:\Users\Bram\.pytorch_pretrained_bert\214d4777e8e3eb234563136cd3a49f6bc34131de836848454373fa43f10adc5e.abfbb80ee795a608acbf35c7bf2d2d58574df3887cdd94b355fc67e03fddba05 to temp dir C:\Users\Bram\AppData\L