In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import logging
logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
text = "Here is the sentence I want embeddings for."
marked_text = "[CLS] " + text + " [SEP]"

In [4]:
tokenized_text = tokenizer.tokenize(marked_text)

In [5]:
print (tokenized_text)

['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]']


In [6]:
tokens = tokenizer.encode_plus(text, add_special_tokens=True)
tokens

{'input_ids': [101, 2182, 2003, 1996, 6251, 1045, 2215, 7861, 8270, 4667, 2015, 2005, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
len(list(tokenizer.vocab))

30522

In [8]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

In [10]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [11]:
texts = ["Ram and Shayam are playing."]
         
         # "The bank vault was robust.",
         # "He had to bank on her for support.",
         # "The bank was out of money.",
         # "The bank teller was a man."]

In [12]:
target_word_embeddings = []
for text in texts:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    
    # Find the position 'bank' in list of tokens
    # word_index = tokenized_text.index('bank')
    # Get the embedding for bank
    # word_embedding = list_token_embeddings[word_index]
    break

In [13]:
len(list_token_embeddings), tokenized_text

(9, ['[CLS]', 'ram', 'and', 'shay', '##am', 'are', 'playing', '.', '[SEP]'])

In [14]:
len(list_token_embeddings[0])

768

In [15]:
list_token_embeddings[1][:10] #ram

[0.18178808689117432,
 -0.42435935139656067,
 -0.33612990379333496,
 -0.9720714688301086,
 0.7758705615997314,
 -0.2639223039150238,
 0.8645139932632446,
 0.1322741061449051,
 0.08310992270708084,
 0.45797717571258545]

In [16]:
list_token_embeddings[3][:10] #shay

[1.0274667739868164,
 -0.047883614897727966,
 0.17032091319561005,
 -1.0732817649841309,
 -0.06276001036167145,
 -1.7902220487594604,
 1.1028976440429688,
 0.08875294774770737,
 0.4522388279438019,
 0.44802847504615784]

In [17]:
list_token_embeddings[4][:10] #shay

[-0.06891278177499771,
 -0.8458523750305176,
 -0.5103632807731628,
 -1.2111579179763794,
 1.299895167350769,
 -0.9131978750228882,
 0.5076037049293518,
 0.4665069580078125,
 0.11558946967124939,
 0.5410099029541016]