In [1]:
!pip install transformers


[notice] A new release of pip available: 22.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

- BERT is a pretrained model that expects that the input data is in a specific format.
- The data format is as follows:
1. [SEP] is a special token that is used to mark the end of a sentence or the seperation between two sentences.
2. [CLS] is a special token that marks the beginning of the text. 
3. Tokens must conform with the fixed vocab. used in BERT
4. TokenIDs should be retrieved from BertTokenizer
5. MaskIDs which mark tokens vs padding elements
6. SegmentID represents different sentences.
7. Position Embeddings used to show token position within the sequence.


In [3]:
corpus = [
    "The quick brown fox jumping over the lazy dog. The dog barked at the fox extract embeddings.",
    "The fox ran away quickly.",
    "The dog is lazy.",
    "The fox is cunning.",
]


In [4]:
sample_sentence = corpus[0]
## Step 1: Adding the special token
## Adding a CLS is simple
sample_sentence = "[CLS] "+sample_sentence
sample_sentence
## To add the SEP:
split_sentence = sample_sentence.split('.')
sample_sentence=" [SEP].".join(split_sentence)
sample_sentence

'[CLS] The quick brown fox jumping over the lazy dog [SEP]. The dog barked at the fox extract embeddings [SEP].'

In [10]:
## To tokenize the text, BERT provides its own tokenizer
## Which we already imported
tokenized_sentence = tokenizer.tokenize(sample_sentence)
tokenized_sentence

['[CLS]',
 'the',
 'quick',
 'brown',
 'fox',
 'jumping',
 'over',
 'the',
 'lazy',
 'dog',
 '[SEP]',
 '.',
 'the',
 'dog',
 'barked',
 'at',
 'the',
 'fox',
 'extract',
 'em',
 '##bed',
 '##ding',
 '##s',
 '[SEP]',
 '.']

Observations: Some words has been split into smaller subwords and characters. The <b>two hash signs</b> preceding some of these subwords are just the tokenizer’s way to denote that this subword or character is part of a larger word and preceded by another subword. So, for example, the ‘##bed’ token is separate from the ‘bed’ token; the first is used whenever the subword ‘bed’ occurs within a larger word and the second is used explicitly for when the standalone token ‘thing you sleep on’ occurs.



Why?
BERT is created using WordPiece model, where WordPiece model generated a vocabulary that contains all English characters plus the ~30,000 most common words and subwords found in the English language corpus the model is trained on.

In [11]:
list(tokenizer.vocab.keys())[4000:4015]


['tears',
 'senate',
 '00',
 'card',
 'asian',
 'agent',
 '1947',
 'software',
 '44',
 'draw',
 'warm',
 'supposed',
 'com',
 'pro',
 '##il']

In [12]:
## To convert the tokens into Vocab Indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentence)
indexed_tokens

[101,
 1996,
 4248,
 2829,
 4419,
 8660,
 2058,
 1996,
 13971,
 3899,
 102,
 1012,
 1996,
 3899,
 17554,
 2012,
 1996,
 4419,
 14817,
 7861,
 8270,
 4667,
 2015,
 102,
 1012]

In [14]:
for tup in zip(tokenized_sentence, indexed_tokens):
    # Spacing can be added in formatted text using :<and size of spacing
    print('{:<10} :{:^10}'.format(tup[0], tup[1]))

[CLS]      :   101    
the        :   1996   
quick      :   4248   
brown      :   2829   
fox        :   4419   
jumping    :   8660   
over       :   2058   
the        :   1996   
lazy       :  13971   
dog        :   3899   
[SEP]      :   102    
.          :   1012   
the        :   1996   
dog        :   3899   
barked     :  17554   
at         :   2012   
the        :   1996   
fox        :   4419   
extract    :  14817   
em         :   7861   
##bed      :   8270   
##ding     :   4667   
##s        :   2015   
[SEP]      :   102    
.          :   1012   


In [15]:
## Now we need to assign segment ID to tokens
segments_ids = [1] * len(tokenized_sentence)

print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Extracting Embeddings

In [16]:
# We need to convert the data we have into torch tensors, and then
# Load the model
torch_tensor = torch.tensor([indexed_tokens])
segment_tensor = torch.tensor([segments_ids])

model=BertModel.from_pretrained('bert-base-uncased',
                                output_hidden_states=True) 
## output_hidden_states tells the model to return all hidden states
# Put the model in "evaluation" mode [feed-forward operation]
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [17]:
# Run the text through BERT, and collect all of the hidden states produce from the 12 layers. 
with torch.no_grad():

    outputs = model(torch_tensor, segment_tensor)


    # Because output_hidden_states is set to true, the third item will be hidden
    # states from all layers, different configurations could be used when
    # calling from_pretrained
    hidden_states = outputs[2]
    

In [18]:
# Understanding the output
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

print ("Number of batches:", len(hidden_states[0]))

print ("Number of tokens:", len(hidden_states[0][0]))

print ("Number of hidden units:", len(hidden_states[0][0][0]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 25
Number of hidden units: 768


- Conclusion: You can make use of a torch of size [25, 13, 768 ]
So, for each token, ( 13,768) vector can represent it.

In [20]:
print(hidden_states[0])

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.4367,  0.5360, -0.0514,  ..., -0.0397,  0.6783, -0.5318],
         [ 0.0764, -1.3345, -0.1239,  ..., -0.5126,  1.4597, -0.7255],
         ...,
         [-0.4762, -0.0660,  0.4777,  ...,  0.0764, -0.2680,  0.1501],
         [-0.2840,  0.2271,  0.0111,  ..., -0.1397,  0.0289, -0.2636],
         [-0.3635,  0.4525, -0.2429,  ...,  0.5196,  0.5170,  0.5195]]])
