In [1]:
from transformers import BertModel, BertTokenizer
import torch

In [2]:
# Each entry is (model class, tokenizer class, pretrained checkpoint name).
MODELS = [
    (
        BertModel,
        BertTokenizer,
        'bert-base-uncased',
    ),
]

In [3]:
# Instantiate the tokenizer and the model for every entry in MODELS.
# The names assigned in the loop body are plain notebook globals, so after the
# loop `tokenizer` and `bert_model` refer to the LAST entry of MODELS (the list
# has a single entry here). If MODELS were empty, neither name would be defined.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    
    # Both objects are built from the same checkpoint name.
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

In [4]:
# Split the sentence into tokens only. As the output below shows, tokenize()
# returns just the text's tokens with no special tokens around them, so the
# `add_special_tokens` flag is not passed here; special tokens are inserted by
# encode(), which is demonstrated further down.
tokenized = tokenizer.tokenize('This is a sentence')
tokenized

['this', 'is', 'a', 'sentence']

In [5]:
# Map each token string from the previous cell to its integer vocabulary id.
indexes = tokenizer.convert_tokens_to_ids(tokenized)
indexes

[2023, 2003, 1037, 6251]

In [6]:
# Read the ids of the tokenizer's special tokens (CLS, SEP, PAD, UNK
# attributes, in that order) into notebook-level names.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
    tokenizer.unk_token_id,
)

# One id per line, in the same order as the assignments above.
print(init_token, eos_token, pad_token, unk_token, sep="\n")

101
102
0
100


In [7]:
# encode() performs the steps shown above in a single call: it tokenizes the
# text, converts the tokens to ids, and (with add_special_tokens=True) wraps
# the result in the special-token ids.
tokenizer.encode(
    'This is a sentence',
    add_special_tokens=True,
)

[101, 2023, 2003, 1037, 6251, 102]

In [8]:
# Encode text.
# add_special_tokens=True takes care of adding [CLS], [SEP], <s>... tokens in
# the right way for each model. unsqueeze(0) adds the leading batch dimension,
# giving a tensor of shape (1, number_of_ids).
input_ids = torch.tensor(
    tokenizer.encode("Here is some text to encode", add_special_tokens=True)
).unsqueeze(0)

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    # The model returns a sequence of outputs; element 0 is kept here.
    last_hidden_states = bert_model(input_ids)[0]


In [9]:
# Display the dimensions of the tensor produced by the forward pass.
last_hidden_states.size()

torch.Size([1, 9, 768])