In [1]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f'Length of BERT based vocabulary: {len(tokenizer.vocab)}')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertF

Length of BERT based vocabulary: 30522


In [4]:
text = "A simple sentence!"

tokens = tokenizer.encode(text)
tokens

[101, 1037, 3722, 6251, 999, 102]

In [8]:
print(tokenizer.decode(tokens))
for t in tokens:
    print(f"Token: {t}, subword: {tokenizer.decode([t])}")

[CLS] a simple sentence! [SEP]
Token: 101, subword: [CLS]
Token: 1037, subword: a
Token: 3722, subword: simple
Token: 6251, subword: sentence
Token: 999, subword: !
Token: 102, subword: [SEP]


In [10]:
'kkakkavas' in tokenizer.vocab

False

In [11]:
text_with_unk_words = "Kakkavas loves a beautiful day"
tokens_with_unk_words = tokenizer.encode(text_with_unk_words)

for t in tokens_with_unk_words:
    print(f"Token: {t}, subword: {tokenizer.decode([t])}")

Token: 101, subword: [CLS]
Token: 10556, subword: ka
Token: 15714, subword: ##kka
Token: 12044, subword: ##vas
Token: 7459, subword: loves
Token: 1037, subword: a
Token: 3376, subword: beautiful
Token: 2154, subword: day
Token: 102, subword: [SEP]


In [13]:
from pprint import pprint
# Encode plus is the __call__() of the tokenizer
tokens = tokenizer.encode_plus(text_with_unk_words)
pprint(tokens)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101, 10556, 15714, 12044, 7459, 1037, 3376, 2154, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}


# Context Capturing example

In [14]:
python_pet = tokenizer.encode('I love my pet python')
python_language = tokenizer.encode('I love coding in python')

In [31]:
python_pet_embedding = model(torch.tensor(python_pet).unsqueeze(0))[0][:,5,:].detach().numpy()
python_language_embedding = model(torch.tensor(python_language).unsqueeze(0))[0][:,5,:].detach().numpy()
snake_alone_embedding = model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0))[0][:,1,:].detach().numpy()
programming_along_embedding = model(torch.tensor(tokenizer.encode('asdgasdg')).unsqueeze(0))[0][:,1,:].detach().numpy()

In [32]:
python_language_embedding.shape, snake_alone_embedding.shape

((1, 768), (1, 768))

In [33]:
print(cosine_similarity(python_pet_embedding, snake_alone_embedding), cosine_similarity(python_language_embedding, programming_along_embedding))
print(cosine_similarity(python_language_embedding, snake_alone_embedding), cosine_similarity(python_language_embedding, programming_along_embedding))

[[0.6928657]] [[0.27551398]]
[[0.58434784]] [[0.27551398]]
