<a href="https://colab.research.google.com/github/AishaEvering/LLM_Mastery/blob/main/Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from scipy.spatial.distance import cosine
from transformers import BertModel, AutoTokenizer

In [None]:
# Checkpoint name; used below to load both the model and its tokenizer.
MODEL_NAME = 'bert-base-cased'

In [None]:
# Load the pretrained BERT model; the trailing expression displays its module tree.
model = BertModel.from_pretrained(MODEL_NAME)
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Load the tokenizer from the same checkpoint so its vocabulary (28996 entries)
# lines up with the model's word-embedding table.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Short sample sentence used to inspect the tokenizer and model outputs.
text = 'Tokenize me this please'

In [None]:
# Without `return_tensors` the tokenizer returns plain Python lists.
# The ids 101 and 102 at the two ends are the [CLS] and [SEP] special tokens it adds.
encoded_inputs = tokenizer(text)
encoded_inputs

{'input_ids': [101, 1706, 6378, 3708, 1143, 1142, 4268, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Same encoding, returned as PyTorch tensors with a leading batch dimension.
# NOTE: this rebinds `encoded_inputs`, replacing the list-based value assigned earlier.
encoded_inputs = tokenizer(text, return_tensors='pt')
encoded_inputs

{'input_ids': tensor([[ 101, 1706, 6378, 3708, 1143, 1142, 4268,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Forward pass for inspection only, so run it without autograd tracking.
with torch.no_grad():
  output = model(**encoded_inputs)

In [None]:
# Pull out both model outputs, then show their shapes: one 768-d vector per
# input id (8 here, including [CLS]/[SEP]) and a single pooled 768-d vector.
last_hidden_state, pooler_output = output.last_hidden_state, output.pooler_output
(last_hidden_state.shape, pooler_output.shape)

(torch.Size([1, 8, 768]), torch.Size([1, 768]))

In [None]:
def predict(text):
  """Return BERT's last hidden state for `text`.

  Args:
    text: Input string to encode with the notebook-level `tokenizer`.

  Returns:
    The model's last hidden state. For a single string this is a tensor of
    shape (1, num_input_ids, 768); the tokenizer adds [CLS] at the start and
    [SEP] at the end, so row 0 and the last row belong to those special
    tokens rather than to words of `text`.
  """
  encoded = tokenizer(text, return_tensors='pt')
  # Inference only: no_grad avoids building an autograd graph for each call.
  with torch.no_grad():
    return model(**encoded).last_hidden_state

In [None]:
# Two sentences that use "fly" in different senses (the insect vs. the verb),
# so the embeddings of the same word can be compared across contexts.
sentence1 = 'There was a fly drinking from my soup'
sentence2 = 'To become a commercial pilot, he had to fly for 1500 hours.'

In [None]:
# Tokens for each sentence; tokenize() does not add [CLS]/[SEP] markers.
tokens1, tokens2 = map(tokenizer.tokenize, (sentence1, sentence2))

In [None]:
# Display the tokens of sentence1 (no special tokens in this list).
tokens1

['There', 'was', 'a', 'fly', 'drinking', 'from', 'my', 'soup']

In [None]:
# Display the tokens of sentence2 (no special tokens in this list).
tokens2

['To',
 'become',
 'a',
 'commercial',
 'pilot',
 ',',
 'he',
 'had',
 'to',
 'fly',
 'for',
 '1500',
 'hours',
 '.']

In [None]:
# Run both sentences through BERT via predict().
out1, out2 = map(predict, (sentence1, sentence2))

In [None]:
# Position of 'fly' inside each tokenize() list. Those lists contain no [CLS]
# token, whereas the ids fed to the model start with one, so the matching row
# in the model output is this index + 1.
tokens1.index('fly'), tokens2.index('fly')

(3, 9)

In [None]:
# tokens1/tokens2 come from tokenizer.tokenize(), which adds no special tokens,
# while predict() encodes with tokenizer(...), which prepends [CLS] and appends
# [SEP]. Every token therefore sits one row later in the model output, and
# indexing with the raw list position would pick the preceding token.
CLS_OFFSET = 1
assert out1.shape[1] == len(tokens1) + 2, 'expected [CLS] + tokens + [SEP] rows in out1'
assert out2.shape[1] == len(tokens2) + 2, 'expected [CLS] + tokens + [SEP] rows in out2'

fly_row1 = tokens1.index('fly') + CLS_OFFSET
fly_row2 = tokens2.index('fly') + CLS_OFFSET

# `0:` keeps the batch dimension, so each embedding has shape (1, 768).
emb1 = out1[0:, fly_row1, :].detach()
emb2 = out2[0:, fly_row2, :].detach()
emb1.shape, emb2.shape

(torch.Size([1, 768]), torch.Size([1, 768]))

In [None]:
# scipy's `cosine` is the cosine *distance* (1 - cosine similarity):
# 0 means the vectors point the same way, larger values mean less similar.
cosine(emb1.flatten(), emb2.flatten())

0.40477830171585083