# Visualising large language model embeddings with context

In a <a href="https://x.com/augustasmac/status/1775264780028367173?s=20" target="_blank">recent post</a>...

# Imports

Expand the folded cells below if you want to see which imports are made.

In [1]:
#|code-fold: true
%load_ext autoreload
%autoreload 2

In [2]:
#|code-fold: true
import plotly.graph_objects as go
import torch
from transformers import AutoModel, AutoTokenizer

# Ensures we can render plotly plots with quarto
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [3]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same pretrained weights/vocabulary.
model_name = "google-bert/bert-base-cased"

# Load the tokenizer first, then the bare encoder (no task-specific head).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [7]:
# The three probe sentences compared throughout the notebook.
sentence_v = "your data removal request has been reviewed and concluded"
sentence_u = "the sensitive personal information has been deleted"
sentence_k = "she has been a sensitive person"

# Tokenise each sentence into PyTorch tensors (same order: v, u, k).
sentence_v_tokenized, sentence_u_tokenized, sentence_k_tokenized = (
    tokenizer(sentence, return_tensors="pt")
    for sentence in (sentence_v, sentence_u, sentence_k)
)

# Show the shape of every `input_ids` tensor, one per line.
for tokenized in (sentence_v_tokenized, sentence_u_tokenized, sentence_k_tokenized):
    print(tokenized["input_ids"].shape)

torch.Size([1, 11])
torch.Size([1, 9])
torch.Size([1, 8])


Use `[CLS]` pooling, as described in the [mxbai-embed-large-v1 model card](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1):

In [10]:
# Static (context-free) sentence representation built from token embeddings.
# NOTE(review): the composition used for n > 1 (sum inside each window, then
# sum over windows) is one reasonable choice, not the only one -- confirm it
# matches the intended definition of an n-gram representation.
def compute_n_gram_representation(tokenized_sentence, n=1, embedding_vectors=None):
    """Return a single vector that sums the embeddings of every n-gram.

    Each n-gram is a window of ``n`` consecutive token ids. A window is
    represented by the sum of its token embeddings, and the sentence is
    represented by the sum of all window vectors. For ``n=1`` this is simply
    the sum of the embeddings of all token ids in the sentence (whatever ids
    the tokenizer produced, including any special tokens it added).

    Args:
        tokenized_sentence: Mapping with an ``"input_ids"`` entry holding the
            token ids, either as a 1-D sequence or as a 2-D ``(1, seq_len)``
            tensor. When 2-D, only the first row is used.
        n: Window size in tokens; must be at least 1.
        embedding_vectors: Optional ``(vocab_size, dim)`` tensor used as the
            lookup table. When omitted, the input-embedding matrix of the
            notebook-level ``model`` is used.

    Returns:
        A 1-D tensor of length ``dim``. If the sentence has fewer than ``n``
        tokens there are no windows and a zero vector is returned.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    if embedding_vectors is None:
        # Fall back to the model's static input embeddings; detach so the
        # result does not track gradients.
        embedding_vectors = model.get_input_embeddings().weight.detach()

    token_ids = torch.as_tensor(tokenized_sentence["input_ids"])
    if token_ids.dim() > 1:
        # Tokenizer output carries a leading batch dimension of size 1.
        token_ids = token_ids[0]

    token_vectors = embedding_vectors[token_ids]  # (seq_len, dim)

    # One summed vector per window of n consecutive tokens.
    representations = [
        token_vectors[start : start + n].sum(dim=0)
        for start in range(token_vectors.shape[0] - n + 1)
    ]

    if not representations:
        return embedding_vectors.new_zeros(embedding_vectors.shape[1])

    return torch.stack(representations).sum(dim=0)
    

# Build the representation of all three sentences: the distance comparison
# in the following cell reads `sentence_u_0_gram` and `sentence_k_0_gram`
# as well, so they must be defined here for a fresh-kernel run to work.
sentence_v_0_gram = compute_n_gram_representation(sentence_v_tokenized)
sentence_u_0_gram = compute_n_gram_representation(sentence_u_tokenized)
sentence_k_0_gram = compute_n_gram_representation(sentence_k_tokenized)

# Display the shape of each representation.
sentence_v_0_gram.shape, sentence_u_0_gram.shape, sentence_k_0_gram.shape

(torch.Size([768]), torch.Size([768]), torch.Size([768]))

In [23]:
# Vector norm of the difference between the sentence representations:
#   dist_1 -> sentence u vs. sentence k
#   dist_2 -> sentence v vs. sentence u
dist_1, dist_2 = (
    torch.linalg.vector_norm(left - right)
    for left, right in (
        (sentence_u_0_gram, sentence_k_0_gram),
        (sentence_v_0_gram, sentence_u_0_gram),
    )
)

print(dist_1, dist_2)
# Is u closer to k than v is to u?
dist_1 < dist_2

tensor(2.5507) tensor(3.3529)


tensor(True)

In [20]:
# @TODO: is this right?
# Sanity check: run one sentence through the model with gradient tracking
# disabled and look at the shapes of the two outputs it exposes.
encoded_v = tokenizer(sentence_v, return_tensors="pt")
with torch.no_grad():
    tmp = model(**encoded_v)

(tmp.last_hidden_state.shape, tmp.pooler_output.shape)

(torch.Size([1, 11, 768]), torch.Size([1, 768]))

In [26]:
# For every sentence, keep the last-layer hidden vector at position 0 of the
# first (only) batch item -- position 0 corresponds to the [CLS] token.
# The generator is consumed by the unpacking while still inside `no_grad`,
# so all three forward passes run without gradient tracking (order: v, u, k).
with torch.no_grad():
    sentence_v_transformers, sentence_u_transformers, sentence_k_transformers = (
        model(**tokenizer(sentence, return_tensors="pt")).last_hidden_state[0, 0, :]
        for sentence in (sentence_v, sentence_u, sentence_k)
    )

(
    sentence_v_transformers.shape,
    sentence_u_transformers.shape,
    sentence_k_transformers.shape,
)

(torch.Size([768]), torch.Size([768]), torch.Size([768]))

In [27]:
# Same comparison as before, now on the [CLS] vectors from the model:
#   dist_1 -> sentence u vs. sentence k
#   dist_2 -> sentence v vs. sentence u
dist_1, dist_2 = (
    torch.linalg.vector_norm(left - right)
    for left, right in (
        (sentence_u_transformers, sentence_k_transformers),
        (sentence_v_transformers, sentence_u_transformers),
    )
)

print(dist_1, dist_2)
# Is u closer to k than v is to u?
dist_1 < dist_2

tensor(5.7802) tensor(5.6262)


tensor(False)

# Conclusion

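In the outputs recorded above, the two kinds of sentence vector disagree. With the `*_0_gram` vectors, sentence *u* is closer to sentence *k* (distance 2.55) than to sentence *v* (3.35). With the `[CLS]` vectors taken from the model's last hidden state, the ordering flips: *u* is slightly closer to *v* (5.63) than to *k* (5.78).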