# Visualising large language model embeddings with context

In a <a href="https://x.com/augustasmac/status/1775264780028367173?s=20" target="_blank">recent post</a>...

# Imports

Toggle cells below if you want to see what imports are being made.

In [43]:
#|code-fold: true
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
#|code-fold: true
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# Ensures we can render plotly plots with quarto
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [78]:
def perform_distance_comparison(s1, s2, s3):
    euclidean_dist_1 = torch.linalg.vector_norm(s1 - s2).item()
    euclidean_dist_2 = torch.linalg.vector_norm(s1 - s3).item()

    print(euclidean_dist_1, euclidean_dist_2)
    print(f"|s1 - s2| < |s1 - s3| = {euclidean_dist_1 < euclidean_dist_2}")

    cosine_sim_1 = F.cosine_similarity(s1[None, :], s2[None, :])[0].item()
    cosine_sim_2 = F.cosine_similarity(s1[None, :], s3[None, :])[0].item()

    print(cosine_sim_1, cosine_sim_2)
    print(f"sim(s1, s2) > sim(s1, s3) = {cosine_sim_1 > cosine_sim_2}")


In [79]:
model_name = "google-bert/bert-base-cased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [80]:
sentence_1 = "sensitive information"
sentence_2 = "confidential details"
sentence_3 = "sensitive individual"

In [81]:
sentence_1_tokenized = tokenizer(sentence_1, return_tensors="pt")
sentence_2_tokenized = tokenizer(sentence_2, return_tensors="pt")
sentence_3_tokenized = tokenizer(sentence_3, return_tensors="pt")

print(sentence_1_tokenized["input_ids"].shape)
print(sentence_2_tokenized["input_ids"].shape)
print(sentence_3_tokenized["input_ids"].shape)

with torch.no_grad():
    sentence_1_transformers = model(**sentence_1_tokenized).last_hidden_state[0, 0, :]
    sentence_2_transformers = model(**sentence_2_tokenized).last_hidden_state[0, 0, :]
    sentence_3_transformers = model(**sentence_3_tokenized).last_hidden_state[0, 0, :]

sentence_1_transformers.shape, sentence_2_transformers.shape, sentence_3_transformers.shape

torch.Size([1, 4])
torch.Size([1, 4])
torch.Size([1, 4])


(torch.Size([768]), torch.Size([768]), torch.Size([768]))

In [82]:
perform_distance_comparison(
    sentence_1_transformers, sentence_2_transformers, sentence_3_transformers
)

4.9806365966796875 6.78804874420166
|s1 - s2| < |s1 - s3| = True
0.9552195072174072 0.9001082181930542
sim(s1, s2) > sim(s1, s3) = True


## Harder example

In [100]:
sentence_1 = "your data removal request has been reviewed and concluded"
sentence_2 = "the sensitive personal information has been deleted"
sentence_3 = "she has been a sensitive person"

Use `[CLS]` pooling according to [this](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1):

In [101]:
# @TODO: is this the best way to do n-grams?
def compute_n_gram_representation(sentence: str, model, n=1):
    representations = []

    words = sentence.split()
    for i in range(len(words) - n):
        tokenized_words = tokenizer(" ".join(words[i:i+n+1]), return_tensors="pt")

        with torch.no_grad():
            curr_repr = model(**tokenized_words).last_hidden_state[0, 0]
        
        representations.append(curr_repr)

    return torch.vstack(representations).T.mean(axis=1)


In [102]:
sentence_1_1_gram = compute_n_gram_representation(sentence_1, model, n=0)
sentence_2_1_gram = compute_n_gram_representation(sentence_2, model, n=0)
sentence_3_1_gram = compute_n_gram_representation(sentence_3, model, n=0)

sentence_1_1_gram.shape, sentence_2_1_gram.shape, sentence_3_1_gram.shape

(torch.Size([768]), torch.Size([768]), torch.Size([768]))

In [103]:
perform_distance_comparison(sentence_1_1_gram, sentence_2_1_gram, sentence_3_1_gram)

1.922633171081543 2.517957925796509
|s1 - s2| < |s1 - s3| = True
0.9925070405006409 0.9873220920562744
sim(s1, s2) > sim(s1, s3) = True


In [93]:
sentence_1_tokenized = tokenizer(sentence_1, return_tensors="pt")
sentence_2_tokenized = tokenizer(sentence_2, return_tensors="pt")
sentence_3_tokenized = tokenizer(sentence_3, return_tensors="pt")

print(sentence_1_tokenized["input_ids"].shape)
print(sentence_2_tokenized["input_ids"].shape)
print(sentence_3_tokenized["input_ids"].shape)

with torch.no_grad():
    sentence_1_transformers = model(**sentence_1_tokenized).last_hidden_state[0, 0, :]
    sentence_2_transformers = model(**sentence_2_tokenized).last_hidden_state[0, 0, :]
    sentence_3_transformers = model(**sentence_3_tokenized).last_hidden_state[0, 0, :]

sentence_1_transformers.shape, sentence_2_transformers.shape, sentence_3_transformers.shape

torch.Size([1, 11])
torch.Size([1, 9])
torch.Size([1, 8])


(torch.Size([768]), torch.Size([768]), torch.Size([768]))

In [94]:
perform_distance_comparison(
    sentence_1_transformers, sentence_2_transformers, sentence_3_transformers
)

5.626187324523926 7.529390335083008
|s1 - s2| < |s1 - s3| = True
0.940568208694458 0.8929167985916138
sim(s1, s2) > sim(s1, s3) = True


## Try the same with a text embedding model

In [52]:
model_name = "mixedbread-ai/mxbai-embed-large-v1"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [54]:
sentence_v_1_gram = compute_n_gram_representation(sentence_v, model, n=1)
sentence_u_1_gram = compute_n_gram_representation(sentence_u, model, n=1)
sentence_k_1_gram = compute_n_gram_representation(sentence_k, model, n=1)

sentence_v_1_gram.shape, sentence_u_1_gram.shape, sentence_k_1_gram.shape

2
sensitive information
2
confidential details
2
sensitive individual


(torch.Size([1024]), torch.Size([1024]), torch.Size([1024]))

In [55]:
euclidean_dist_1 = torch.linalg.vector_norm(sentence_u_1_gram - sentence_k_1_gram).item()
euclidean_dist_2 = torch.linalg.vector_norm(sentence_v_1_gram - sentence_u_1_gram).item()

print(euclidean_dist_1, euclidean_dist_2)
print(euclidean_dist_1 < euclidean_dist_2)
# assert euclidean_dist_1 < euclidean_dist_2  # should be true

cosine_sim_1 = F.cosine_similarity(sentence_u_1_gram[None, :], sentence_k_1_gram[None, :])[0].item()
cosine_sim_2 = F.cosine_similarity(sentence_v_1_gram[None, :], sentence_u_1_gram[None, :])[0].item()

print(cosine_sim_1, cosine_sim_2)
print(cosine_sim_1 < cosine_sim_2)
# assert not cosine_sim_1 < cosine_sim_2  # should be false

13.8001070022583 8.948094367980957
False
0.6227388978004456 0.84317946434021
True


In [56]:
with torch.no_grad():
    sentence_v_transformers = model(**sentence_v_tokenized).last_hidden_state[0, 0, :]
    sentence_u_transformers = model(**sentence_u_tokenized).last_hidden_state[0, 0, :]
    sentence_k_transformers = model(**sentence_k_tokenized).last_hidden_state[0, 0, :]

sentence_v_transformers.shape, sentence_u_transformers.shape, sentence_k_transformers.shape

(torch.Size([1024]), torch.Size([1024]), torch.Size([1024]))

In [57]:
euclidean_dist_1 = torch.linalg.vector_norm(sentence_u_transformers - sentence_k_transformers).item()
euclidean_dist_2 = torch.linalg.vector_norm(sentence_v_transformers - sentence_u_transformers).item()

print(euclidean_dist_1, euclidean_dist_2)
print(euclidean_dist_1 < euclidean_dist_2)  # should be false, but turns out to be true

cosine_sim_1 = F.cosine_similarity(sentence_u_transformers[None, :], sentence_k_transformers[None, :])[0].item()
cosine_sim_2 = F.cosine_similarity(sentence_v_transformers[None, :], sentence_u_transformers[None, :])[0].item()

print(cosine_sim_1, cosine_sim_2)
print(cosine_sim_1 < cosine_sim_2)  # should be true, but turns out to be false

17.151620864868164 16.00449562072754
False
0.40056076645851135 0.4910770058631897
True


# Conclusion

Some conclusion