# Transformer Sentence Embedding

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch

In [2]:
# Hugging Face Hub identifier of the checkpoint used throughout this notebook.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# Load the tokenizer and the model for that same checkpoint so that the
# token ids produced by the tokenizer match what the model expects.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example sentence that every following cell tokenizes and embeds.
text = "today I solved some excellent algorithmic problems"

## List All Tokens

In [3]:
# Tokenize the sentence; the result holds PyTorch tensors with a leading
# batch axis (a batch of one sentence here).
encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Take the single row of the batch and turn it into plain Python ints.
tokens = encoded.input_ids[0].tolist()

# Decode every id on its own so each token is printed separately,
# numbered from 1.
for position, token_id in enumerate(tokens, start=1):
    print(f"Token {position}: {tokenizer.decode(token_id)}")

Token 1: [CLS]
Token 2: today
Token 3: i
Token 4: solved
Token 5: some
Token 6: excellent
Token 7: algorithm
Token 8: ##ic
Token 9: problems
Token 10: [SEP]

## Token IDs

In [4]:
# Show the raw token ids followed by the attention mask, in that order.
for field in ('input_ids', 'attention_mask'):
    print(encoded[field])

tensor([[  101,  2651,  1045, 13332,  2070,  6581,  9896,  2594,  3471,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## Initial Token Embeddings (Pre-Transformer)

In [5]:
# Run only the model's `embeddings` sub-module on the token ids, i.e. the
# per-token vectors as they are before the transformer layers process them.
initial_embeddings = model.embeddings(encoded['input_ids'])
# Print the shape first, then the tensor itself, each on its own line.
print(initial_embeddings.shape, initial_embeddings, sep='\n')

torch.Size([1, 10, 384])
tensor([[[-0.1766, -0.0482,  0.0377,  ...,  0.0310,  0.1154, -0.2001],
         [ 0.1899,  0.4778, -0.6466,  ...,  0.0499, -0.5361,  0.4469],
         [-0.2715, -0.5152,  0.1800,  ..., -0.5518, -0.7233, -0.7180],
         ...,
         [-0.7215, -0.0884,  0.2669,  ...,  0.9165, -0.4877, -0.6912],
         [-0.5559, -0.1996, -0.2737,  ...,  0.8535, -0.2942,  0.7360],
         [ 0.2826,  0.1163, -0.2290,  ...,  0.1418, -0.0491, -0.1000]]],
       grad_fn=<NativeLayerNormBackward0>)

## Executing the Model

In [6]:
# Full forward pass: `**encoded` forwards every field the tokenizer
# produced to the model as keyword arguments.
output = model(**encoded)
# One contextualised vector per input token.
last_hidden = output.last_hidden_state
print(last_hidden.shape)
print(last_hidden)

torch.Size([1, 10, 384])
tensor([[[-0.1642,  0.4660,  0.0489,  ...,  0.2386, -0.0663, -0.0930],
         [-0.5260,  0.8277,  0.7240,  ..., -0.7451, -1.1621,  1.1199],
         [ 0.1473,  0.2505,  0.1579,  ..., -0.2087, -1.1386, -0.6368],
         ...,
         [-0.5978,  0.6489, -0.2759,  ...,  0.2102, -0.1217, -0.2887],
         [-0.8411,  0.7750,  0.2105,  ...,  1.0205, -0.1044,  0.5411],
         [-0.3763,  0.5162, -0.3255,  ...,  0.2374, -0.0547, -0.1911]]],
       grad_fn=<NativeLayerNormBackward0>)

## Mean Pooling

Since the attention mask consists entirely of `1`s (no padding tokens),
the plain mean over all token embeddings is correct here. If the
attention mask contained `0`s (padding), those positions would have to be
excluded: multiply each token embedding by its mask value, sum over the
tokens, and divide by the number of valid (mask `1`) tokens.

In [7]:
# Guard for the next cell: the plain mean is only valid when no position
# is padding, i.e. every attention-mask entry equals 1.
assert encoded['attention_mask'].eq(1).all()

In [8]:
# Average over dim 1 (the token axis) so that each sentence in the batch
# is reduced to a single vector.
manual_pooling = torch.mean(output.last_hidden_state, dim=1)
print(manual_pooling.shape)
print(manual_pooling)

torch.Size([1, 384])
tensor([[-4.8428e-01,  5.7991e-01,  1.9769e-01, -4.0213e-01, -1.5788e-01,
         -6.0335e-01,  2.1247e-01, -1.3929e-01, -8.9269e-01,  5.5423e-02,
         -2.8129e-01,  2.2564e-01,  2.0014e-01,  6.2436e-01, -1.8120e-01,
          4.4146e-01, -5.3258e-01,  3.2513e-01, -2.6029e-01, -4.1456e-01,
         -8.3189e-01,  1.0587e-01, -3.7187e-01,  2.1000e-02, -1.7820e-02,
          4.4710e-01,  1.6586e-01, -6.2919e-01,  3.7707e-01, -2.2445e-01,
         -1.4096e-01,  2.3643e-01,  2.7513e-01,  3.1801e-02, -1.0134e-01,
          1.6924e-01, -1.3359e-01, -6.0268e-02, -1.0087e-01,  1.1035e-01,
         -1.0007e-01, -3.9838e-02,  3.1602e-02, -1.0582e-01,  1.3354e-01,
         -4.3479e-02, -2.6800e-01,  2.8919e-01,  2.9595e-01, -4.6990e-01,
         -6.8446e-01, -3.5652e-01, -4.8612e-01, -3.3313e-01, -1.5210e-01,
         -5.2247e-01,  3.6610e-01, -2.6288e-02,  2.5394e-01, -3.9058e-01,
          2.1569e-01, -1.2577e-01, -2.3336e-01,  2.8456e-02,  3.2298e-01,
          2.5926e