# Transformer Sentence Embedding

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
# Print tensors in fixed-point notation (no scientific notation) and keep
# 10 entries at each edge of a dimension when the output is abbreviated.
torch.set_printoptions(edgeitems=10, sci_mode=False)

In [2]:
# Sentence that the following cells tokenize and embed.
text = "today I solved some excellent algorithmic problems"

# One checkpoint id is used for both the model and its tokenizer.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## List All Tokens

In [3]:
# Tokenize the sentence and get the result back as PyTorch tensors.
encoded = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

# Token ids of the first (and here only) sequence, as a plain Python list.
tokens = encoded.input_ids[0].tolist()

# Decode every id on its own so each token is listed separately, numbered from 1.
for position, token_id in enumerate(tokens, start=1):
    print(f"Token {position}: {tokenizer.decode(token_id)}")

Token 1: [CLS]
Token 2: today
Token 3: i
Token 4: solved
Token 5: some
Token 6: excellent
Token 7: algorithm
Token 8: ##ic
Token 9: problems
Token 10: [SEP]

## Token IDs

In [4]:
# Show the token ids first, then the attention mask.
for field in ('input_ids', 'attention_mask'):
    print(encoded[field])

tensor([[  101,  2651,  1045, 13332,  2070,  6581,  9896,  2594,  3471,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## Initial Token Embeddings (Pre-Transformer)

In [5]:
# Call the model's embedding module directly on the token ids, i.e. the stage
# this section refers to as "pre-Transformer".
# The result is only inspected, so run it under no_grad to avoid recording an
# autograd graph for it.
with torch.no_grad():
    initial_embeddings = model.embeddings(encoded.input_ids)
print(initial_embeddings.shape)
print(initial_embeddings)

torch.Size([1, 10, 384])
tensor([[[    -0.1766,     -0.0482,      0.0377,     -0.0157,      0.0063,
              -0.0312,     -0.0682,     -0.0068,      0.0010,      0.1356,
           ...,      0.0268,     -0.0989,     -0.0329,     -0.1181,
              -0.0277,      0.0379,      0.1696,      0.0310,      0.1154,
              -0.2001],
         [     0.1899,      0.4778,     -0.6466,     -0.0620,      0.4477,
              -0.2278,      0.5765,      0.1068,      0.0290,      0.1077,
           ...,      0.0714,      0.0220,     -0.2712,      0.4672,
               0.0867,     -0.3027,     -0.8269,      0.0499,     -0.5361,
               0.4469],
         [    -0.2715,     -0.5152,      0.1800,      0.1759,      0.1213,
               0.6639,      1.2929,      0.4178,     -0.2799,      0.3029,
           ...,      0.6342,      0.6704,     -0.0137,      0.5679,
               0.1162,     -0.0806,      1.1190,     -0.5518,     -0.7233,
              -0.7180],
         [    -0.6097,  

## Processing Embeddings with the Transformer

In [6]:
# Full forward pass: every tensor returned by the tokenizer (input_ids,
# attention_mask, ...) is forwarded to the model as a keyword argument.
# This is inference only, so disable gradient tracking; otherwise the hidden
# states (and everything derived from them) carry an autograd graph.
with torch.no_grad():
    output = model(**encoded)
# last_hidden_state holds one vector per token: (batch, tokens, hidden size).
print(output.last_hidden_state.shape)
print(output.last_hidden_state)

torch.Size([1, 10, 384])
tensor([[[    -0.1642,      0.4660,      0.0489,     -0.4458,     -0.1046,
              -0.4053,     -0.0664,     -0.2672,     -0.6509,     -0.0402,
           ...,      0.2549,     -0.1511,      0.0672,     -0.3342,
               0.0627,      0.1672,     -0.1306,      0.2386,     -0.0663,
              -0.0930],
         [    -0.5260,      0.8277,      0.7240,      0.0294,      0.4266,
              -0.5625,      0.5401,      0.3576,     -1.2588,     -0.1127,
           ...,     -0.1882,     -0.3274,     -0.4097,     -0.6994,
              -0.2568,      0.0601,     -0.0381,     -0.7451,     -1.1621,
               1.1199],
         [     0.1473,      0.2505,      0.1579,     -0.7921,      0.1306,
              -0.9031,      1.2293,      0.5582,     -0.9292,     -0.2025,
           ...,      0.7037,     -0.0821,      0.1968,     -0.4797,
              -0.1288,      0.1652,      1.1664,     -0.2087,     -1.1386,
              -0.6368],
         [    -0.4840,  

## Mean Pooling

Since the attention mask consists entirely of `1`s (no padding tokens),
we can safely compute the simple mean over all token embeddings. If the
attention mask contained `0`s (indicating padding), a masked mean would be
required instead: sum the embeddings of the valid tokens only and divide
by the number of valid tokens.

In [7]:
# Guard for the plain mean below: every mask entry must be 1 (no padding).
assert encoded['attention_mask'].eq(1).all()

In [8]:
# Average the per-token vectors along the token axis (dim 1) to obtain a
# single vector for the sentence.
pooling = torch.mean(output.last_hidden_state, dim=1)
print(pooling)

tensor([[-0.4843,  0.5799,  0.1977, -0.4021, -0.1579, -0.6034,  0.2125, -0.1393,
         -0.8927,  0.0554,  ...,  0.4431, -0.2075,  0.1338, -0.4599,  0.0789,
          0.2508,  0.1400,  0.2352, -0.1897,  0.0249]],
       grad_fn=<MeanBackward1>)