# Transformer Sentence Embedding

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
torch.set_printoptions(sci_mode=False)
torch.set_printoptions(edgeitems=10)

In [2]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

text = "today I solved some excellent algorithmic problems"

## List All Tokens

In [3]:
encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

tokens = encoded.input_ids.tolist()[0]

for i, token_id in enumerate(tokens):
    decoded_token = tokenizer.decode(token_id)
    print(f"Token {i + 1}: {decoded_token}")

Token 1: [CLS]
Token 2: today
Token 3: i
Token 4: solved
Token 5: some
Token 6: excellent
Token 7: algorithm
Token 8: ##ic
Token 9: problems
Token 10: [SEP]

## Token IDs

In [4]:
print(encoded['input_ids'])
print(encoded['attention_mask'])

tensor([[  101,  2651,  1045, 13332,  2070,  6581,  9896,  2594,  3471,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## Initial Token Embeddings (Pre-Transformer)

In [5]:
initial_embeddings = model.embeddings(encoded.input_ids)
print(initial_embeddings.shape)
print(initial_embeddings)

torch.Size([1, 10, 384])
tensor([[[-1.7657e-01, -4.8236e-02,  3.7698e-02, -1.5718e-02,  6.3230e-03,
          -3.1231e-02, -6.8169e-02, -6.7792e-03,  9.7239e-04,  1.3556e-01,
           ...,  2.6754e-02, -9.8876e-02, -3.2883e-02, -1.1810e-01,
          -2.7678e-02,  3.7858e-02,  1.6958e-01,  3.1008e-02,  1.1538e-01,
          -2.0014e-01],
         [ 1.8995e-01,  4.7778e-01, -6.4656e-01, -6.1997e-02,  4.4774e-01,
          -2.2780e-01,  5.7647e-01,  1.0681e-01,  2.9041e-02,  1.0774e-01,
           ...,  7.1426e-02,  2.2001e-02, -2.7119e-01,  4.6715e-01,
           8.6744e-02, -3.0273e-01, -8.2688e-01,  4.9940e-02, -5.3608e-01,
           4.4687e-01],
         [-2.7146e-01, -5.1515e-01,  1.8001e-01,  1.7585e-01,  1.2130e-01,
           6.6390e-01,  1.2929e+00,  4.1779e-01, -2.7987e-01,  3.0289e-01,
           ...,  6.3417e-01,  6.7045e-01, -1.3659e-02,  5.6788e-01,
           1.1616e-01, -8.0553e-02,  1.1190e+00, -5.5179e-01, -7.2330e-01,
          -7.1802e-01],
         [-6.0968e-01, -

## Processing Embeddings with the Transformer

In [6]:
output = model(**encoded)
print(output.last_hidden_state.shape)
print(output.last_hidden_state)

torch.Size([1, 10, 384])
tensor([[[-1.6421e-01,  4.6603e-01,  4.8923e-02, -4.4581e-01, -1.0461e-01,
          -4.0528e-01, -6.6394e-02, -2.6723e-01, -6.5087e-01, -4.0242e-02,
           ...,  2.5490e-01, -1.5108e-01,  6.7190e-02, -3.3418e-01,
           6.2685e-02,  1.6719e-01, -1.3060e-01,  2.3860e-01, -6.6323e-02,
          -9.3048e-02],
         [-5.2605e-01,  8.2774e-01,  7.2402e-01,  2.9397e-02,  4.2663e-01,
          -5.6253e-01,  5.4013e-01,  3.5761e-01, -1.2588e+00, -1.1268e-01,
           ..., -1.8822e-01, -3.2735e-01, -4.0970e-01, -6.9944e-01,
          -2.5683e-01,  6.0083e-02, -3.8097e-02, -7.4507e-01, -1.1621e+00,
           1.1199e+00],
         [ 1.4730e-01,  2.5046e-01,  1.5792e-01, -7.9211e-01,  1.3057e-01,
          -9.0308e-01,  1.2293e+00,  5.5824e-01, -9.2916e-01, -2.0250e-01,
           ...,  7.0374e-01, -8.2109e-02,  1.9675e-01, -4.7967e-01,
          -1.2882e-01,  1.6525e-01,  1.1664e+00, -2.0873e-01, -1.1386e+00,
          -6.3680e-01],
         [-4.8400e-01,  

## Mean Pooling

Since the attention mask consists entirely of `1`s (no padding tokens),
we can safely compute the simple mean over all token embeddings. If the
attention mask contains `0`s (indicating padding), a weighted mean that
accounts for valid tokens is required instead.

In [7]:
assert torch.all(encoded['attention_mask'] == 1)

In [8]:
pooling = output.last_hidden_state.mean(dim=1)
print(pooling)

tensor([[-0.4843,  0.5799,  0.1977, -0.4021, -0.1579, -0.6034,  0.2125, -0.1393,
         -0.8927,  0.0554,  ...,  0.4431, -0.2075,  0.1338, -0.4599,  0.0789,
          0.2508,  0.1400,  0.2352, -0.1897,  0.0249]],
       grad_fn=<MeanBackward1>)