# Transformer Sentence Embedding

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
# Print tensors in plain decimal notation (no scientific notation) and show
# 5 leading/trailing items per dimension before the rest is elided as "...".
torch.set_printoptions(sci_mode=False, edgeitems=5)

In [2]:
# One checkpoint name feeds both from_pretrained calls, so the tokenizer's
# vocabulary and the model's weights come from the same checkpoint.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example sentence that is tokenized and embedded in the cells below.
text = "today I solved some excellent algorithmic problems"

## List All Tokens

In [3]:
# Tokenize the sentence into PyTorch tensors. For this single short sentence
# padding/truncation change nothing, but they keep the call batch-ready.
encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Token ids of the first (and only) sequence in the batch, as plain ints.
tokens = encoded.input_ids.tolist()[0]

# Map the ids back to vocabulary entries in a single call instead of running
# the detokenizer on every id; this keeps the raw pieces such as "##ic".
for position, token in enumerate(tokenizer.convert_ids_to_tokens(tokens), start=1):
    print(f"Token {position}: {token}")

Token 1: [CLS]
Token 2: today
Token 3: i
Token 4: solved
Token 5: some
Token 6: excellent
Token 7: algorithm
Token 8: ##ic
Token 9: problems
Token 10: [SEP]

## Token IDs

In [4]:
# Show the vocabulary ids and, on the next line, the attention mask
# (1 = real token, 0 = padding) produced by the tokenizer.
print(encoded['input_ids'], encoded['attention_mask'], sep="\n")

tensor([[  101,  2651,  1045, 13332,  2070,  6581,  9896,  2594,  3471,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## Initial Token Embeddings (Pre-Transformer)

In [5]:
# Run only the model's embedding layer on the token ids, i.e. the per-token
# vectors before any transformer layer has been applied.
initial_embeddings = model.embeddings(encoded['input_ids'])

# Shape first, then the values; one row per token of the input sentence.
print(initial_embeddings.shape, initial_embeddings, sep="\n")

torch.Size([1, 10, 384])
tensor([[[-0.1766, -0.0482,  0.0377, -0.0157,  0.0063,  ...,  0.0379,  0.1696,  0.0310,  0.1154, -0.2001],
         [ 0.1899,  0.4778, -0.6466, -0.0620,  0.4477,  ..., -0.3027, -0.8269,  0.0499, -0.5361,  0.4469],
         [-0.2715, -0.5152,  0.1800,  0.1759,  0.1213,  ..., -0.0806,  1.1190, -0.5518, -0.7233, -0.7180],
         [-0.6097, -0.4077, -0.3073,  0.0387, -0.2921,  ..., -0.1415, -0.0782, -0.0844,  0.1180,  0.1815],
         [-0.1910, -0.3607,  0.8348,  0.0936,  0.3426,  ..., -0.2924, -1.2255,  0.4598,  0.1018,  1.4967],
         [-0.0791,  0.8314,  0.6794,  0.6199,  0.1778,  ..., -0.2922,  0.6357,  1.1416, -0.3689, -0.1879],
         [-0.6089, -0.2179,  0.0612,  0.7646, -0.4000,  ...,  0.7202, -0.5597,  0.1309,  0.6490, -0.4538],
         [-0.7215, -0.0884,  0.2669, -0.2393, -0.1560,  ...,  0.0523, -0.2557,  0.9165, -0.4877, -0.6912],
         [-0.5559, -0.1996, -0.2737,  0.4928, -0.2631,  ..., -0.4326, -0.2903,  0.8535, -0.2942,  0.7360],
         [ 0

## Processing Embeddings with the Transformer

In [6]:
# Full forward pass: every tensor returned by the tokenizer is passed to the
# model as a keyword argument.
output = model(**encoded)

# last_hidden_state holds one contextualised vector per input token.
print(output.last_hidden_state.shape, output.last_hidden_state, sep="\n")

torch.Size([1, 10, 384])
tensor([[[    -0.1642,      0.4660,      0.0489,     -0.4458,     -0.1046,  ...,      0.1672,     -0.1306,      0.2386,     -0.0663,     -0.0930],
         [    -0.5260,      0.8277,      0.7240,      0.0294,      0.4266,  ...,      0.0601,     -0.0381,     -0.7451,     -1.1621,      1.1199],
         [     0.1473,      0.2505,      0.1579,     -0.7921,      0.1306,  ...,      0.1652,      1.1664,     -0.2087,     -1.1386,     -0.6368],
         [    -0.4840,      1.1230,      0.6628,     -0.0099,     -0.5324,  ...,      0.2173,     -0.2530,      0.8590,      0.3046,     -0.1156],
         [    -0.5985,      0.2569,      0.3128,     -0.2144,      0.1356,  ...,      0.2941,      0.0237,      0.1431,     -0.1129,      0.7390],
         [    -0.4394,      0.5312,      0.4112,     -0.3248,     -0.4767,  ...,      0.6671,      0.8386,      0.8715,     -0.3522,      0.3678],
         [    -0.9628,      0.4037,      0.0501,     -0.8665,     -0.3116,  ...,     -0.2301,

## Mean Pooling

Since the attention mask consists entirely of `1`s (no padding tokens),
we can safely take the plain mean over all token embeddings. If the
attention mask contained `0`s (indicating padding), we would instead need a
masked mean: sum only the embeddings of valid tokens and divide by the
number of valid tokens.

In [7]:
# Guard for the unweighted mean in the next cell: stop here if any position
# in the attention mask is not 1 (i.e. the batch contains padding).
assert torch.all(encoded['attention_mask'] == 1)

In [8]:
# Collapse the token axis (dim=1) by averaging, leaving a single vector per
# sentence. Valid only because the mask check above found no padding.
pooling = torch.mean(output.last_hidden_state, dim=1)

# Shape first, then the sentence embedding itself.
print(pooling.shape, pooling, sep="\n")

torch.Size([1, 384])
tensor([[-0.4843,  0.5799,  0.1977, -0.4021, -0.1579,  ...,  0.2508,  0.1400,  0.2352, -0.1897,  0.0249]], grad_fn=<MeanBackward1>)