# Transformer Sentence Embedding

In [1]:
from transformers import AutoModel, AutoTokenizer, pipeline
import torch

In [2]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

text = "today I solved some excellent algorithmic problems"

## List All Tokens

In [3]:
encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

tokens = encoded.input_ids.tolist()[0]

for i, token_id in enumerate(tokens):
    decoded_token = tokenizer.decode(token_id)
    print(f"Token {i + 1}: {decoded_token}")

Token 1: [CLS]
Token 2: today
Token 3: i
Token 4: solved
Token 5: some
Token 6: excellent
Token 7: algorithm
Token 8: ##ic
Token 9: problems
Token 10: [SEP]

## Token IDs

In [4]:
print(encoded['input_ids'])
print(encoded['attention_mask'])

tensor([[  101,  2651,  1045, 13332,  2070,  6581,  9896,  2594,  3471,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## Executing the Model

In [5]:
output = model(**encoded)
print(output.last_hidden_state.shape)
print(output.last_hidden_state)

torch.Size([1, 10, 384])
tensor([[[-0.1642,  0.4660,  0.0489,  ...,  0.2386, -0.0663, -0.0930],
         [-0.5260,  0.8277,  0.7240,  ..., -0.7451, -1.1621,  1.1199],
         [ 0.1473,  0.2505,  0.1579,  ..., -0.2087, -1.1386, -0.6368],
         ...,
         [-0.5978,  0.6489, -0.2759,  ...,  0.2102, -0.1217, -0.2887],
         [-0.8411,  0.7750,  0.2105,  ...,  1.0205, -0.1044,  0.5411],
         [-0.3763,  0.5162, -0.3255,  ...,  0.2374, -0.0547, -0.1911]]],
       grad_fn=<NativeLayerNormBackward0>)

## Mean Pooling (Manually)

Since the attention mask consists entirely of `1`s (no padding tokens),
we can safely compute the simple mean over all token embeddings. If the
attention mask contains `0`s (indicating padding), a weighted mean that
accounts for valid tokens is required instead.

In [6]:
assert torch.all(encoded['attention_mask'] == 1)

In [7]:
manual_pooling = output.last_hidden_state.mean(dim=0)
print(manual_pooling)

tensor([[-0.1642,  0.4660,  0.0489,  ...,  0.2386, -0.0663, -0.0930],
        [-0.5260,  0.8277,  0.7240,  ..., -0.7451, -1.1621,  1.1199],
        [ 0.1473,  0.2505,  0.1579,  ..., -0.2087, -1.1386, -0.6368],
        ...,
        [-0.5978,  0.6489, -0.2759,  ...,  0.2102, -0.1217, -0.2887],
        [-0.8411,  0.7750,  0.2105,  ...,  1.0205, -0.1044,  0.5411],
        [-0.3763,  0.5162, -0.3255,  ...,  0.2374, -0.0547, -0.1911]],
       grad_fn=<MeanBackward1>)

## Calculate Pooling Using Library

In [8]:
extractor = pipeline("feature-extraction", model=model_name)
pooling = torch.tensor(extractor(text))
print(pooling)

Device set to use cpu

tensor([[[-0.1642,  0.4660,  0.0489,  ...,  0.2386, -0.0663, -0.0930],
         [-0.5260,  0.8277,  0.7240,  ..., -0.7451, -1.1621,  1.1199],
         [ 0.1473,  0.2505,  0.1579,  ..., -0.2087, -1.1386, -0.6368],
         ...,
         [-0.5978,  0.6489, -0.2759,  ...,  0.2102, -0.1217, -0.2887],
         [-0.8411,  0.7750,  0.2105,  ...,  1.0205, -0.1044,  0.5411],
         [-0.3763,  0.5162, -0.3255,  ...,  0.2374, -0.0547, -0.1911]]])

## Compare Both Values

In [9]:
torch.allclose(manual_pooling, pooling, rtol=1e-5, atol=1e-8)

True