In [96]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


sentences = ["This is an example sentence", "Each sentence is converted to to to to"]

# Using Sentence Transformers

In [119]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = model.encode(sentences)
print(embeddings.mean(1), embeddings.std(1))

[-6.5167456e-05 -1.3562296e-04] [0.03608433 0.03608414]


# Using Transformers (We use only last_hidden_state)

In [120]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)
print(model_output.keys())
print(f"last_hidden_state size: {model_output.last_hidden_state.shape}")
print(f"pooler_output size: {model_output.pooler_output.shape}")

odict_keys(['last_hidden_state', 'pooler_output'])
last_hidden_state size: torch.Size([2, 10, 768])
pooler_output size: torch.Size([2, 768])


In [121]:
# Now we need to convert the embeddings to sentence embeddings
# We need the last_hidden_state along with the attention mask
token_embeddings = model_output[0]
print(f"token embeddings (or last_hidden_state) shape: {token_embeddings.shape}")

attention_mask = encoded_input["attention_mask"]
print(f"attention mask:\n{attention_mask}")
print(f"attention mask shape: {attention_mask.shape}")
# We need to expand the attention mask
unsqueezed_attention_mask = attention_mask.unsqueeze(-1)
print(f"unsqueezed attention mask shape: {unsqueezed_attention_mask.shape}")
# Now expand the token embeddings into the same shape of the token embeddings
input_mask_expanded = unsqueezed_attention_mask.expand(token_embeddings.shape)
print(f"expanded token embeddings shape: {input_mask_expanded.shape}")
print(f"input_mask_expanded[0][0] should be all ones: {sum(input_mask_expanded[0][9])}")
print(
    f"But input_mask_expanded[0][9] should be all zeros: {sum(input_mask_expanded[0][9])}"
)
print(
    f"But input_mask_expanded[1][9] should be all ones: {sum(input_mask_expanded[1][9])}"
)
# Now sum the embeddings over dim 1 (mask them by attention mask)
summed_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
print(f"summed embeddings shape: {summed_embeddings.shape}")

# Compute mean by dividing by the number of tokens (real tokens not padded)
num_real_tokens = torch.clamp(
    input_mask_expanded.sum(1), min=1e-9
)  # We used torch.clamp to avoid division by zero
print(f"num_real_tokens shape: {num_real_tokens.shape}")

# Divide
mean_pooled_embeddings = summed_embeddings / num_real_tokens
print(f"mean_pooled_embeddings shape: {mean_pooled_embeddings.shape}")

# L2 Normalization
normalized_embeddings = F.normalize(mean_pooled_embeddings, p=2, dim=1)

token embeddings (or last_hidden_state) shape: torch.Size([2, 10, 768])
attention mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
attention mask shape: torch.Size([2, 10])
unsqueezed attention mask shape: torch.Size([2, 10, 1])
expanded token embeddings shape: torch.Size([2, 10, 768])
input_mask_expanded[0][0] should be all ones: 0
But input_mask_expanded[0][9] should be all zeros: 0
But input_mask_expanded[1][9] should be all ones: 768
summed embeddings shape: torch.Size([2, 768])
num_real_tokens shape: torch.Size([2, 768])
mean_pooled_embeddings shape: torch.Size([2, 768])


In [122]:
print(normalized_embeddings.mean(1), normalized_embeddings.std(1))

tensor([-6.5168e-05, -1.3562e-04]) tensor([0.0361, 0.0361])


## Utils

### Normalization

In [None]:
# L2 Normalization
sample_arr = torch.tensor([[1, 2, 3, 4], [4, 5, 6, 7]]).float()
print(f"Shape of sample_arr: {sample_arr.shape}")
L2_Normalized = F.normalize(sample_arr, dim=1, p=2)
print(f"L2 Norm: {L2_Normalized}")

# Do it manually:
L2_norm = torch.norm(sample_arr, dim=1, p=2)
print(f"L2 Norm itself: {L2_norm}")
# divide by it:
print(
    f"Divided by L2 Norm: {sample_arr / L2_norm.unsqueeze(-1).expand(sample_arr.shape)}"
)

# You can compute the norm itself manually:
l2_norm_manually = torch.sqrt(torch.sum(sample_arr**2, dim=1))
print(f"L2 Norm manually: {l2_norm_manually}")
print(
    "Divided by L2 Norm manually: ",
    sample_arr / l2_norm_manually.unsqueeze(-1).expand(sample_arr.shape),
)

Shape of sample_arr: torch.Size([2, 4])
L2 Norm: tensor([[0.1826, 0.3651, 0.5477, 0.7303],
        [0.3563, 0.4454, 0.5345, 0.6236]])
L2 Norm itself: tensor([ 5.4772, 11.2250])
Divided by L2 Norm: tensor([[0.1826, 0.3651, 0.5477, 0.7303],
        [0.3563, 0.4454, 0.5345, 0.6236]])
L2 Norm manually: tensor([ 5.4772, 11.2250])
Divided by L2 Norm manually:  tensor([[0.1826, 0.3651, 0.5477, 0.7303],
        [0.3563, 0.4454, 0.5345, 0.6236]])


# Sentence Classification

In [117]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=10
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [118]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [84]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         