<a href="https://www.kaggle.com/code/aisuko/sentence-embeddings-with-transformers?scriptVersionId=167289770" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Overview

In [Semantic Search](https://www.kaggle.com/code/aisuko/semantic-search), we used `sentence-transformers` to compute the embeddings of our sentences. In this notebook, let's use the Transformers library directly, without installing `sentence-transformers`.

In [1]:
%%capture
!pip install transformers==4.35.2

In [2]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secrets client rather than
# hard-coding it in the notebook, and use it to log in.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# The checkpoint name is shared with the later cells through an environment
# variable; they read it back with os.getenv('MODEL_NAME').
os.environ["MODEL_NAME"] = "sentence-transformers/all-MiniLM-L6-v2"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading the tokenizer

We load the tokenizer here; no padding is configured at load time. Padding is requested later, when the sentences are encoded.

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint named in the MODEL_NAME environment variable.
tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"))

# Bare last expression so the notebook displays the tokenizer's repr.
tokenizer

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Loading the model

In [4]:
import torch
from transformers import AutoModel

# Prefer the GPU when one is available, but fall back to the CPU so this cell
# still runs on a CPU-only runtime instead of raising on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the checkpoint named in the MODEL_NAME environment variable.
model = AutoModel.from_pretrained(os.getenv('MODEL_NAME'))

# NOTE(review): this only stores a plain attribute on the module object. The
# tokenizer call later in the notebook passes max_length=200 explicitly --
# confirm whether anything actually reads `max_seq_length`.
model.max_seq_length = 200

# .to() moves the weights to the chosen device; as the last expression it also
# displays the model architecture.
model.to(device)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

# Converting the input to tokens

In [5]:
# Example queries to embed.
sentences = [
    "Which sports venue is a historic landmark in Melbourne?",
    "What are some of the events hosted in Melbourne throughout the year?"
]

# padding=True pads the shorter sentence to the longest one in the batch;
# truncation/max_length cap each sequence at 200 tokens; 'pt' returns PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=200,
    return_tensors='pt',
)

# Put the inputs on whatever device the model lives on, instead of a hard-coded
# 'cuda', so the model and its inputs can never end up on different devices.
encoded_input = encoded_input.to(model.device)
encoded_input

{'input_ids': tensor([[ 101, 2029, 2998, 6891, 2003, 1037, 3181, 8637, 1999, 4940, 1029,  102,
            0,    0,    0],
        [ 101, 2054, 2024, 2070, 1997, 1996, 2824, 4354, 1999, 4940, 2802, 1996,
         2095, 1029,  102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

# Computing token embeddings

In [6]:
import torch

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    model_output = model(**encoded_input)

## Mean Pooling

Take the attention mask into account so that padding tokens are excluded from the average.

In [7]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask with one entry per token (1 = keep, 0 = ignore).
            It is given a trailing axis and expanded to the embeddings' size.

    Returns:
        The sum of the unmasked token embeddings along dimension 1, divided by
        the number of unmasked tokens.
    """
    hidden_states = model_output[0]

    # Broadcast the mask across the embedding dimension so it can be multiplied
    # element-wise with the token embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_total = (hidden_states * mask).sum(1)

    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = mask.sum(1).clamp(min=1e-9)

    return masked_total / token_counts

# Pool the token embeddings using the batch's attention mask, then display the result.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings

tensor([[ 6.9722e-01,  3.1283e-01, -3.0573e-01, -1.4104e-01, -3.0273e-01,
          7.5176e-01, -3.6984e-01, -8.4478e-02, -4.3709e-02,  4.6672e-01,
         -3.7058e-01, -2.4390e-01, -2.2810e-01,  3.0399e-01,  3.8290e-01,
         -1.7547e-01,  1.1959e-01,  1.7013e-01,  4.9977e-01, -1.6558e-01,
         -1.4376e-01, -4.6532e-02, -1.0919e-01,  6.6810e-02, -2.1975e-01,
          5.8006e-01, -8.9207e-02,  8.7130e-01,  1.7534e-01, -2.1303e-01,
         -4.1316e-02, -1.7387e-01,  8.4670e-02, -2.7562e-02, -4.7840e-03,
          1.5939e-01, -3.4070e-01, -2.1452e-01,  5.3864e-02, -1.0746e-01,
          1.4123e-01,  2.2867e-01,  3.8490e-01,  1.1604e-01,  1.4238e-01,
         -1.1855e-01,  2.4458e-01,  4.9221e-03,  2.4361e-01,  3.2971e-02,
          1.6837e-01,  3.5076e-01,  3.3877e-01, -2.7871e-01,  2.7278e-01,
          2.2372e-01, -2.6900e-01, -3.0673e-01, -8.8388e-02, -3.2604e-01,
          5.2362e-01,  6.5078e-02, -4.6224e-01, -7.3499e-02, -6.0201e-01,
         -2.4137e-01, -6.5647e-01,  1.