## Import Libraries

In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

## Model Selection and Initialization

In [2]:
# Catalogue of pretrained checkpoints tuned for semantic textual similarity:
# https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0
# Load the 'stsb-roberta-large' checkpoint; `model` is used by the cells below.
model = SentenceTransformer(model_name_or_path='stsb-roberta-large')

Downloading: 100%|██████████| 748/748 [00:00<00:00, 180kB/s]
Downloading: 100%|██████████| 3.92k/3.92k [00:00<00:00, 1.21MB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 566B/s]
Downloading: 100%|██████████| 674/674 [00:00<00:00, 467kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 26.0kB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 897kB/s] 
Downloading: 100%|██████████| 229/229 [00:00<00:00, 54.2kB/s]
Downloading: 100%|██████████| 1.42G/1.42G [00:25<00:00, 56.3MB/s]
Downloading: 100%|██████████| 52.0/52.0 [00:00<00:00, 6.84kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 104kB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.91MB/s]
Downloading: 100%|██████████| 1.17k/1.17k [00:00<00:00, 182kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 1.36MB/s]
Downloading: 100%|██████████| 191/191 [00:00<00:00, 28.0kB/s]


## Calculate semantic similarity between two sentences

In [5]:
sentence1 = "After an impressive game, the home team was unable to qualify."
sentence2 = "Soccer"

# Embed each sentence with its own encode() call, keeping the results as tensors.
embedding1, embedding2 = (
    model.encode(text, convert_to_tensor=True)
    for text in (sentence1, sentence2)
)

# Cosine similarity between the two embeddings.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

# Read the single score out as a plain Python number before printing.
similarity_value = cosine_scores.item()

print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Similarity score:", similarity_value)

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Sentence 1: After an impressive soccer game, the home team was unable to qualify.
Sentence 2: Soccer
Similarity score: 0.347704142332077


## Calculate semantic similarity between two lists of sentences

In [48]:
sentences1 = [
    "I like Python because I can build AI applications",
    "The cat sits on the ground",
]
sentences2 = [
    "I like Python because I can do data analytics",
    "The cat walks on the sidewalk",
]

# Embed each list with a single encode() call, keeping the results as tensors.
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

# Pairwise scores: entry [i][j] compares sentences1[i] with sentences2[j].
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

# Report every (sentences1, sentences2) combination with its score.
for i, first in enumerate(sentences1):
    for j, second in enumerate(sentences2):
        print("Sentence 1:", first)
        print("Sentence 2:", second)
        print("Similarity Score:", cosine_scores[i][j].item())
        print()

Sentence 1: I like Python because I can build AI applications
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.8188023567199707

Sentence 1: I like Python because I can build AI applications
Sentence 2: The cat walks on the sidewalk
Similarity Score: -0.06005367636680603

Sentence 1: The cat sits on the ground
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.12721936404705048

Sentence 1: The cat sits on the ground
Sentence 2: The cat walks on the sidewalk
Similarity Score: 0.4131842255592346



## Retrieve Top K most similar sentences from a corpus given a sentence

In [37]:
corpus = [
    "I like Python because I can build AI applications",
    "I like Python because I can do data analytics",
    "The cat sits on the ground",
    "The cat walks on the sidewalk",
]

# Embed the whole corpus in one call; the top-k cell below reuses the result.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [38]:
sentence = "I like Javascript because I can build web applications"

# Embed the query sentence as a tensor so it can be scored against the corpus.
sentence_embedding = model.encode(
    sentence,
    convert_to_tensor=True,
)

In [50]:
# top_k results to return
top_k=2

# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

# Sort the results in decreasing order and get the first top_k
top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

print("Sentence:", sentence, "\n")
print("Top", top_k, "most similar sentences in corpus:")
for idx in top_results[0:top_k]:
    print(corpus[idx], "(Score: %.4f)" % (cos_scores[idx]))

Sentence: I like Javascript because I can build web applications 

Top 2 most similar sentences in corpus:
I like Python because I can build AI applications (Score: 0.6253)
I like Python because I can do data analytics (Score: 0.5348)


---

# BERT

In [6]:
from sentence_transformers import SentenceTransformer

# Rebind `model` to the 'bert-base-nli-mean-tokens' checkpoint for this section.
model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

Downloading: 100%|██████████| 391/391 [00:00<00:00, 205kB/s]
Downloading: 100%|██████████| 3.95k/3.95k [00:00<00:00, 809kB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 451B/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 102kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 20.2kB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 54.1kB/s]
Downloading: 100%|██████████| 438M/438M [00:08<00:00, 54.0MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 7.01kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 15.3kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 741kB/s]  
Downloading: 100%|██████████| 399/399 [00:00<00:00, 100kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 661kB/s] 
Downloading: 100%|██████████| 190/190 [00:00<00:00, 41.9kB/s]


In [16]:
# The two texts to compare, kept both as a list and as individual names.
sentences = [
    "After an impressive game, the home team was unable to qualify.",
    "Soccer",
]
sentence1, sentence2 = sentences

In [17]:
# Encode both sentences in one call; the result is indexed per sentence below.
sentence_embeddings = model.encode(sentences=sentences)

Ignored unknown kwarg option direction


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Score the first embedding against the remaining ones; slicing with [:1]
# keeps the first embedding wrapped as a one-row batch.
cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:])

array([[0.18241435]], dtype=float32)

---

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch

In [21]:
# The tokenizer and the transformer weights are loaded from the same checkpoint.
checkpoint = 'sentence-transformers/bert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Downloading: 100%|██████████| 399/399 [00:00<00:00, 368kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 176kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 482kB/s]  
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 810kB/s]  
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 343B/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 18.4kB/s]
Downloading: 100%|██████████| 418M/418M [00:06<00:00, 64.2MB/s] 


In [22]:
sentence1 = "After an impressive game, the home team was unable to qualify."
sentence2 = "Soccer"

sentences = [sentence1, sentence2]

# Tokenize the whole batch in a single call. Every sentence is truncated or
# padded to exactly 128 tokens, so the tokenizer can return stacked tensors
# directly. Dropping the per-sentence loop also avoids rebinding the
# notebook-level name `sentence`, which an earlier cell uses for the query text.
encoded = tokenizer(sentences, max_length=128,
                    truncation=True, padding='max_length',
                    return_tensors='pt')

# Keep only the two entries that the following cells pass to the model and
# use for pooling.
tokens = {'input_ids': encoded['input_ids'],
          'attention_mask': encoded['attention_mask']}

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction


In [23]:
# Inference only: torch.no_grad() skips building the autograd graph, which
# nothing in this notebook uses.
with torch.no_grad():
    outputs = model(**tokens)

# Show which fields the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [24]:
# Pull the `last_hidden_state` entry out of the model output and display it.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-0.1639,  0.0275,  1.2470,  ..., -0.2369, -0.1089,  0.2385],
         [-0.1594, -0.1702,  1.2940,  ..., -0.3311, -0.3406, -0.2195],
         [-0.2658, -0.3155,  1.0049,  ..., -0.0704, -0.3765,  0.1148],
         ...,
         [-0.3088,  0.0668,  0.6455,  ..., -0.0346, -0.2720, -0.1935],
         [-0.4527,  0.0374,  0.7982,  ..., -0.0596, -0.4605, -0.0555],
         [-0.4068, -0.0271,  0.8607,  ...,  0.0413, -0.4610, -0.0447]],

        [[-0.1715, -0.3728,  1.0104,  ...,  0.7672,  0.1038,  0.6650],
         [ 0.0963, -0.2661,  0.7266,  ...,  0.3612, -0.2968,  0.5426],
         [ 0.2132, -0.2210,  1.4766,  ...,  1.0731, -0.2669,  0.4364],
         ...,
         [-0.2084, -0.8132,  0.9243,  ...,  0.7807,  0.0150,  0.5773],
         [-0.2169, -0.7170,  0.9179,  ...,  0.6110, -0.0121,  0.7616],
         [-0.1951, -0.7552,  0.9375,  ...,  0.6177, -0.0341,  0.7321]]],
       grad_fn=<NativeLayerNormBackward0>)

In [25]:
# Attention mask produced during tokenization; display its dimensions.
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([2, 128])

In [28]:
# Append a trailing axis to the attention mask, broadcast it to the shape of
# `embeddings`, and cast to float so it can be multiplied element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([2, 128, 768])

In [29]:
# Element-wise product: positions where the mask is 0 become 0.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()

torch.Size([2, 128, 768])

In [31]:
# Add up the masked embeddings along axis 1.
summed = masked_embeddings.sum(dim=1)
summed.size()

torch.Size([2, 768])

In [32]:
# Sum the mask along axis 1 to get the divisor for the mean; the lower bound
# of 1e-9 keeps the later division from dividing by zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([2, 768])

In [34]:
# Mean pooling: masked sum divided by the summed mask.
mean_pooled = torch.div(summed, summed_mask)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.18241435]], dtype=float32)