## Import Libraries

In [2]:
# Requires the sentence-transformers package; if it is missing, run:
#   %pip install sentence-transformers
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util

ModuleNotFoundError: No module named 'sentence_transformers'

## Model Selection and Initialization

In [2]:
# Name of the pretrained checkpoint loaded below. Other models optimized for
# semantic textual similarity are listed at:
# https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0
MODEL_NAME = 'stsb-roberta-large'

# Shared encoder instance; the cells below call model.encode() on it.
model = SentenceTransformer(MODEL_NAME)

## Calculate semantic similarity between two sentences

In [3]:
# The two sentences to compare.
sentence1 = "I like Python because I can build AI applications"
sentence2 = "I like Python because I can do data analytics"

# Embed each sentence with its own encode() call, keeping the results as tensors.
embedding1, embedding2 = (
    model.encode(text, convert_to_tensor=True) for text in (sentence1, sentence2)
)

# Cosine similarity between the two embeddings.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

# Report both inputs followed by the score as a plain Python number.
for label, text in (("Sentence 1:", sentence1), ("Sentence 2:", sentence2)):
    print(label, text)
print("Similarity score:", cosine_scores.item())

Sentence 1: I like Python because I can build AI applications
Sentence 2: I like Python because I can do data analytics
Similarity score: 0.8015280961990356


## Calculate semantic similarity between two lists of sentences

In [4]:
# Two lists of sentences; every sentence of the first list is compared with
# every sentence of the second.
sentences1 = [
    "I like Python because I can build AI applications",
    "The cat sits on the ground",
    "I always wanted to go to Japan, but I never had a chance.  Finally I went to Kyoto",
]
sentences2 = [
    "I like Python because I can do data analytics",
    "The cat walks on the sidewalk",
    "This is a picture of a favorite place on Mount Koya, near Kobe",
]

# Embed each list in a single encode() call, keeping the results as tensors.
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

# cosine_scores[i][j] is the score for sentences1[i] against sentences2[j].
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

# Print every (i, j) pair together with its score, separated by a blank line.
for i, first in enumerate(sentences1):
    for j, second in enumerate(sentences2):
        print("Sentence 1:", first)
        print("Sentence 2:", second)
        print("Similarity Score:", cosine_scores[i][j].item())
        print()

Sentence 1: I like Python because I can build AI applications
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.8015284538269043

Sentence 1: I like Python because I can build AI applications
Sentence 2: The cat walks on the sidewalk
Similarity Score: -0.03110978752374649

Sentence 1: I like Python because I can build AI applications
Sentence 2: This is a picture of a favorite place on Mount Koya, near Kobe
Similarity Score: 0.3189034163951874

Sentence 1: The cat sits on the ground
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.11328648030757904

Sentence 1: The cat sits on the ground
Sentence 2: The cat walks on the sidewalk
Similarity Score: 0.40381497144699097

Sentence 1: The cat sits on the ground
Sentence 2: This is a picture of a favorite place on Mount Koya, near Kobe
Similarity Score: 0.11604969948530197

Sentence 1: I always wanted to go to Japan, but I never had a chance.  Finally I went to Kyoto
Sentence 2: I like 

## Retrieve Top K most similar sentences from a corpus given a sentence

In [5]:
# Candidate sentences that a query will be ranked against.
corpus = [
    "I like Python because I can build AI applications",
    "I like Python because I can do data analytics",
    "The cat sits on the ground",
    "The cat walks on the sidewalk",
    "I always wanted to go to Japan, but I never had a chance.  Finally I went to Kyoto",
    "This is a picture of a favorite place on Mount Koya, near Kobe",
]

# Embed the whole corpus in one encode() call; corpus_embeddings is reused by
# the retrieval cell further down.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [6]:
# Query sentence; the next cell ranks the corpus sentences against it.
sentence = "Saicho went to Mount Hiei near Kyoto"

# Encode the query with the same model used for the corpus, returned as a
# tensor (convert_to_tensor=True) so it can be passed to util.pytorch_cos_sim.
sentence_embedding = model.encode(sentence, convert_to_tensor=True)

In [7]:
# Number of most-similar corpus sentences to report.
top_k = 2

# Cosine similarity of the query against every corpus sentence. The result is
# indexed with [0] because there is a single query embedding.
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

# Rank with torch.topk rather than np.argpartition: cos_scores is a torch
# tensor, and handing it to NumPy fails when the embeddings live on a GPU.
# torch.topk returns the scores and their indices sorted from highest to
# lowest. k is clamped so that a top_k larger than the corpus does not raise.
k = min(top_k, len(corpus))
top_scores, top_results = torch.topk(cos_scores, k=k)

print("Sentence:", sentence, "\n")
print("Top", k, "most similar sentences in corpus:")
# .tolist() yields plain Python floats/ints, safe for formatting and list indexing.
for score, idx in zip(top_scores.tolist(), top_results.tolist()):
    print(corpus[idx], "(Score: %.4f)" % score)

Sentence: Saicho went to Mount Hiei near Kyoto 

Top 2 most similar sentences in corpus:
This is a picture of a favorite place on Mount Koya, near Kobe (Score: 0.3764)
I always wanted to go to Japan, but I never had a chance.  Finally I went to Kyoto (Score: 0.2940)
