In [1]:
# Uncomment the next line if you are using Google Colab and the package is not installed yet.
# (%pip installs into the environment of the running kernel.)
# %pip install sentence-transformers

In [2]:
# Import pandas for tabular display, plus the SentenceTransformer class and the utility module from sentence_transformers.
import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Instantiate the sentence-embedding model from the all-MiniLM-L6-v2 checkpoint.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

2023-12-01 11:18:54.647148: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Example sentences that are tokenized and embedded in the following cells.
sentences = [
    "I love my dog.",
    "I love my family.",
    "My dog is a lab.",
]

In [4]:
# Split each sentence into tokens using the model's own tokenizer.
tokenized_documents = list(map(model.tokenizer.tokenize, sentences))

# Encode all sentences into their numerical embeddings with a single call.
embeddings = model.encode(sentences)

# Show every sentence next to the first ten components of its embedding.
for position, (text, vector) in enumerate(zip(sentences, embeddings), start=1):
    print(f"Sentence {position}: {text}")
    print(f"Embedding {position}: {vector[:10]}")
    print()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sentence 1: I love my dog.
Embedding 1: [-0.02576808  0.02608069  0.1241625   0.04487111 -0.04817442 -0.02053998
  0.07965573  0.03342929  0.07243421  0.01645789]

Sentence 2: I love my family.
Embedding 2: [-0.03410235  0.06401848  0.02506406 -0.02772886 -0.01960254 -0.00740547
  0.07258672  0.03830055  0.04374246 -0.04366305]

Sentence 3: My dog is a lab.
Embedding 3: [ 0.02185658 -0.00060472  0.09158508  0.03475889 -0.07893073 -0.0197475
 -0.04896924  0.03074822  0.01714233 -0.01477123]



In [5]:
# Compare the embeddings against themselves to get the pairwise cosine similarity matrix.
cosine_scores = util.cos_sim(a=embeddings, b=embeddings)
# Leave the matrix as the cell's last expression so the notebook displays it.
cosine_scores

tensor([[1.0000, 0.5691, 0.5745],
        [0.5691, 1.0000, 0.1410],
        [0.5745, 0.1410, 1.0000]])

In [6]:
# Collect every unordered sentence pair (row < col) together with its cosine similarity score.
n_sentences = len(cosine_scores)
pairs = [
    {'index': [row, col], 'score': cosine_scores[row][col]}
    for row in range(n_sentences - 1)
    for col in range(row + 1, n_sentences)
]

# Order the pairs from the highest score to the lowest.
pairs.sort(key=lambda entry: entry['score'], reverse=True)
print(pairs)

[{'index': [0, 2], 'score': tensor(0.5745)}, {'index': [0, 1], 'score': tensor(0.5691)}, {'index': [1, 2], 'score': tensor(0.1410)}]


In [7]:
# Report each sentence pair alongside its cosine similarity, rounded to four decimals.
for entry in pairs:
    first, second = entry['index']
    print(f" {sentences[first]} \t{sentences[second]} \t Score: {entry['score']:.4f}")

 I love my dog. 	My dog is a lab. 	 Score: 0.5745
 I love my dog. 	I love my family. 	 Score: 0.5691
 I love my family. 	My dog is a lab. 	 Score: 0.1410


In [8]:
# Build a labelled table of the cosine similarity scores.
# pandas (pd) is imported in the notebook's import cell at the top.

# Derive the labels from the size of the score matrix instead of hard-coding
# three names, so the table still builds when the sentence list changes.
labels = [f'Sentence {k}' for k in range(1, len(cosine_scores) + 1)]

# Rows and columns share the same labels because the matrix compares the
# embeddings with themselves.
similarity_df = pd.DataFrame(cosine_scores, columns=labels, index=labels)
similarity_df

Unnamed: 0,Sentence 1,Sentence 2,Sentence 3
Sentence 1,1.0,0.569144,0.574517
Sentence 2,0.569144,1.0,0.140978
Sentence 3,0.574517,0.140978,1.0
