**Importing the required Python libraries**

In [None]:
import numpy as np

**Custom function for the jaccard similarity metric between two lists of words.**

In [None]:
# Custom function for jaccard similarity
def jaccard_similarity(list1, list2):
  """Compute the Jaccard similarity of two collections of items.

  Both arguments are converted to sets, so duplicate items and item order
  have no effect on the result.

  Args:
    list1: First iterable of hashable items (e.g. the words of a sentence).
    list2: Second iterable of hashable items.

  Returns:
    The size of the intersection divided by the size of the union, as a
    float. Returns 0 when the union is empty (both inputs are empty).
  """
  first_items, second_items = set(list1), set(list2)
  combined = first_items | second_items
  if not combined:
    # Nothing to compare: the ratio is undefined, so report 0.
    return 0
  shared = first_items & second_items
  return len(shared) / float(len(combined))

**Custom function for the cosine similarity metric between two encodings.**

In [None]:
# Custom function for cosine similarity
def cosine_similarity(encoding1, encoding2):
  """Return the cosine similarity between two vectors.

  Args:
    encoding1: First vector; any array-like accepted by np.dot and
      np.linalg.norm.
    encoding2: Second vector, with a shape compatible with encoding1 for
      np.dot.

  Returns:
    The dot product of the two vectors divided by the product of their
    norms, or 0 when that product of norms is zero.
  """
  # Compute the product of the norms once: it is needed both for the
  # zero check and as the divisor.
  norm_product = np.linalg.norm(encoding1) * np.linalg.norm(encoding2)
  if norm_product == 0:
    # At least one vector has zero norm; avoid dividing by zero.
    return 0
  return np.dot(encoding1, encoding2) / norm_product

**Installing the Sentence Transformers library and loading the pretrained `all-MiniLM-L6-v2` model.**

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Example sentences: the first three are about the same topic, the last two are unrelated.
sentences = [
    "The bottle is empty",
    "There is nothing in the bottle",
    "The bottle is empty empty",
    "David Beckham was playing at Manchester United",
    "Rolling Stones is a very-well known group playing rock music",
]

# NOTE: despite its name, this list holds the model's encoding of each
# lower-cased sentence (one entry per sentence), not similarity scores.
similarity_scores = [similarity_model.encode(text.lower()) for text in sentences]

print(*sentences)

# (description, index of first sentence, index of second sentence) for each comparison.
compared_pairs = [
    ("first two", 0, 1),
    ("second and third", 1, 2),
    ("fourth and fifth", 3, 4),
]
for label, i, j in compared_pairs:
    # Cosine similarity works on the encodings; Jaccard works on the lower-cased word lists.
    print(f"Cosine similarity score between the {label} sentences:",
          cosine_similarity(similarity_scores[i], similarity_scores[j]))
    print(f"Jaccard similarity score between the {label} sentences:",
          jaccard_similarity(sentences[i].lower().split(), sentences[j].lower().split()))

The bottle is empty There is nothing in the bottle The bottle is empty empty David Beckham was playing at Manchester United Rolling Stones is a very-well known group playing rock music
Cosine similarity score between the first two sentences: 0.6590526
Jaccard similarity score between the first two sentences: 0.42857142857142855
Cosine similarity score between the second and third sentences: 0.63779795
Jaccard similarity score between the second and third sentences: 0.42857142857142855
Cosine similarity score between the fourth and fifth sentences: 0.1868724
Jaccard similarity score between the fourth and fifth sentences: 0.0625
