In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer


In [6]:
model=SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
docs=[
    "The dog jumped over the cat",
    "The cat jumped over the dog",
    "It is very warm today",
    "The cat is yellow and the dog is red",
]

#Now we want to make our data more complex by adding Ids to each row, the idea is we want to work towards the actual database
#It is not efficient to query the entire database get all of the text that is in there and then embedd all the text, all at once 
#It will be a lot efficient to get each row and then embedd it based of the row itself

In [8]:
#Lets treat these docs index as ids like
docs[0] #id=0

'The dog jumped over the cat'

In [16]:
documents=[]
for i,x in enumerate(docs):
    row={
        "index":i,
        "data": x
    }
    documents.append(row)
documents

[{'index': 0, 'data': 'The dog jumped over the cat'},
 {'index': 1, 'data': 'The cat jumped over the dog'},
 {'index': 2, 'data': 'It is very warm today'},
 {'index': 3, 'data': 'The cat is yellow and the dog is red'}]

In [22]:
documents_embeddings=[]
for x in documents:
    embed=model.encode(x['data'])
    final_data=(x['index'], embed)
    documents_embeddings.append(final_data)
documents_embeddings

[(0,
  array([ 5.44742681e-02,  3.71413939e-02,  7.23646730e-02,  7.01895282e-02,
         -5.60809486e-03, -2.30733934e-03, -3.77298258e-02,  1.51364673e-02,
          9.50727332e-03, -3.03880690e-04,  5.88888340e-02,  2.10996643e-02,
          6.14683237e-03, -6.37330264e-02,  1.38773406e-02, -9.31996852e-03,
         -1.19946659e-01, -2.11702078e-03,  6.35645986e-02, -2.08780952e-02,
         -2.80396491e-02, -3.85089107e-02,  2.66369572e-03, -5.36900461e-02,
         -2.24581920e-02,  4.27320749e-02, -6.99243546e-02, -6.33601397e-02,
          1.81462429e-02, -1.30238263e-02, -1.66832637e-02, -1.05505055e-02,
         -3.54123525e-02,  5.32196090e-02, -6.18179850e-02, -6.84991479e-02,
          6.55241907e-02, -1.77629961e-04,  4.45996635e-02,  1.20984375e-01,
         -4.50547561e-02,  1.97268557e-02, -1.27502801e-02,  1.16578233e-03,
         -3.26760113e-02,  6.10489696e-02, -1.04520563e-02, -6.91033751e-02,
          3.60625461e-02,  4.39831875e-02, -1.22471601e-02,  9.17075872

In [33]:
query="The cat is yellow and the dog is purple"
query_embedding=model.encode(query)


In [32]:
results=[]
for doc in documents_embeddings:
    doc_id=doc[0]
    doc_embeddings=doc[1]
    similarity=model.similarity(doc_embeddings, query_embedding)
    results.append((doc_id,similarity))
results

[(0, tensor([[0.5122]])),
 (1, tensor([[0.5164]])),
 (2, tensor([[0.0782]])),
 (3, tensor([[0.9099]]))]