In [1]:
from fastembed import TextEmbedding
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


1. Embedding the query

In [2]:
# Initialize the text embedding model with the specified pre-trained model
model = TextEmbedding(model='jinaai/jina-embeddings-v2-small-en')

# Define the input query to be embedded
query = 'I just discovered the course. Can I join now?'

# Generate the embedding for the query and extract the first (and only) embedding from the list
embedding1 = list(model.embed([query]))[0]

# Print the shape of the embedding vector
print(embedding1.shape)

# Print the minimum value in the embedding vector
print(np.min(embedding1))

(384,)
-0.28601003


In [3]:
# Verify the normalization of the embedding to be 1.0
print(np.linalg.norm(embedding1))

# Check the cosine similarity of same embedding
print(embedding1.dot(embedding1))

1.0
1.0


2. Cosine similarity with another vector

In [4]:
doc = 'Can I still join the course after the start date?'

# Generate the embedding for the query and extract the first (and only) embedding from the list
embedding2 = list(model.embed([doc]))[0]

# Check the cosine similarity between the two embeddings
print(round(embedding1.dot(embedding2), 1))

0.9


In [5]:
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

# Generate the embedding for the documents and extract the first (and only) embedding from the list
embeddings3 = list(model.embed([doc['text'] for doc in documents]))

# Print the shape of the embedding vector
print(embeddings3[0].shape)

(384,)


3. Ranking by cosine

In [6]:
# Put the embeddings3 calculated in for each document to a vector V
V1 = np.array([embedding for embedding in embeddings3])

# Calculate the cosine similarity between each embedding and the query
cosine_similarities1 = V1.dot(embedding1)

# Find the index of the maximum cosine similarity
max_index1 = np.argmax(cosine_similarities1)

# Print the document with the maximum cosine similarity
print("The index of the maximum cosine similarity is: ", max_index1)
print("The maximum cosine similarity is: ", cosine_similarities1[max_index1])
print("The most similar document is: ", documents[max_index1]['text'])

The index of the maximum cosine similarity is:  2
The maximum cosine similarity is:  0.69218606
The most similar document is:  The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.


In [7]:
# For each doc in documents, concatenate the question and text into a single string
full_text = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Generate the embedding for the full text and extract the first (and only) embedding from the list
embeddings4 = list(model.embed(full_text))

# Put the embeddings4 calculated in for each document to a vector V
V2 = np.array([embedding for embedding in embeddings4])

# Calculate the cosine similarity between each embedding and the query
cosine_similarities2 = V2.dot(embedding1)

# Find the index of the maximum cosine similarity
max_index2 = np.argmax(cosine_similarities2)

# Print the document with the maximum cosine similarity
print("The index of the maximum cosine similarity is: ", max_index2)
print("The maximum cosine similarity is: ", cosine_similarities2[max_index2])
print("The most similar document is: ", documents[max_index2]['text'])

The index of the maximum cosine similarity is:  2
The maximum cosine similarity is:  0.7002325
The most similar document is:  The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.


5. Selecting the embedding model