In [1]:
from fastembed import TextEmbedding
import numpy as np
import pprint
import requests
from qdrant_client import QdrantClient, models
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


1. Embedding the query

In [2]:
# Initialize the text embedding model with the specified pre-trained model
model = TextEmbedding(model_name='jinaai/jina-embeddings-v2-small-en')

# Define the input query to be embedded
query = 'I just discovered the course. Can I join now?'

# Generate the embedding for the query and extract the first (and only) embedding from the list
embedding1 = list(model.embed([query]))[0]

# Print the shape of the embedding vector
print(embedding1.shape)

# Print the minimum value in the embedding vector
print(round(np.min(embedding1), 2))

(512,)
-0.12


In [3]:
# Verify the normalization of the embedding to be 1.0
print(np.linalg.norm(embedding1))

# Check the cosine similarity of same embedding
print(embedding1.dot(embedding1))

1.0
1.0000000000000002


2. Cosine similarity with another vector

In [4]:
doc = 'Can I still join the course after the start date?'

# Generate the embedding for the query and extract the first (and only) embedding from the list
embedding2 = list(model.embed([doc]))[0]

# Check the cosine similarity between the two embeddings
print(round(embedding1.dot(embedding2), 1))

0.9


In [5]:
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

# Generate the embedding for the documents and extract the first (and only) embedding from the list
embeddings3 = list(model.embed([doc['text'] for doc in documents]))

3. Ranking by cosine

In [6]:
# Put the embeddings3 calculated in for each document to a vector V
V1 = np.array([embedding for embedding in embeddings3])

# Calculate the cosine similarity between each embedding and the query
cosine_similarities1 = V1.dot(embedding1)

# Find the index of the maximum cosine similarity
max_index1 = np.argmax(cosine_similarities1)

# Print the document with the maximum cosine similarity
print("The index of the maximum cosine similarity is: ", max_index1)
print("\nThe maximum cosine similarity is: ", round(cosine_similarities1[max_index1], 2))
print("\nThe most similar document is: ", documents[max_index1]['text'])

The index of the maximum cosine similarity is:  1

The maximum cosine similarity is:  0.82

The most similar document is:  Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.


In [7]:
# For each doc in documents, concatenate the question and text into a single string
full_text = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Generate the embedding for the full text and extract the first (and only) embedding from the list
embeddings4 = list(model.embed(full_text))

# Put the embeddings4 calculated in for each document to a vector V
V2 = np.array([embedding for embedding in embeddings4])

# Calculate the cosine similarity between each embedding and the query
cosine_similarities2 = V2.dot(embedding1)

# Find the index of the maximum cosine similarity
max_index2 = np.argmax(cosine_similarities2)

# Print the document with the maximum cosine similarity
print("The index of the maximum cosine similarity is: ", max_index2)
print("\nThe maximum cosine similarity is: ", round(cosine_similarities2[max_index2], 2))
print("\nThe most similar document is: ", documents[max_index2]['text'])

The index of the maximum cosine similarity is:  0

The maximum cosine similarity is:  0.85

The most similar document is:  Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


5. Selecting the embedding model

In [8]:
# List all available text embedding models
available_models = TextEmbedding.list_supported_models()

# Check the parameters of the first model
pprint.pprint(available_models[0])

{'additional_files': [],
 'description': 'Text embeddings, Unimodal (text), English, 512 input tokens '
                'truncation, Prefixes for queries/documents: necessary, 2023 '
                'year.',
 'dim': 768,
 'license': 'mit',
 'model': 'BAAI/bge-base-en',
 'model_file': 'model_optimized.onnx',
 'size_in_GB': 0.42,
 'sources': {'_deprecated_tar_struct': True,
             'hf': 'Qdrant/fast-bge-base-en',
             'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'},
 'tasks': {}}


In [9]:
# Find the smallest dimension
smallest_dim = min(model['dim'] for model in available_models)

# List all models with that dimension
smallest_models = [model['model'] for model in available_models if model['dim'] == smallest_dim]

print(f"The smallest dimension is {smallest_dim}")

print("\nModels with the smallest dimension:")
for m in smallest_models:
    print(m)

The smallest dimension is 384

Models with the smallest dimension:
BAAI/bge-small-en
BAAI/bge-small-en-v1.5
snowflake/snowflake-arctic-embed-xs
snowflake/snowflake-arctic-embed-s
sentence-transformers/all-MiniLM-L6-v2
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


6. Indexing with qdrant

Start a Qdrant vector database instance using Docker.

`docker pull qdrant/qdrant`

`docker run -p 6333:6333 -p 6334:6334 -v "$(pwd)/qdrant_storage:/qdrant/storage:z" qdrant/qdrant`

Parameters:
 - `6333`: Used for HTTP API
 - `6334`: Used for gRPC API
 - `-v "$(pwd)/qdrant_storage:/qdrant/storage:z"`: Mounts the `qdrant_storage` directory from your current working directory to the container's `/qdrant/storage` directory, allowing data persistence, the `:z` option is for SELinux compatibility

In [None]:
# Fetching new documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Check the number of documents
print(f"Number of documents: {len(documents)}")

# Check the first document
# print("\nFirst document:")
# pprint.pprint(documents[0])

collection_name = "zoomcamp-faq"
model_small="BAAI/bge-small-en"

# Create a client to connect to the Qdrant server
client = QdrantClient(url="http://localhost:6333")

# Delete the collection
# client.delete_collection(collection_name=collection_name)

# Create a collection
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=384,
        distance=models.Distance.COSINE
        ),
)

client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword",
)

# Add both question and answer fields
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_small)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

# Add the documents to the collection along with progress bar
for point in tqdm(points, desc="Adding documents to qdrant"):
    client.upsert(
        collection_name=collection_name,
        points=[point]
    )
 
# client.upsert(
#     collection_name=collection_name,
#     points=points
# )

Number of documents: 375


Adding documents to qdrant:   9%|▉         | 33/375 [00:02<00:20, 16.86it/s]

In [11]:
query_points = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_small 
        ),
        limit=1,
        with_payload=True
    )
    
print("Highest score: ", round(query_points.points[0].score, 2))
print("\nQuery: ", query)
print("\nAnswer: ", query_points.points[0].payload['text'])

Highest score:  0.87

Query:  I just discovered the course. Can I join now?

Answer:  Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.
