In [36]:
from sentence_transformers import SentenceTransformer
import requests
import numpy as np
from tqdm import tqdm
import pandas as pd

embeding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [26]:
user_question = "I just discovered the course. Can I still join it?"
embedding = embeding_model.encode(user_question)
print(embedding[0])

0.078222655


In [8]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

print(len(documents))

948


In [9]:
print(documents[0])

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp', 'id': 'c02e79ef'}


In [10]:
course = 'machine-learning-zoomcamp'
ml_documents = [item for item in documents if item['course'] == course]
print(len(ml_documents))

375


In [22]:
embeddings = list()
for ml_doc in tqdm(ml_documents):
    qa_text = f"{ml_doc['question']} {ml_doc['text']}"
    embeddings.append(embeding_model.encode(qa_text))

X = np.array(embeddings)
print(X.shape)

100%|██████████| 375/375 [00:36<00:00, 10.31it/s]

(375, 768)





In [27]:
embedding_score = embedding.dot(embedding)
print(embedding_score)

0.9999999


In [30]:
scores = X.dot(embedding)
print(scores.max())

0.6506573


In [34]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings
        
    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argpartition(-scores, num_results - 1)[:num_results]
        return [self.documents[i] for i in idx]
    
search_engine = VectorSearchEngine(documents, X)
search_engine.search(embedding, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [None]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')