In [None]:
import requests
import pandas as pd
import numpy as np
import minsearch

In [20]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [21]:
from tqdm.auto import tqdm

def hit_rate(relevance_total): # Define hitrate evaluation function
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Answer:1 

In [9]:
# Create the Minsearch index with specified text and keyword fields
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x183db92f250>

In [10]:
# Define boosting parameters as specified
boost = {'question': 1.5, 'section': 0.1}

In [11]:
# Define the search function that uses boosting and filters by course
def minsearch_search(query, course):
    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )
    return results

In [12]:
# Evaluate hitrate over all queries in ground truth
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [13]:
hit = hit_rate(relevance_total)
print("Minsearch hitrate (with boost):", hit)

Minsearch hitrate (with boost): 0.848714069591528


### Answer:2

In [21]:
import requests
import pandas as pd
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
import numpy as np
import minsearch  # Your minsearch.py containing Index class

# Load data
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
documents = requests.get(url_prefix + 'search_evaluation/documents-with-ids.json').json()
df_ground_truth = pd.read_csv(url_prefix + 'search_evaluation/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

# Create embeddings
texts = [doc['question'] for doc in documents]
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Build index and inject embeddings
index = minsearch.Index(text_fields=[], keyword_fields=['course', 'id'])
index.docs = documents
index.keyword_df = pd.DataFrame({field: [doc.get(field, '') for doc in documents] for field in index.keyword_fields})
index.embeddings = X  # Add embeddings attribute

def vector_search(idx, query_vec, filter_dict={}, num_results=5):
    def cosine_similarity(a, b):
        a_norm = np.linalg.norm(a)
        b_norms = np.linalg.norm(b, axis=1)
        return np.dot(b, a) / (b_norms * a_norm + 1e-10)

    if hasattr(query_vec, 'toarray'):
        qvec = query_vec.toarray().flatten()
    else:
        qvec = query_vec.flatten()

    sims = cosine_similarity(qvec, idx.embeddings)

    mask = np.ones(len(idx.docs), dtype=bool)
    for field, val in filter_dict.items():
        if field in idx.keyword_fields:
            arr = np.array(idx.keyword_df[field])
            mask = mask & (arr == val)

    sims = sims * mask

    top_indices = np.argpartition(sims, -num_results)[-num_results:]
    top_indices = top_indices[np.argsort(-sims[top_indices])]

    results = [idx.docs[i] for i in top_indices if sims[i] > 0]
    if not results:
        # if no documents with positive similarity, fall back to top ranked regardless of positive score
        results = [idx.docs[i] for i in top_indices]
    return results

# Search function wrapped for evaluation
def search_fn(q):
    query_vec = pipeline.transform([q['question']])
    return vector_search(index, query_vec, filter_dict={'course': q['course']}, num_results=5)

# MRR evaluation
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score += 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

# Evaluate MRR
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = search_fn(q)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

score = mrr(relevance_total)
print("MRR (Vector Search, question field):", score)


  0%|          | 0/4627 [00:00<?, ?it/s]

MRR (Vector Search, question field): 0.35673582594913916


### Answer:3


In [None]:
# Combine question and text for document embeddings
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Build the index
index = minsearch.Index(text_fields=[], keyword_fields=['course', 'id'])
index.docs = documents
index.keyword_df = pd.DataFrame({field: [doc.get(field, '') for doc in documents] for field in index.keyword_fields})
index.embeddings = X

def vector_search(index, query_vec, filter_dict={}, num_results=5):
    def cosine_similarity(a, b):
        a_norm = np.linalg.norm(a)
        b_norms = np.linalg.norm(b, axis=1)
        return np.dot(b, a) / (b_norms * a_norm + 1e-10)

    if hasattr(query_vec, 'toarray'):
        qvec = query_vec.toarray().flatten()
    else:
        qvec = query_vec.flatten()
    sims = cosine_similarity(qvec, index.embeddings)

    mask = np.ones(len(index.docs), dtype=bool)
    for field, val in filter_dict.items():
        if field in index.keyword_fields:
            arr = np.array(index.keyword_df[field])
            mask = mask & (arr == val)

    sims = sims * mask
    top_indices = np.argpartition(sims, -num_results)[-num_results:]
    top_indices = top_indices[np.argsort(-sims[top_indices])]
    results = [index.docs[i] for i in top_indices if sims[i] > 0]
    if not results:
        results = [index.docs[i] for i in top_indices]
    return results

def vector_search_question_text(q):
    query_text = q['question']
    course_filter = q['course']
    query_vec = pipeline.transform([query_text])
    return vector_search(index, query_vec, filter_dict={'course': course_filter}, num_results=5)

def hit_rate(relevance_total):
    return sum(True in line for line in relevance_total) / len(relevance_total)

relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = vector_search_question_text(q)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

hit = hit_rate(relevance_total)
print("Hitrate (Vector Search question + text):", hit)


  0%|          | 0/4627 [00:00<?, ?it/s]

Hitrate (Vector Search question + text): 0.8210503566025502


### Answer:4

In [None]:
pip install qdrant-client jinaai fastembed tqdm pandas requests

Collecting jinaai
  Downloading jinaai-0.2.10-py3-none-any.whl.metadata (17 kB)
Downloading jinaai-0.2.10-py3-none-any.whl (16 kB)
Installing collected packages: jinaai
Successfully installed jinaai-0.2.10
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter, Distance, VectorParams, PointStruct, FieldCondition, MatchValue
from fastembed import TextEmbedding

In [7]:
# --- config ---

DIM = 512
model_handle = 'jinaai/jina-embeddings-v2-small-en'
embedder = TextEmbedding(model_name=model_handle)
limit = 5
collection_name = "faq_retrieval"

In [8]:
qd_client = QdrantClient("http://localhost:6333")

In [10]:
#qd_client.delete_collection(collection_name = collection_name)

In [12]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=DIM,
        distance=models.Distance.COSINE
    )
)

True

In [14]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [29]:
import requests
import pandas as pd
from tqdm.auto import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue


# Configuration
collection_name = "faq_retrieval"
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5
dim = 512  # Embedding dimension for the model, verify if needed

# Load data
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'

documents = requests.get(docs_url).json()
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

# Initialize Qdrant client, connecting to local or remote Qdrant instance
qd_client = QdrantClient(host="localhost", port=6333)  # Change host/port as needed

# Recreate collection to clear old data if needed
qd_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=dim, distance=Distance.COSINE)
)

# Initialize embedder
embedder = TextEmbedding(model_name=model_handle)

# Embed and prepare points to upsert — use real document IDs (strings), not int indices
points = []
for doc in documents:
    text = doc['question'] + ' ' + doc['text']
    vector = next(embedder.embed(text))  # embedding must be a numpy array or list
    point = PointStruct(
        id=doc['id'],  # Use document's real ID (string)
        vector=vector,
        payload=doc
    )
    points.append(point)

# Upload points to Qdrant
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

# Define search function
def qdrant_search(query, course):
    query_vec = next(embedder.embed(query["question"])).tolist()
    hits = qd_client.search(
        collection_name=collection_name,
        query_vector=query_vec,
        limit=limit,
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="course",
                    match=MatchValue(value=course)
                )
            ]
        ),
        with_payload=True
    )
    result_docs = []
    for hit in hits:
        doc_id = hit.id
        # Find document by ID
        doc = next((d for d in documents if d["id"] == doc_id), None)
        if doc:
            result_docs.append(doc)
    return result_docs

# Define MRR metric
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank, rel in enumerate(line):
            if rel:
                total_score += 1 / (rank +1)
                break
    return total_score / len(relevance_total)

# Evaluate over ground truth
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q["document"]
    course = q["course"]
    results = qdrant_search(q, course)
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

score = mrr(relevance_total)
print("MRR (Qdrant + jinaai/jina-embeddings-v2-small-en):", score)


  qd_client.recreate_collection(


UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Format error in JSON body: value c02e79ef is not a valid point ID, valid values are either an unsigned integer or a UUID"},"time":0.0}'