In [None]:
import requests
import pandas as pd
import numpy as np
import minsearch

In [20]:
# Download the evaluation corpus (documents carrying an "id" field) and the
# ground-truth table that pairs each generated question with its source document.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting .json()
# fail later with an unrelated-looking decode error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
# One dict per ground-truth row; later cells read its 'question', 'course' and 'document' keys.
ground_truth = df_ground_truth.to_dict(orient='records')

In [21]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the share of queries whose relevance list contains ``True``.

    Args:
        relevance_total: sequence of per-query relevance lists (one flag per
            returned result).

    Returns:
        Number of lists containing ``True`` divided by the number of lists.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Compute the Mean Reciprocal Rank over per-query relevance lists.

    For each query only the FIRST relevant result contributes ``1 / rank``
    (rank is 1-based); queries with no relevant result contribute 0.

    Args:
        relevance_total: sequence of per-query relevance lists (one truthy/falsy
            flag per returned result, best result first).

    Returns:
        The average reciprocal rank, always within ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit: without this, a line with several
                # relevant entries would add several terms and push the
                # metric above 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth query and score the results.

    Args:
        ground_truth: iterable of dicts; each must provide a 'document' key
            holding the id of the expected document.
        search_function: callable taking one ground-truth dict and returning an
            iterable of result dicts that each carry an 'id' key.

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

### Answer:1 

In [9]:
# Build the Minsearch index. The text fields are the ones the boost dict below
# refers to; the keyword fields are the ones usable in filter_dict.
text_fields = ["question", "section", "text"]
keyword_fields = ["course", "id"]
index = minsearch.Index(text_fields=text_fields, keyword_fields=keyword_fields)

index.fit(documents)

<minsearch.Index at 0x183db92f250>

In [10]:
# Per-field boost weights handed to index.search(); fields not listed here
# keep whatever default weight minsearch applies.
boost = dict(question=1.5, section=0.1)

In [11]:
def minsearch_search(query, course):
    """Search the module-level Minsearch ``index`` for ``query`` within one course.

    Args:
        query: free-text query string.
        course: value the 'course' keyword field must match.

    Returns:
        Whatever ``index.search`` returns for the 5 best results, using the
        module-level ``boost`` weights.
    """
    course_filter = {'course': course}
    return index.search(
        query=query,
        filter_dict=course_filter,
        boost_dict=boost,
        num_results=5,
    )

In [12]:
# For every ground-truth question, record which of the returned documents
# (if any) is the expected one. relevance_total feeds the metric cell below.
relevance_total = []
for query in tqdm(ground_truth):
    expected_id = query['document']
    hits = minsearch_search(query=query['question'], course=query['course'])
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [13]:
# Share of questions whose expected document appears among the returned results.
hit = hit_rate(relevance_total)
print(f"Minsearch hitrate (with boost): {hit}")

Minsearch hitrate (with boost): 0.848714069591528


### Answer:2

In [21]:
import requests
import pandas as pd
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
import numpy as np
import minsearch  # Your minsearch.py containing Index class

# Load data
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
# Use a timeout so a stalled connection cannot hang the cell, and fail loudly on
# HTTP errors instead of letting .json() choke on an error page.
docs_response = requests.get(url_prefix + 'search_evaluation/documents-with-ids.json', timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()
df_ground_truth = pd.read_csv(url_prefix + 'search_evaluation/ground-truth-data.csv')
# One dict per ground-truth row ('question', 'course', 'document' are read below).
ground_truth = df_ground_truth.to_dict(orient='records')

# Embed each document's question: TF-IDF features reduced to 128 dimensions
# with a seeded truncated SVD so the run is reproducible.
texts = [doc['question'] for doc in documents]
tfidf_step = TfidfVectorizer(min_df=3)
svd_step = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf_step, svd_step)
X = pipeline.fit_transform(texts)

# Build an Index shell without calling fit(): the attributes that
# vector_search() reads (docs, keyword_df, embeddings) are assigned by hand.
index = minsearch.Index(text_fields=[], keyword_fields=['course', 'id'])
index.docs = documents
keyword_columns = {}
for field in index.keyword_fields:
    # Missing keys become '' so every column has one entry per document.
    keyword_columns[field] = [doc.get(field, '') for doc in documents]
index.keyword_df = pd.DataFrame(keyword_columns)
index.embeddings = X  # row i is the embedding of documents[i]

def vector_search(idx, query_vec, filter_dict=None, num_results=5):
    """Rank ``idx.docs`` by cosine similarity to ``query_vec``, honouring keyword filters.

    Args:
        idx: object exposing ``docs``, ``embeddings`` (one row per doc),
            ``keyword_fields`` and ``keyword_df`` (indexable by field name).
        query_vec: query embedding; a dense array or anything with
            ``toarray()``. It is flattened to 1-D before scoring.
        filter_dict: optional ``{field: value}`` equality filters. Fields not
            listed in ``idx.keyword_fields`` are ignored.
        num_results: maximum number of documents to return.

    Returns:
        Up to ``num_results`` documents, best first. Documents with positive
        similarity are preferred; if none has positive similarity, the
        best-scoring documents that still satisfy the filter are returned.
        Documents failing the filter are never returned. An empty list is
        returned when nothing matches or ``num_results <= 0``.
    """
    def cosine_similarity(a, b):
        a_norm = np.linalg.norm(a)
        b_norms = np.linalg.norm(b, axis=1)
        # The small epsilon avoids division by zero for all-zero vectors.
        return np.dot(b, a) / (b_norms * a_norm + 1e-10)

    if hasattr(query_vec, 'toarray'):
        qvec = query_vec.toarray().flatten()
    else:
        qvec = query_vec.flatten()

    sims = cosine_similarity(qvec, idx.embeddings)

    mask = np.ones(len(idx.docs), dtype=bool)
    for field, val in (filter_dict or {}).items():
        if field in idx.keyword_fields:
            arr = np.array(idx.keyword_df[field])
            mask = mask & (arr == val)

    # Exclude filtered-out docs with -inf rather than multiplying by the mask:
    # a zeroed score would outrank in-filter docs whose cosine is negative and
    # could leak documents from the wrong course into the fallback results.
    scores = np.where(mask, sims, -np.inf)

    # Never ask for more candidates than there are matching documents; this also
    # keeps argpartition in range and makes num_results <= 0 return nothing.
    k = min(num_results, int(mask.sum()))
    if k <= 0:
        return []

    top_indices = np.argpartition(scores, -k)[-k:]
    top_indices = top_indices[np.argsort(-scores[top_indices])]

    results = [idx.docs[i] for i in top_indices if scores[i] > 0]
    if not results:
        # No positive similarity: fall back to the best filter-matching docs.
        results = [idx.docs[i] for i in top_indices]
    return results

def search_fn(q):
    """Adapter for the evaluation loop: embed one ground-truth row and search.

    Args:
        q: ground-truth dict providing 'question' and 'course'.

    Returns:
        The 5 best results from ``vector_search`` restricted to the row's course.
    """
    query_vec = pipeline.transform([q['question']])
    course_filter = {'course': q['course']}
    return vector_search(index, query_vec, filter_dict=course_filter, num_results=5)

def mrr(relevance_total):
    """Mean Reciprocal Rank: average of ``1 / rank`` of the first relevant result.

    Queries without any relevant result contribute 0. Ranks are 1-based.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    score_sum = 0.0
    for flags in relevance_total:
        # Position (1-based) of the first truthy flag, or None when there is none.
        first_hit = next((pos for pos, flag in enumerate(flags, start=1) if flag), None)
        if first_hit is not None:
            score_sum += 1 / first_hit
    return score_sum / len(relevance_total)

# Score the question-only vector search: build one relevance list per
# ground-truth row, then reduce them to a single MRR value.
relevance_total = []
for query in tqdm(ground_truth):
    expected_id = query['document']
    hits = search_fn(query)
    relevance_total.append([hit['id'] == expected_id for hit in hits])

score = mrr(relevance_total)
print(f"MRR (Vector Search, question field): {score}")


  0%|          | 0/4627 [00:00<?, ?it/s]

MRR (Vector Search, question field): 0.35673582594913916


### Answer:3


In [None]:
# Embed question and answer text together (space-joined) with the same
# TF-IDF + seeded 128-component SVD recipe used for the question-only run.
texts = [' '.join((doc['question'], doc['text'])) for doc in documents]

tfidf_step = TfidfVectorizer(min_df=3)
svd_step = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf_step, svd_step)
X = pipeline.fit_transform(texts)

# Rebuild the Index shell for the new embeddings; as before, the attributes
# read by vector_search() are assigned directly instead of calling fit().
index = minsearch.Index(text_fields=[], keyword_fields=['course', 'id'])
index.docs = documents
keyword_columns = {}
for field in index.keyword_fields:
    # Missing keys become '' so every column has one entry per document.
    keyword_columns[field] = [doc.get(field, '') for doc in documents]
index.keyword_df = pd.DataFrame(keyword_columns)
index.embeddings = X  # row i is the embedding of documents[i]

def vector_search(index, query_vec, filter_dict=None, num_results=5):
    """Rank ``index.docs`` by cosine similarity to ``query_vec``, honouring keyword filters.

    Args:
        index: object exposing ``docs``, ``embeddings`` (one row per doc),
            ``keyword_fields`` and ``keyword_df`` (indexable by field name).
        query_vec: query embedding; a dense array or anything with
            ``toarray()``. It is flattened to 1-D before scoring.
        filter_dict: optional ``{field: value}`` equality filters. Fields not
            listed in ``index.keyword_fields`` are ignored.
        num_results: maximum number of documents to return.

    Returns:
        Up to ``num_results`` documents, best first. Documents with positive
        similarity are preferred; if none has positive similarity, the
        best-scoring documents that still satisfy the filter are returned.
        Documents failing the filter are never returned. An empty list is
        returned when nothing matches or ``num_results <= 0``.
    """
    def cosine_similarity(a, b):
        a_norm = np.linalg.norm(a)
        b_norms = np.linalg.norm(b, axis=1)
        # The small epsilon avoids division by zero for all-zero vectors.
        return np.dot(b, a) / (b_norms * a_norm + 1e-10)

    if hasattr(query_vec, 'toarray'):
        qvec = query_vec.toarray().flatten()
    else:
        qvec = query_vec.flatten()
    sims = cosine_similarity(qvec, index.embeddings)

    mask = np.ones(len(index.docs), dtype=bool)
    for field, val in (filter_dict or {}).items():
        if field in index.keyword_fields:
            arr = np.array(index.keyword_df[field])
            mask = mask & (arr == val)

    # Exclude filtered-out docs with -inf rather than multiplying by the mask:
    # a zeroed score would outrank in-filter docs whose cosine is negative and
    # could leak documents from the wrong course into the fallback results.
    scores = np.where(mask, sims, -np.inf)

    # Never ask for more candidates than there are matching documents; this also
    # keeps argpartition in range and makes num_results <= 0 return nothing.
    k = min(num_results, int(mask.sum()))
    if k <= 0:
        return []

    top_indices = np.argpartition(scores, -k)[-k:]
    top_indices = top_indices[np.argsort(-scores[top_indices])]
    results = [index.docs[i] for i in top_indices if scores[i] > 0]
    if not results:
        # No positive similarity: fall back to the best filter-matching docs.
        results = [index.docs[i] for i in top_indices]
    return results

def vector_search_question_text(q):
    """Embed one ground-truth row's question and search the question+text index.

    Args:
        q: ground-truth dict providing 'question' and 'course'.

    Returns:
        The 5 best results from ``vector_search`` restricted to the row's course.
    """
    question, course = q['question'], q['course']
    embedded = pipeline.transform([question])
    return vector_search(index, embedded, filter_dict={'course': course}, num_results=5)

def hit_rate(relevance_total):
    """Return the fraction of relevance lists that contain ``True``.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    hits = 0
    for flags in relevance_total:
        if True in flags:
            hits += 1
    return hits / len(relevance_total)

# Score the question+text vector search: one relevance list per ground-truth
# row, reduced to a single hit-rate value.
relevance_total = []
for query in tqdm(ground_truth):
    expected_id = query['document']
    hits = vector_search_question_text(query)
    relevance_total.append([hit['id'] == expected_id for hit in hits])

hit = hit_rate(relevance_total)
print(f"Hitrate (Vector Search question + text): {hit}")


  0%|          | 0/4627 [00:00<?, ?it/s]

Hitrate (Vector Search question + text): 0.8210503566025502


### Answer:4

In [None]:
#!/usr/bin/env python

import os
import sys
import time
import tempfile
import requests
import json
import pandas as pd
from tqdm.auto import tqdm

from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance
from fastembed import TextEmbedding

# ------------------------------------------------------------------#
# 0. Global configuration
# ------------------------------------------------------------------#
# Qdrant endpoint; override with the QDRANT_HOST environment variable.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "http://localhost:6333")
COLLECTION = "zoomcamp-rag"
EMBED_MODEL = "jinaai/jina-embeddings-v2-small-en"
# Vector size used when creating the collection (according to the FastEmbed manifest).
VECTOR_DIM = 512
# Number of hits requested per query during evaluation.
TOP_K = 5
TMP_CACHE_DIR = os.path.join(tempfile.gettempdir(), "fastembed_cache")

URL_PREFIX = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
DOCS_URL = f"{URL_PREFIX}search_evaluation/documents-with-ids.json"
GROUND_TRUTH_URL = f"{URL_PREFIX}search_evaluation/ground-truth-data.csv"

# ------------------------------------------------------------------#
# 1. Helper functions for metrics
# ------------------------------------------------------------------#
def hit_rate(relevance_total):
    """Fraction of queries with at least one truthy entry in their relevance list.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    matched = [line for line in relevance_total if any(line)]
    return len(matched) / len(relevance_total)

def mrr(relevance_total):
    """Mean Reciprocal Rank: average of ``1 / rank`` of each query's first relevant hit.

    Queries without a relevant hit contribute 0; ranks are 1-based.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total = 0.0
    for line in relevance_total:
        rank = 1
        for is_rel in line:
            if is_rel:
                total += 1.0 / rank
                break  # only the first relevant position counts
            rank += 1
    return total / len(relevance_total)

# ------------------------------------------------------------------#
# 2. Dataset download
# ------------------------------------------------------------------#
def download_dataset():
    """Download the document corpus and the ground-truth questions.

    Returns:
        Tuple ``(docs, gtruth)``: the parsed JSON from ``DOCS_URL`` and the rows
        of the CSV at ``GROUND_TRUTH_URL`` as a list of dicts.

    Raises:
        requests.HTTPError: if the documents request returns an error status.
    """
    print("📥  Downloading dataset …")
    response = requests.get(DOCS_URL, timeout=30)
    # Fail with a clear HTTP error instead of letting .json() choke on an error page.
    response.raise_for_status()
    docs   = response.json()
    gtruth = pd.read_csv(GROUND_TRUTH_URL).to_dict(orient="records")
    print(f"   → Documents      : {len(docs)}")
    print(f"   → Ground-truth Q : {len(gtruth)}")
    return docs, gtruth

# ------------------------------------------------------------------#
# 3. Qdrant initialization
# ------------------------------------------------------------------#
def init_qdrant():
    """Connect to Qdrant and (re)create an empty ``COLLECTION``.

    Any existing collection of that name is deleted first, so every run
    starts from a clean state.

    Returns:
        The connected ``QdrantClient``.
    """
    client = QdrantClient(QDRANT_HOST)

    stale = client.collection_exists(COLLECTION)
    if stale:
        print(f"🗑️  Deleting stale collection “{COLLECTION}” …")
        client.delete_collection(collection_name=COLLECTION)
        # Short pause before re-creating, to avoid racing the deletion.
        time.sleep(0.3)

    print(f"🆕  Creating collection “{COLLECTION}” …")
    vector_params = models.VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
    client.create_collection(collection_name=COLLECTION, vectors_config=vector_params)
    return client

# ------------------------------------------------------------------#
# 4. Embedding & ingestion
# ------------------------------------------------------------------#
def _to_float_list(vec):
    """Convert an embedding (numpy array or plain sequence) to a plain list."""
    return vec.tolist() if hasattr(vec, "tolist") else list(vec)


def ingest_documents(client, documents):
    """Embed every document and upsert the vectors into ``COLLECTION``.

    Each point is keyed by its position in ``documents``; the document's own
    ``id`` is stored in the payload under "id" so search results can be mapped
    back to the ground truth.

    Args:
        client: Qdrant client with the target collection already created.
        documents: list of dicts providing "id", "question" and "text".
    """
    print("🧪  Loading embedding model (FastEmbed)…")
    embedder = TextEmbedding(model_name=EMBED_MODEL, cache_dir=TMP_CACHE_DIR)

    # ❶ Build the full text list first
    texts = [f"{d['question']} {d['text']}" for d in documents]

    # ❷ Generate all vectors at once (embed() yields lazily → list)
    vectors = list(embedder.embed(texts))

    # ❸ Package points for Qdrant.
    # Qdrant point ids must be unsigned integers or UUIDs, so the positional
    # index is used as the point id and the dataset id travels in the payload.
    # NOTE(review): assumes dataset ids are arbitrary strings — confirm format.
    points = [
        models.PointStruct(
            id      = point_id,
            vector  = _to_float_list(vec),
            payload = {
                "id"      : doc["id"],
                "question": doc["question"],
                "text"    : doc["text"]
            }
        )
        for point_id, (doc, vec) in enumerate(zip(documents, vectors))
    ]

    print("⏫  Upserting into Qdrant …")
    client.upsert(collection_name=COLLECTION, points=points)
    print("✅  Ingestion complete")
# ------------------------------------------------------------------#
# 5. Search wrapper
# ------------------------------------------------------------------#
def make_search_fn(client):
    """Build the search callable used by the evaluation loop.

    Args:
        client: Qdrant client pointing at the populated collection.

    Returns:
        A function taking one ground-truth dict (with a "question" key) and
        returning ``[{"id": <dataset document id>}, ...]`` for the ``TOP_K``
        nearest points.
    """
    embedder = TextEmbedding(model_name=EMBED_MODEL, cache_dir=TMP_CACHE_DIR)

    def _search(q):
        # embed() returns a generator, which cannot be subscripted; pull the
        # single embedding out with next().
        q_vec = next(iter(embedder.embed([q["question"]])))
        hits  = client.search(
            collection_name = COLLECTION,
            query_vector    = _to_float_list(q_vec),
            limit           = TOP_K,
            with_payload    = True
        )
        # Report the dataset id stored at ingestion time, not the positional
        # point id, so it can be compared with the ground-truth "document".
        return [{"id": hit.payload["id"]} for hit in hits]

    return _search

# ------------------------------------------------------------------#
# 6. Evaluation orchestrator
# ------------------------------------------------------------------#
def evaluate(ground_truth, search_fn):
    """Score ``search_fn`` against the ground truth.

    Args:
        ground_truth: iterable of dicts, each with a "document" key naming the
            expected document id.
        search_fn: callable taking one ground-truth dict and returning an
            iterable of candidate dicts that each carry an "id" key.

    Returns:
        Dict with "hit_rate" and "mrr".
    """
    relevance_total = []
    progress = tqdm(ground_truth, desc="🥇  Evaluating", unit="query")
    for query in progress:
        expected = query["document"]
        hits = search_fn(query)
        relevance_total.append([hit["id"] == expected for hit in hits])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

# ------------------------------------------------------------------#
# 7. Main
# ------------------------------------------------------------------#
def main():
    """Run the whole pipeline: download, index into Qdrant, evaluate, report."""
    print("🚀  Starting pipeline")
    docs, gtruth = download_dataset()

    client = init_qdrant()
    ingest_documents(client, docs)

    metrics = evaluate(gtruth, make_search_fn(client))

    print("\n📊  Final metrics")
    print(f"Hit Rate: {metrics['hit_rate']:.4f}")
    print(f"MRR      : {metrics['mrr']:.4f}")

# Run the pipeline only when executed directly (also true inside a notebook kernel).
if __name__ == "__main__":
    main()