## Preparation

In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


## Q1 Min Search

In [4]:
import minsearch

In [5]:
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x72fcd55b4590>

In [6]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [7]:
minsearch_relevance = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    minsearch_relevance.append(relevance)

100%|██████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 317.03it/s]


In [24]:
minsearch_relevance[10]

[False, False, False, True, False]

In [8]:
hit_rate(minsearch_relevance)

0.848714069591528

## Embeddings

In [51]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [52]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [53]:
ground_truth_questions = [None] * len(ground_truth)

# get questions from ground_truth
for i, q in enumerate(tqdm(ground_truth)):
    ground_truth_questions[i] = q['question']

# embedding of ground_truth questions
Y = pipeline.transform(ground_truth_questions)

# add embedding to correspondent question in ground_truth
for i, q in enumerate(ground_truth):
    q['vector_question'] = Y[i]

100%|██████████████████████████████████████████████████████████████| 4627/4627 [00:00<00:00, 1610676.79it/s]


In [66]:
ground_truth[10]

{'question': 'Can I enroll in the course after it starts?',
 'course': 'data-engineering-zoomcamp',
 'document': '7842b56a',
 'vector_question': array([ 3.24407882e-01, -2.39724068e-01, -1.38986303e-01,  1.64746821e-01,
        -2.06825518e-01, -2.21634929e-02, -3.16873707e-02,  8.07068909e-02,
         9.57649119e-02, -3.12314795e-01, -2.56414216e-01, -7.36323957e-02,
         4.24134468e-02,  3.86402981e-02, -1.36343078e-01, -4.30954349e-02,
        -1.07711890e-02,  7.02276780e-02, -1.56171246e-01,  1.03608887e-01,
        -4.95673377e-02,  2.08087561e-01,  1.77977559e-02,  8.23224543e-02,
        -1.03444901e-01, -2.53648297e-02,  3.66052934e-02, -6.38937889e-02,
        -1.10721280e-01,  1.88322361e-02,  2.06317472e-02, -5.00333345e-02,
         7.98200025e-03, -4.02659100e-02, -5.28831080e-02, -8.36499430e-02,
        -3.33309913e-02, -6.01554364e-02,  1.16556820e-01, -6.51993567e-02,
         5.71069151e-02,  1.87052621e-02,  1.81269608e-03, -1.45068354e-01,
        -2.99023602e

In [55]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x72fcd1922930>

In [58]:
def minsearch_vector(q):
    return vindex.search(
        query_vector=q['vector_question'],
        filter_dict={'course': q['course']},
        num_results=5,
        output_ids=True
    )

In [59]:
evaluate(ground_truth, minsearch_vector)

100%|█████████████████████████████████████████████████████████████████| 4627/4627 [00:03<00:00, 1267.05it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

## Q3 Vector search for question and answer

In [74]:
texts_q = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts_q.append(t)

# pipeline = make_pipeline(
#     TfidfVectorizer(min_df=3),
#     TruncatedSVD(n_components=128, random_state=1)
# )
X_q = pipeline.fit_transform(texts_q)

In [75]:
vindex_q = VectorSearch(keyword_fields={'course'})
vindex_q.fit(X_q, documents)

<minsearch.vector.VectorSearch at 0x72fcd1900da0>

In [67]:
doc_idx = {d['id']: d for d in documents}

In [76]:
ground_truth_questions_q = [None] * len(ground_truth)

# get questions from ground_truth
for i, q in enumerate(tqdm(ground_truth)):
    doc_id = q['document']
    document = doc_idx[doc_id]
    text = document['text']
    ground_truth_questions_q[i] = q['question'] + ' ' + text

# embedding of ground_truth questions
Y = pipeline.transform(ground_truth_questions_q)

# add embedding to correspondent question in ground_truth
for i, q in enumerate(ground_truth):
    q['vector_question_q'] = Y[i]

100%|███████████████████████████████████████████████████████████████| 4627/4627 [00:00<00:00, 761329.28it/s]


In [77]:
def minsearch_vector_q(q):
    return vindex_q.search(
        query_vector=q['vector_question_q'],
        filter_dict={'course': q['course']},
        num_results=5,
        output_ids=True
    )

In [78]:
evaluate(ground_truth, minsearch_vector_q)

100%|█████████████████████████████████████████████████████████████████| 4627/4627 [00:03<00:00, 1300.49it/s]


{'hit_rate': 1.0, 'mrr': 0.9982890281679996}