### Preparing Evaluation Data and Functions

In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents; each record carries an 'id' that the ground truth refers to.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one question per row plus the id of the document that answers it.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
# Peek at the first document to see which fields a record has.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [4]:
# Peek at the first ground-truth record; 'document' holds the id of the expected document.
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            list of booleans, one per returned result.

    Returns:
        Number of entries containing ``True`` divided by the number of entries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries without any relevant result
    contribute 0.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            list of booleans, one per returned result, in ranking order.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; counting later hits as well would
                # let a single query contribute more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run a search function over the ground truth and score its results.

    Args:
        ground_truth: Iterable of records; each record's ``'document'`` value
            is the id of the document expected for that record.
        search_function: Callable that receives one ground-truth record and
            returns an iterable of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_of(record):
        # Read the expected id first, then mark every returned result.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = list(map(relevance_of, tqdm(ground_truth)))

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

  from .autonotebook import tqdm as notebook_tqdm


### Q1 - Minsearch Text

In [5]:
import minsearch

# Build the minsearch text index: three text fields, with `course` and `id`
# registered as keyword fields (`course` is what the search filters on).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x157e5fa2900>

In [None]:
def minsearch_text_search(query, course, num_results=5):
    """Search the minsearch text index within one course.

    Args:
        query: Question text to search for.
        course: Course name; passed as a filter on the ``course`` field.
        num_results: Number of results to request. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        The result of ``index.search`` for the query.
    """
    # Per-field weights handed to the index as boost_dict.
    boost = {'question': 1.5, 'section': 0.1}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

`evaluate` passes each ground-truth record (a dictionary) to the search function, so we wrap the search call in a lambda that unpacks the record's `question` and `course` fields.

In [7]:
# evaluate() hands each ground-truth record to the callable; the lambda unpacks the fields the search needs.
evaluate(ground_truth, lambda q: minsearch_text_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:18<00:00, 246.43it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Q2 - Vector Search For Question

TF-IDF and Singular Value Decomposition are used to create embeddings of 128 dimensions from texts.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Embed every document by its question text only.
texts = [doc['question'] for doc in documents]

# TF-IDF followed by truncated SVD down to 128 components; the fixed
# random_state keeps the SVD step repeatable.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [10]:
# (number of embedded texts, number of SVD components)
X.shape

(948, 128)

In [14]:
# Embedding computed for the first entry of `texts`.
X[0]

array([ 0.20189188, -0.19028114, -0.10261914,  0.16435334, -0.14004852,
       -0.19928493,  0.0326298 ,  0.03152187,  0.11015991, -0.25056714,
       -0.2297715 , -0.08275686,  0.0107985 ,  0.01912367, -0.03904959,
        0.04858485, -0.03590806,  0.00200877, -0.20405168, -0.01187959,
        0.07217801,  0.21314061,  0.0352884 ,  0.09334844,  0.00800627,
        0.02730576, -0.05747045, -0.08794382,  0.04599191,  0.09568683,
        0.10378307, -0.12981451, -0.03935688,  0.03076194,  0.02946738,
       -0.02071025,  0.09501766,  0.05341492, -0.02582382,  0.08743149,
       -0.03647388, -0.168532  , -0.08957893,  0.03547496,  0.11095151,
        0.13033041, -0.07362053,  0.13634367,  0.09826041, -0.05042163,
        0.10989516,  0.00500543, -0.06848177,  0.0502295 ,  0.06014098,
        0.11383368,  0.05322441, -0.02371468, -0.0975518 ,  0.02646635,
        0.00609063, -0.00198324,  0.11985142,  0.08729537,  0.08522016,
        0.01295767,  0.03040857,  0.04797036, -0.00390723, -0.06

Minsearch now also supports vector search. We pass the computed embeddings and corresponding documents.

In [15]:
from minsearch import VectorSearch

# Vector index with `course` as a keyword field (used for filtering in the search).
vindex = VectorSearch(keyword_fields={'course'})
# Presumably row i of X is paired with documents[i] by position — confirm against the minsearch docs.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x157e63d6a50>

In [None]:
def minsearch_vector_search(query, course, num_results=5):
    """Search the minsearch vector index within one course.

    The query is embedded with the module-level ``pipeline``, so the pipeline
    must currently be fitted on the same texts as the vectors in ``vindex``.

    Args:
        query: Question text to embed and search for.
        course: Course name; passed as a filter on the ``course`` field.
        num_results: Number of results to request. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        The result of ``vindex.search`` for the embedded query.
    """
    # transform() expects a collection of texts, hence the one-element list.
    query_vector = pipeline.transform([query])

    return vindex.search(
        query_vector=query_vector,
        filter_dict={'course': course},
        num_results=num_results
    )

In [25]:
# Score the vector search with the index and pipeline as currently fitted.
evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:09<00:00, 485.89it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

### Q3 - Vector Search For Question And Answer

In [26]:
# Embed every document by its question and answer text, joined with a space.
texts = [' '.join((doc['question'], doc['text'])) for doc in documents]

# Re-fits the existing pipeline in place and replaces the previous X.
X = pipeline.fit_transform(texts)

In [27]:
# Re-fit the same index object on the new embeddings; minsearch_vector_search
# picks this up because it reads the global `vindex` and `pipeline`.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x157e63d6a50>

In [28]:
# Same call as for the question-only embeddings; the scores differ because
# `pipeline` and `vindex` were re-fitted on question + answer text.
evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:16<00:00, 277.29it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

### Q4 - Qdrant

In [35]:
from qdrant_client import QdrantClient, models

# Connects to a Qdrant instance at this local URL; it has to be running before this cell is executed.
client = QdrantClient("http://localhost:6333")
# Embedding model name, used both when uploading documents and when querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [36]:
collection_name = 'search_evaluation'

# create_collection fails when the collection already exists, so guard it to
# keep this cell re-runnable. NOTE: an existing collection is reused as-is;
# delete it manually if its vector configuration has to change.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Vector size is looked up from the embedding model.
            size=client.get_embedding_size(model_handle),
            distance=models.Distance.COSINE
        )
    )

True

In [39]:
# Question + answer text of every document, embedded with `model_handle`.
doc_vectors = [
    models.Document(
        text=doc['question'] + ' ' + doc['text'],
        model=model_handle,
    )
    for doc in documents
]

# Point ids are the positions of the documents in the list; the full document
# dict is stored as payload so searches can return it.
client.upload_collection(
    collection_name=collection_name,
    vectors=doc_vectors,
    payload=list(documents),
    ids=list(range(len(documents))),
)

In [40]:
# Index the `course` payload field as a keyword; every query filters on it.
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=16, status=<UpdateStatus.COMPLETED: 'completed'>)

In [47]:
def qdrant_vector_search(query, course, num_results=5):
    """Search the Qdrant collection within one course.

    Args:
        query: Question text; wrapped in a ``models.Document`` with
            ``model_handle`` as the model.
        course: Course name; only points whose ``course`` payload field
            matches this value are considered.
        num_results: Maximum number of points to return. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        List with the payload of each returned point, in response order.
    """
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course)
            )
        ]
    )

    response = client.query_points(
        collection_name=collection_name,
        query=models.Document(text=query, model=model_handle),
        query_filter=course_filter,
        limit=num_results,
        with_payload=True
    )
    return [point.payload for point in response.points]

In [48]:
# Same evaluation, now against the Qdrant collection; the returned payloads carry the document 'id'.
evaluate(ground_truth, lambda q: qdrant_vector_search(q['question'], q['course']))

100%|██████████| 4627/4627 [01:12<00:00, 64.22it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

### Q5 - Cosine Similarity

Remember that cosine similarity is a dot product between two normalized vectors.

In [50]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of two 1-D numpy vectors.

    Args:
        u: First vector.
        v: Second vector, same length as ``u``.

    Returns:
        ``u . v / (|u| * |v|)``, or ``0.0`` when either vector has zero norm.
        Cosine similarity is undefined in that case, and the plain formula
        would produce NaN together with a RuntimeWarning.
    """
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    if norm_product == 0:
        return 0.0
    return u.dot(v) / norm_product

In [None]:
# CSV with the LLM answers (`answer_llm`) next to the original answers (`answer_orig`).
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [52]:
# First rows of the results table.
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [53]:
# Fresh, unfitted pipeline for the answer texts. NOTE: this rebinds the global
# `pipeline` that minsearch_vector_search reads, so that function must not be
# called after this cell without re-fitting the pipeline on the documents.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

The vectorizer (a simple embedding approach) is fit on all available evaluation text (LLM answers, original answers, and questions) so that its vocabulary covers as many words as possible.

In [54]:
# One training text per row: LLM answer, original answer and question joined with spaces.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [None]:
def compute_cosine_similarity(record):
    """Return the cosine similarity between a record's LLM and original answers.

    Args:
        record: Mapping (for example a DataFrame row) with the keys
            ``'answer_llm'`` and ``'answer_orig'``.

    Returns:
        Result of ``cosine`` on the two answers embedded with the
        module-level ``pipeline``.
    """
    llm_text, orig_text = record['answer_llm'], record['answer_orig']

    # transform() takes a collection of texts and returns a 2-D array;
    # flatten() turns the single row into a 1-D vector for cosine().
    llm_vec, orig_vec = (
        pipeline.transform([text]).flatten()
        for text in (llm_text, orig_text)
    )

    return cosine(llm_vec, orig_vec)

In [65]:
# axis=1 applies the function row by row; each row arrives as `record`.
similarity = df_results.apply(compute_cosine_similarity, axis=1)
similarity.head()

0    0.463526
1    0.781565
2    0.889158
3    0.614962
4    0.624086
dtype: float64

In [67]:
# Average cosine similarity over all answer pairs.
similarity.mean()

np.float64(0.8415841233490402)

### Q6 - Rouge

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs. It can give a more detailed view of text similarity than just cosine similarity alone.

In [68]:
%pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


ROUGE for the answer pair at index 10 (the 11th row):

In [69]:
from rouge import Rouge

rouge_scorer = Rouge()

# Row at positional index 10 of the results table.
r = df_results.iloc[10]
# LLM answer is passed first and the original answer second; get_scores
# returns a list here, so [0] takes the scores for this single pair.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

There are three scores:
- `rouge-1` - the overlap of unigrams
- `rouge-2` - bigrams
- `rouge-l` - the longest common subsequence

Each of them is reported as precision (`p`), recall (`r`), and F1 score (`f`).


Computing average rouge for all answer pairs:

In [74]:
# avg=True yields one averaged result over all pairs instead of one per pair.
# Note that this rebinds `scores` from the single-pair result above.
scores = rouge_scorer.get_scores(df_results['answer_llm'], df_results['answer_orig'], avg=True)
scores

{'rouge-1': {'r': 0.3404359469772302,
  'p': 0.4299569796022711,
  'f': 0.3516946452113944},
 'rouge-2': {'r': 0.17516370344100232,
  'p': 0.2181134968015825,
  'f': 0.1767170469826221},
 'rouge-l': {'r': 0.3182147000427922,
  'p': 0.39908120209940684,
  'f': 0.32758565643306686}}

In [75]:
# Average ROUGE-1 F1 score across all answer pairs.
scores['rouge-1']['f']

0.3516946452113944