In [1]:
!pip install -U minsearch qdrant_client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import requests
import pandas as pd

# Base URL of the evaluation material in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to be indexed (a JSON list of dicts).
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth records; the evaluation code below reads the 'question',
# 'course' and 'document' fields of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [10]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a hit.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            list of booleans marking which returned results are relevant.

    Returns:
        Number of entries containing at least one ``True`` divided by the
        total number of entries.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for relevance in relevance_total if True in relevance)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result contributes, with score
    ``1 / rank`` (rank counted from 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            list of booleans marking which returned results are relevant,
            in ranking order.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for relevance in relevance_total:
        for rank, is_relevant in enumerate(relevance, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit: without this, a list with several
                # relevant entries would score more than 1 for one query.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Args:
        ground_truth: iterable of records; each record's ``'document'`` value
            is the id of the document expected to be retrieved.
        search_function: callable that takes one ground-truth record and
            returns a list of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the collected results.
    """
    def relevance_for(record):
        # Mark each returned result as relevant when its id is the expected one.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [11]:
import minsearch

# Text-search index over the documents. The text fields are the ones searched
# and boosted in min_search; 'course' is the keyword field used there as a filter.
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]  # fields registered for exact-match filtering
)

# Fit on the downloaded documents; as the last expression of the cell, the
# fitted index is also what the cell displays.
index.fit(documents)

<minsearch.minsearch.Index at 0x77f93aae4920>

In [12]:
def min_search(query, course):
    """Search the minsearch ``index`` for ``query`` within one course.

    Args:
        query: query text.
        course: value the ``'course'`` field of a result must match.

    Returns:
        Whatever ``index.search`` returns when asked for 5 results, with the
        'question' field boosted by 1.5 and the 'section' field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [13]:
# evaluate() passes the whole ground-truth record to its callback, so the
# lambda unpacks the query text and the course filter for min_search.
evaluation_result = evaluate(ground_truth, lambda q: min_search(q['question'], q['course']))

100%|█| 4627/4627 [00:14<00:00, 308.75it/s


In [14]:
print(evaluation_result)

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}


# Question 2 and 3

In [15]:
from minsearch import VectorSearch

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [17]:
# Each document is embedded from its question followed by its answer text.
# (The question-only variant is obtained by using doc['question'] alone.)
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# TF-IDF followed by a 128-component truncated SVD; random_state is fixed so
# that the fitted embedding is reproducible across runs.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [18]:
# Vector index over the SVD embeddings X, paired row-by-row with `documents`.
# 'course' is registered as a keyword field; vector_search filters on it.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77f938bfd580>

In [19]:
def vector_search(query_vector, course):
    """Search ``vindex`` with an already-embedded query, within one course.

    Args:
        query_vector: embedding of the query.
        course: value the ``'course'`` field of a result must match.

    Returns:
        Whatever ``vindex.search`` returns when asked for 5 results.
    """
    course_filter = {'course': course}
    return vindex.search(
        query_vector=query_vector,
        filter_dict=course_filter,
        num_results=5,
    )

In [20]:
def vector_evaluate(ground_truth, search_function):
    """Embed every ground-truth question, search with it, and score the results.

    Args:
        ground_truth: iterable of records with ``'question'``, ``'course'``
            and ``'document'`` (expected document id) keys.
        search_function: callable ``(query_vector, course)`` returning a list
            of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the collected results.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        # NOTE: `pipeline` is the notebook-level global. It is rebound further
        # down (answer-similarity section), so this function must be run while
        # `pipeline` is still the one fitted on the document texts.
        query_vector = pipeline.transform([q['question']])[0]
        results = search_function(query_vector, q['course'])
        relevance_total.append([d['id'] == doc_id for d in results])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [21]:
# vector_search already has the (query_vector, course) signature that
# vector_evaluate calls, so it is passed directly instead of via a lambda.
evaluation_result = vector_evaluate(ground_truth, vector_search)

100%|█| 4627/4627 [00:08<00:00, 517.07it/s


In [22]:
print(evaluation_result)

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}


# Question 4

In [23]:
from qdrant_client import QdrantClient, models

In [24]:
# Client for a Qdrant server at localhost:6333.
# Assumes the server is already running locally — TODO confirm setup steps.
qd_client = QdrantClient("http://localhost:6333")   

In [25]:
from fastembed import TextEmbedding

In [26]:
# Vector size configured for the Qdrant collection created below.
# Presumably the output dimension of the embedding model in `model_handle` — verify.
EMBEDDING_DIMENSIONALITY = 512

In [27]:
# Embedding model name passed to models.Document for both indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
collection_name = "evaluation_llm"

# Guarded so the cell can be re-run: creating a collection that already
# exists on the server fails.
if not qd_client.collection_exists(collection_name):
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,
            distance=models.Distance.COSINE
        )
    )

In [29]:
# Index the 'course' payload field; qdrant_vector_search filters on it.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [30]:
# One point per document: the id is the document's position in `documents`,
# the vector is a models.Document built from question + text, and the whole
# document dict is stored as payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc['question'] + " " + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]

In [31]:
# Send all points to the collection in a single upsert call.
# NOTE(review): the models.Document vectors are presumably embedded at this
# point (the model files are fetched on first use) — confirm.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|█| 5/5 [00:11<00:00


UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
def qdrant_vector_search(query, course, limit=5):
    """Query the Qdrant collection for ``query`` within one course.

    Args:
        query: query text; wrapped in a models.Document with ``model_handle``.
        course: value the ``'course'`` payload field must match.
        limit: maximum number of points requested (default 5).

    Returns:
        List with the payload of every returned point, in the order returned.
    """
    query_document = models.Document(text=query, model=model_handle)
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )

    response = qd_client.query_points(
        collection_name=collection_name,
        query=query_document,
        query_filter=course_filter,
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in response.points]

In [35]:
def qdrant_evaluate(ground_truth, search_function):
    """Score a search function that takes a whole ground-truth record.

    The procedure is exactly the one implemented by ``evaluate`` (defined
    earlier in this notebook), so this delegates to it instead of keeping a
    second copy of the same loop.

    Args:
        ground_truth: iterable of records; each record's ``'document'`` value
            is the id of the document expected to be retrieved.
        search_function: callable that takes one ground-truth record and
            returns a list of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the collected results.
    """
    return evaluate(ground_truth, search_function)

In [37]:
# The lambda unpacks the query text and course filter from each ground-truth
# record; qdrant_vector_search keeps its default limit of 5.
evaluation_result = qdrant_evaluate(ground_truth, lambda q: qdrant_vector_search(q['question'], q['course']))

100%|█| 4627/4627 [01:20<00:00, 57.13it/s]


In [38]:
print(evaluation_result)

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}


# Question 5

In [39]:
# Results file from the course repository; the cells below read its
# answer_llm, answer_orig and question columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [50]:
df_results['answer_llm']

0       You can sign up for the course by visiting the...
1       You can sign up using the link provided in the...
2       Yes, there is an FAQ for the Machine Learning ...
3       The context does not provide any specific info...
4       To structure your questions and answers for th...
                              ...                        
1825    Some suggested titles for listing the Machine ...
1826    It is best advised that you do not list the Ma...
1827    You can incorporate your Machine Learning Zoom...
1828    The advice on including a project link in a CV...
1829    The suggestion to showcase progress through Li...
Name: answer_llm, Length: 1830, dtype: object

In [42]:
# TF-IDF + truncated SVD pipeline fitted on the concatenation of the LLM
# answer, the original answer and the question of every record.
# NOTE(review): this rebinds the global `pipeline` that vector_evaluate()
# reads, so re-running the retrieval evaluation after this cell would embed
# queries with this pipeline instead of the one fitted on the documents.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [65]:
import numpy as np

In [63]:
def cosine_similarity(data):
    """Cosine similarity between the LLM answer and the original answer.

    Both answers are embedded with the notebook-level ``pipeline``.

    Args:
        data: mapping with ``'answer_llm'`` and ``'answer_orig'`` text values.

    Returns:
        The cosine of the angle between the two embeddings, or ``0.0`` when
        either embedding has zero norm (the ratio would otherwise be NaN and
        propagate into any average computed over the results).
    """
    answer_llm = data['answer_llm']
    answer_orig = data['answer_orig']

    # transform() takes a batch; [0] picks the single embedded row.
    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]

    denominator = np.sqrt(v_llm.dot(v_llm)) * np.sqrt(v_orig.dot(v_orig))
    if denominator == 0:
        return 0.0

    return v_llm.dot(v_orig) / denominator

In [74]:
# One cosine similarity per results record, stored as plain Python floats.
results = df_results.to_dict(orient='records')
similarity = [float(cosine_similarity(record)) for record in tqdm(results)]

100%|█| 1830/1830 [00:03<00:00, 517.55it/s


In [84]:
# Mean cosine similarity over all records.
average = sum(similarity) / len(similarity)

In [85]:
average

0.8415841233490402

# Question 6

In [86]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [91]:
from rouge import Rouge
# Scorer reused by the averaging cell below.
rouge_scorer = Rouge()

# Score a single record (row 10): LLM answer against the original answer.
# get_scores returns a list, so [0] takes the entry for this one pair.
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [94]:
# Accumulate the ROUGE-1 F-score of every record. The running total uses its
# own name: calling it `sum` would shadow the builtin and break any later
# (re-)execution of cells that call sum(...), such as the similarity average.
rouge_total = 0.0
for row in df_results.itertuples():
    scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
    rouge_total += scores['rouge-1']['f']

# Mean ROUGE-1 F-score over all records.
rouge_average = rouge_total / len(df_results)

In [97]:
rouge_average

0.3516946452113944