## Preparation

In [1]:
import requests
import pandas as pd

# Base URL of the evaluation module in the course repository (raw GitHub content).
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents: a JSON list of dicts with 'text', 'section', 'question',
# 'course' and 'id' keys.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one record per generated question; later cells read the
# 'question', 'course' and 'document' (expected document id) fields.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: One list of relevance flags per query; a query
            counts as a hit when ``True`` is a member of its list.

    Returns:
        Number of hit queries divided by the total number of queries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over a batch of queries.

    For each query the score is ``1 / rank`` of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are
    averaged over all queries.

    Args:
        relevance_total: One list of relevance flags per query, ordered by
            result rank.

    Returns:
        The mean reciprocal rank as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) keeps the same notion of "relevant"
            # as a ``True in line`` membership test.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant result contributes; without this
                # a list with several matches would be counted repeatedly.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each record's ``'document'`` entry
            is the id of the document the search is expected to return.
        search_function: Callable that takes one ground-truth record and
            returns an iterable of results, each with an ``'id'`` entry.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the collected
        relevance flags.
    """
    def relevance_flags(record):
        # Read the expected id before searching, then flag each result.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

  from .autonotebook import tqdm as notebook_tqdm


## Q1 Min Search

In [3]:
import minsearch

In [4]:
# Text index over the FAQ documents: 'question', 'section' and 'text' are
# searched as text, while 'course' and 'id' are registered as keyword fields
# ('course' is used as a filter by the search cell below).
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

# Kept as the last expression so the fitted index is echoed as the cell output.
index.fit(documents)

<minsearch.minsearch.Index at 0x740d9ed63140>

In [5]:
def minsearch_search(query, course):
    """Search the minsearch text index for ``query`` within one course.

    Args:
        query: Query text passed to ``index.search``.
        course: Course name used as the ``'course'`` filter value.

    Returns:
        Whatever ``index.search`` returns for at most 5 results, with the
        'question' field boosted by 1.5 and the 'section' field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [6]:
# Relevance flags per ground-truth question: True where a returned document's
# id equals the expected document id.
minsearch_relevance = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = minsearch_search(query=record['question'], course=record['course'])
    minsearch_relevance.append([hit['id'] == expected_id for hit in hits])

100%|██████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 318.02it/s]


In [7]:
# Spot-check: relevance flags of the results for the ground-truth record at index 10.
minsearch_relevance[10]

[False, False, False, True, False]

In [8]:
# Share of ground-truth questions whose expected document appears in the minsearch results.
hit_rate(minsearch_relevance)

0.848714069591528

## Embeddings

In [9]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [10]:
# One text per FAQ document; only the 'question' field is embedded here.
texts = [doc['question'] for doc in documents]

# TF-IDF features (min_df=3) reduced to 128 dimensions with a seeded
# truncated SVD; X holds one embedding row per document.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [11]:
# Question text of every ground-truth record, in record order.
ground_truth_questions = [q['question'] for q in tqdm(ground_truth)]

# Embed all ground-truth questions with the already fitted pipeline.
Y = pipeline.transform(ground_truth_questions)

# Store each embedding row on its record so the search function can read it.
for q, question_vector in zip(ground_truth, Y):
    q['vector_question'] = question_vector

100%|███████████████████████████████████████████████████████████████| 4627/4627 [00:00<00:00, 471926.77it/s]


In [13]:
# Vector index over the question embeddings X, one row per entry of `documents`;
# 'course' is the keyword field used for filtering at search time.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x740d9e1756a0>

In [14]:
def minsearch_vector(q):
    """Vector-search the question-only index for one ground-truth record.

    Args:
        q: Ground-truth record providing ``'vector_question'`` (the query
            embedding) and ``'course'`` (the filter value).

    Returns:
        The result of ``vindex.search`` for at most 5 results, requested
        with ``output_ids=True``.
    """
    search_kwargs = {
        'query_vector': q['vector_question'],
        'filter_dict': {'course': q['course']},
        'num_results': 5,
        'output_ids': True,
    }
    return vindex.search(**search_kwargs)

In [15]:
# Hit rate and MRR of vector search over question-only embeddings.
evaluate(ground_truth, minsearch_vector)

100%|█████████████████████████████████████████████████████████████████| 4627/4627 [00:03<00:00, 1229.35it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

## Q3 Vector search for question and answer

In [16]:
# One text per FAQ document: the question followed by its answer text.
texts_q = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Build a fresh pipeline rather than re-fitting the previously fitted object
# in place. The configuration is identical, so the embeddings are the same
# as a re-fit would give. `pipeline` now refers to the question+answer model,
# which the ground-truth embedding cell below relies on.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X_q = pipeline.fit_transform(texts_q)

In [18]:
# Lookup table from document id to document; when ids repeat, the last
# document with that id wins.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
doc_idx = {d['id']: d for d in documents}

In [22]:
# Question text of every ground-truth record, in record order.
ground_truth_questions_q = [q['question'] for q in tqdm(ground_truth)]

# Embed the questions with the pipeline as currently fitted (this overwrites Y).
Y = pipeline.transform(ground_truth_questions_q)

# Store each embedding row on its record under a separate key, leaving
# 'vector_question' untouched.
for q, question_vector in zip(ground_truth, Y):
    q['vector_question_q'] = question_vector

100%|██████████████████████████████████████████████████████████████| 4627/4627 [00:00<00:00, 1520332.52it/s]


In [23]:
# Vector index over the question+answer embeddings X_q, one row per entry of
# `documents`; 'course' is the keyword field used for filtering at search time.
vindex_q = VectorSearch(keyword_fields={'course'})
vindex_q.fit(X_q, documents)

<minsearch.vector.VectorSearch at 0x740d9ddacb90>

In [24]:
def minsearch_vector_q(q):
    """Vector-search the question+answer index for one ground-truth record.

    Args:
        q: Ground-truth record providing ``'vector_question_q'`` (the query
            embedding) and ``'course'`` (the filter value).

    Returns:
        The result of ``vindex_q.search`` for at most 5 results, requested
        with ``output_ids=True``.
    """
    query_vector = q['vector_question_q']
    course_filter = {'course': q['course']}
    return vindex_q.search(
        query_vector=query_vector,
        filter_dict=course_filter,
        num_results=5,
        output_ids=True,
    )

In [25]:
# Hit rate and MRR of vector search over question+answer embeddings.
evaluate(ground_truth, minsearch_vector_q)

100%|█████████████████████████████████████████████████████████████████| 4627/4627 [00:03<00:00, 1266.39it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4 Qdrant

In [27]:
from qdrant_client import QdrantClient, models
# Qdrant endpoint on localhost; the cells below need a server reachable at this address.
qdrant_uri = 'http://localhost:6333'
client = QdrantClient(qdrant_uri)

In [29]:
# Name of the Qdrant collection used by the cells below.
collection_name="homework-03"
# Embedding model handle passed to models.Document for both indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [33]:
# Vector size the collection is created with.
# NOTE(review): presumably the output dimension of `model_handle` — confirm against the model card.
EMBEDDING_DIMENSIONALITY=512

In [34]:
# Create the collection only when it is missing, so this cell can be re-run
# (Restart & Run All) without failing on an already existing collection.
# An existing collection is reused as-is; delete it manually if its vector
# configuration needs to change.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,
            distance=models.Distance.COSINE
        )
    )

True

In [35]:
# Inspect the first document to see which fields will be stored as payload.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [36]:
# One Qdrant point per document: the list position is the point id, the
# question plus answer text is embedded with `model_handle`, and the whole
# document dict is stored as payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]
    

In [37]:
# Send all points to the collection in a single upsert call; the returned
# UpdateResult is echoed as the cell output.
client.upsert(collection_name=collection_name, points=points)

Fetching 5 files: 100%|█████████████████████████████████████| 5/5 [00:08<00:00,  1.76s/it]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [42]:
def qdrant_vector_search(query, limit=5):
    """Query the Qdrant collection with the question of a ground-truth record.

    No course filter is applied to the query.

    Args:
        query: Ground-truth record; its ``'question'`` text is embedded with
            ``model_handle`` and used as the query.
        limit: Maximum number of points to request.

    Returns:
        List with the payload of each returned point, in the order Qdrant
        returned them.
    """
    question_document = models.Document(
        text=query['question'],
        model=model_handle,
    )
    response = client.query_points(
        collection_name=collection_name,
        query=question_document,
        limit=limit,
        with_payload=True,
    )
    return [point.payload for point in response.points]


In [43]:
# Hit rate and MRR of the Qdrant vector search.
evaluate(ground_truth, qdrant_vector_search)

100%|█████████████████████████████████████████████████| 4627/4627 [01:11<00:00, 64.83it/s]


{'hit_rate': 0.9120380376053598, 'mrr': 0.8248685253223843}

## Q5. Cosine similarity

In [44]:
import numpy as np

In [46]:
def cosine(u, v):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        u: Vector supporting ``.dot`` (e.g. a NumPy array).
        v: Vector of the same length as ``u``.

    Returns:
        ``u·v / (|u| * |v|)``, or ``0.0`` when either vector has zero norm,
        where the ratio is undefined and would otherwise evaluate to NaN.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm

    # A zero vector has no direction; report no similarity instead of 0/0 = NaN.
    if denominator == 0:
        return 0.0

    return u.dot(v) / denominator

In [47]:
# RAG evaluation results from the course repository; later cells read the
# 'answer_llm', 'answer_orig' and 'question' columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [49]:
# Rebinds `pipeline` to a new, unfitted TF-IDF + truncated SVD pipeline for
# the answer-similarity section; the pipeline used for search above is no
# longer reachable through this name.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [50]:
# Fit on one text per row: LLM answer, original answer and question joined by spaces.
# NOTE(review): assumes none of the three columns contains missing values — confirm.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [51]:
# Row-wise view of the results: one dict per DataFrame row, keyed by column name.
results_dict = df_results.to_dict(orient='records')

In [57]:
# Cosine similarity between the embedding of each LLM answer and the
# embedding of its original answer, in row order.
cosines = []

if results_dict:
    # Embed each column in one batched call instead of two single-row
    # transforms per record; rows are embedded independently, so the
    # vectors are the same.
    answers_orig = [entry['answer_orig'] for entry in results_dict]
    answers_llm = [entry['answer_llm'] for entry in results_dict]
    V_orig = pipeline.transform(answers_orig)
    V_llm = pipeline.transform(answers_llm)

    cosines = [cosine(v_llm, v_orig) for v_llm, v_orig in zip(V_llm, V_orig)]






In [59]:
# Adds the similarities as a new column; this mutates df_results in place.
df_results['cosine'] = cosines

In [60]:
# Summary statistics of the numeric columns (the cosine column at this point).
df_results.describe()

Unnamed: 0,cosine
count,1830.0
mean,0.841584
std,0.173737
min,0.079093
25%,0.806927
50%,0.905812
75%,0.950711
max,0.996457


## Q6. Rouge

In [62]:
from rouge import Rouge
# Shared scorer instance; compute_rouge_score below reads this module-level name.
rouge_scorer = Rouge()

# Score a single example (row 10): the LLM answer is passed first and the
# original answer second; [0] takes the first element of the returned list.
# NOTE(review): presumably the argument order is (hypothesis, reference) — confirm in the rouge package docs.
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]

In [66]:
# Show the ROUGE result for the sample row.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [74]:

def compute_rouge_score(df):
    """Compute the ROUGE-1 F score of every row of ``df``.

    Args:
        df: DataFrame with ``'answer_llm'`` and ``'answer_orig'`` columns.

    Returns:
        List with one ``['rouge-1']['f']`` value per row, in row order,
        taken from the first element of ``rouge_scorer.get_scores``.
    """
    records = df.to_dict(orient='records')
    return [
        rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]['rouge-1']['f']
        for row in tqdm(records)
    ]
        

In [75]:
# Rebinds `scores` (previously the single-row ROUGE dict) to the list of
# per-row ROUGE-1 F scores.
scores = compute_rouge_score(df_results)

100%|████████████████████████████████████████████████| 1830/1830 [00:05<00:00, 322.41it/s]


In [77]:
# Adds the ROUGE-1 F scores as a new column; this mutates df_results in place.
df_results['rouge_1_f1_score'] = scores

In [78]:
# Summary statistics of the numeric columns, now including rouge_1_f1_score.
df_results.describe()

Unnamed: 0,cosine,rouge_1_f1_score
count,1830.0,1830.0
mean,0.841584,0.351695
std,0.173737,0.158905
min,0.079093,0.0
25%,0.806927,0.238887
50%,0.905812,0.3563
75%,0.950711,0.460133
max,0.996457,0.95
