<a href="https://colab.research.google.com/github/thant-san/llm-zoomcamp/blob/main/03_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U minsearch qdrant_client tdqm

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker<4.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Downloading qdrant_client-1.15.0-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=48bb308a6c5c37761229d6bb971271dcf7841c6832fb9c1c04eb6bc37db2a640
  Stored in directory: /root/.cache/pip/wheel

In [2]:
import minsearch
print(minsearch.__version__)


0.0.4


In [3]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [5]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [5]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [18]:
from minsearch import Index

index =Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)
index.fit(documents)

def search_function(q):
    return index.search(
        query=q['question'],
        boost_dict={'question': 1.5, 'section': 0.1}
    )

In [19]:
evaluate(ground_truth,search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8597363302355738, 'mrr': 0.689783053917484}

In [7]:
from minsearch import VectorSearch

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [9]:
# @title For questions
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

def search_function(q):
    query_embedding = pipeline.transform([q['question']])
    return vindex.search(query_embedding[0])

evaluate(ground_truth, search_function)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.4696347525394424, 'mrr': 0.30038293179097125}

In [34]:
# @title For questions and answers
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

def search_function(q):
    query_embedding = pipeline.transform([q['question']])
    return vindex.search(query_embedding[0])

evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8415820185865571, 'mrr': 0.6252495703273756}

In [12]:
from sentence_transformers import SentenceTransformer
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5

In [14]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

# 1. Create sentence embeddings
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5

model = SentenceTransformer(model_handle)
embeddings = model.encode(texts, show_progress_bar=True)

# 2. Create a Qdrant client
qdrant_client = QdrantClient(":memory:")

# 3. Create a Qdrant collection and add documents
qdrant_client.recreate_collection(
    collection_name="my_collection",
    vectors_config=models.VectorParams(size=embeddings.shape[1], distance=models.Distance.COSINE),
)

qdrant_client.upload_points(
    collection_name="my_collection",
    points=[
        models.PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload=doc,
        )
        for idx, (vector, doc) in enumerate(zip(embeddings, documents))
    ],
)


# 4. Define a new search_function
def search_function(q):
    query_text = q['question'] + ' ' + q.get('text', '')  # Combine question and text for the query
    query_embedding = model.encode(query_text)

    search_result = qdrant_client.search(
        collection_name="my_collection",
        query_vector=query_embedding.tolist(),
        limit=limit,
    )

    return [hit.payload for hit in search_result]

# 5. Evaluate the MRR
evaluation_results = evaluate(ground_truth, search_function)
print(f"MRR: {evaluation_results['mrr']}")

# 6. Answer the multiple-choice question
mrr = evaluation_results['mrr']
if 0.6 <= mrr < 0.7:
    print("The MRR is closest to 0.65")
elif 0.7 <= mrr < 0.8:
    print("The MRR is closest to 0.75")
elif 0.8 <= mrr < 0.9:
    print("The MRR is closest to 0.85")
elif 0.9 <= mrr <= 1.0:
    print("The MRR is closest to 0.95")

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

  qdrant_client.recreate_collection(


  0%|          | 0/4627 [00:00<?, ?it/s]

  search_result = qdrant_client.search(


MRR: 0.09091203803760516


In [15]:
def cosine(u, v):
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)

In [16]:
def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm

In [17]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [18]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [19]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [20]:
import numpy as np

v_llm = pipeline.transform(df_results.answer_llm)
v_orig = pipeline.transform(df_results.answer_orig)

df_results['cosine'] = [cosine(v_llm[i], v_orig[i]) for i in range(len(df_results))]

In [23]:
display(df_results['cosine'])

Unnamed: 0,cosine
0,0.463526
1,0.781565
2,0.889158
3,0.614962
4,0.624086
...,...
1825,0.907584
1826,0.965069
1827,0.965395
1828,0.716734


In [25]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [26]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [27]:
from rouge import Rouge

rouge_scorer = Rouge()

rouge_1_scores = []
for i, row in df_results.iterrows():
    scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
    rouge_1_scores.append(scores['rouge-1']['f'])

df_results['rouge-1'] = rouge_1_scores

print(f"Average ROUGE-1 F1-score: {df_results['rouge-1'].mean()}")

Average ROUGE-1 F1-score: 0.3516946452113943
