In [None]:
# !pip install -U minsearch qdrant_client rouge scikit-learn tqdm requests pandas numpy

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.14.3-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Downloading qdrant_client-1.14.3-py3-none-any.whl (328 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m[31m8.4 MB/s[0m eta [36m0:00:01[0m
[?25hUsing cached tqdm-

In [5]:
import requests, pandas as pd, numpy as np
from tqdm.auto import tqdm

# # Datos
# url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
# docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'

# documents       = requests.get(docs_url).json()
# df_ground_truth = pd.read_csv(ground_truth_url)
# ground_truth    = df_ground_truth.to_dict(orient='records')

# Métricas de evaluación
def hit_rate(relevance_total):
    return sum(any(line) for line in relevance_total) / len(relevance_total)

def mrr(relevance_total):
    score = 0.0
    for line in relevance_total:
        for rank, rel in enumerate(line, 1):
            if rel:
                score += 1 / rank
                break
    return score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id  = q['document']
        results = search_function(q)
        relevance_total.append([d['id'] == doc_id for d in results])
    return {'hit_rate': hit_rate(relevance_total), 'mrr': mrr(relevance_total)}


In [None]:
import json, pandas as pd, numpy as np
from tqdm.auto import tqdm
from pathlib import Path

# Carpeta donde están los archivos ('' = directorio actual)
DATA_DIR = Path("")          # o Path("data") si los tienes en ./data/

# ---------- Documentos del FAQ ----------
with open(DATA_DIR / "documents-with-ids.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

# ---------- Ground-truth (pregunta → doc_id correcto) ----------
df_ground_truth = pd.read_csv(DATA_DIR / "ground-truth-data.csv")
ground_truth    = df_ground_truth.to_dict(orient='records')

print(f"Docs: {len(documents)}, GT pairs: {len(ground_truth)}")

Docs: 948, GT pairs: 4627


In [9]:
df_results = pd.read_csv(DATA_DIR / "results-gpt4o-mini.csv")
display(df_results.head(1))

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp


# Q1

In [None]:
from minsearch import Index  

boost = {'question': 1.5, 'section': 0.1}

index = Index(
    text_fields=['question', 'section', 'text'],   
    keyword_fields=['course']                      
)
index.fit(documents)

def search_q1(q, k=5):
    return index.search(
        q['question'],
        boost_dict=boost,    
        num_results=k
    )

print(evaluate(ground_truth, search_q1))

  0%|          | 0/4627 [00:00<?, ?it/s]

Q1: {'hit_rate': 0.8013831856494489, 'mrr': 0.6815251062603574}


* {'hit_rate': 0.8013831856494489, 'mrr': 0.6815251062603574}

# Q2

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from minsearch import VectorSearch

texts_q = [d["question"] for d in documents]

pipe_q = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X_q = pipe_q.fit_transform(texts_q)

vindex_q = VectorSearch(keyword_fields={"course"})
vindex_q.fit(X_q, documents)

def search_q2(q, k=5):
    vec = pipe_q.transform([q["question"]])
    return vindex_q.search(vec, num_results=k)

print("Q2:", evaluate(ground_truth, search_q2))

  0%|          | 0/4627 [00:00<?, ?it/s]

Q2: {'hit_rate': 0.3939917873352064, 'mrr': 0.2898890569843674}


* Q2: {'hit_rate': 0.3939917873352064, 'mrr': 0.2898890569843674}

# Q3

In [15]:
texts_qa = [d["question"] + " " + d["text"] for d in documents]

pipe_qa = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X_qa = pipe_qa.fit_transform(texts_qa)

vindex_qa = VectorSearch(keyword_fields={"course"})
vindex_qa.fit(X_qa, documents)

def search_q3(q, k=5):
    vec = pipe_qa.transform([q["question"]])
    return vindex_qa.search(vec, num_results=k)

print("Q3:", evaluate(ground_truth, search_q3))

  0%|          | 0/4627 [00:00<?, ?it/s]

Q3: {'hit_rate': 0.7704776312945754, 'mrr': 0.6150097255240982}


* Q3: {'hit_rate': 0.7704776312945754, 'mrr': 0.6150097255240982}

# Q4

In [None]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
import numpy as np
from tqdm.auto import tqdm

model_name = "jinaai/jina-embeddings-v2-small-en"
embedder   = SentenceTransformer(model_name)

texts_q4 = [d["question"] + " " + d["text"] for d in documents]
emb_q4   = embedder.encode(texts_q4, batch_size=64, show_progress_bar=True)

client = QdrantClient(":memory:")
dim = emb_q4.shape[1]

collection = "faq_vecs"
client.recreate_collection(
    collection,
    vectors_config=models.VectorParams(size=dim, distance=models.Distance.COSINE)
)

payloads = [{**doc} for doc in documents]        
for p in payloads:
    p["orig_id"] = p["id"]                       

client.upload_collection(
    collection_name=collection,
    vectors=emb_q4,
    payload=payloads,           
    batch_size=256,
    parallel=4,
)

def search_q4(q, k=5):
    vec = embedder.encode([q["question"]])[0]
    hits = client.search(collection_name=collection, query_vector=vec, limit=k)
    return [{"id": h.payload["orig_id"], **h.payload} for h in hits]

print("Q4:", evaluate(ground_truth, search_q4))

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

  client.recreate_collection(


  0%|          | 0/4627 [00:00<?, ?it/s]

  hits = client.search(collection_name=collection, query_vector=vec, limit=k)


Q4: {'hit_rate': 0.12102874432677761, 'mrr': 0.07618327209855186}


* Q4: {'hit_rate': 0.12102874432677761, 'mrr': 0.07618327209855186}

# Q5

In [None]:
pipe_cos = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
pipe_cos.fit(
    df_results.answer_llm + " " +
    df_results.answer_orig + " " +
    df_results.question
)

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u));  v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

sims = []
for _, row in df_results.iterrows():
    v_llm  = pipe_cos.transform([row.answer_llm])[0]
    v_orig = pipe_cos.transform([row.answer_orig])[0]
    sims.append(cosine(v_llm, v_orig))

print("Q5 – Average cosine:", np.mean(sims))   

Q5 – Average cosine: 0.8415841233490403


* Q5 – Average cosine: 0.8415841233490403


# Q6

In [None]:
from rouge import Rouge
rouge = Rouge()

def rouge1_f1(ref, hyp):
    return rouge.get_scores(hyp, ref)[0]["rouge-1"]["f"]

scores = [rouge1_f1(r.answer_orig, r.answer_llm) for _, r in df_results.iterrows()]
print("Q6 – Average ROUGE-1 F1:", np.mean(scores)) 

Q6 – Average ROUGE-1 F1: 0.3516946452113943


* Q6 – Average ROUGE-1 F1: 0.3516946452113943