In [None]:
pip install sentence_transformers torch

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import torch
import os
import tqdm
import time
import pickle
from torch.utils.data import DataLoader, Dataset
import multiprocessing as mp
from sklearn.preprocessing import normalize

In [None]:
# The passage collection itself is not loaded here; the call is kept for reference.
#df_collection = pd.read_csv("Data/collection.tsv", sep="\t", names=["doc_id", "passage"], on_bad_lines="skip")

# Test queries: tab-separated, column names supplied explicitly,
# rows that cannot be parsed are skipped.
df_queries = pd.read_csv(
    "Data/queries.test.tsv",
    sep="\t",
    names=["query_id", "query"],
    on_bad_lines="skip",
)

# (query_id, doc_id) pairs; used further down to look up the relevant ids per query.
df_qrels = pd.read_csv(
    "Data/qrels.test.tsv",
    sep="\t",
    names=["query_id", "doc_id"],
)

In [None]:
"""
Load passage embeddings from pickle
Basemodel: embedding_passages
"""
# Unpickle the passage embeddings and report the wall-clock time the load took.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted pickles.
embedding_path = "Data/embeddings"
start = time.time()
passages_pickle_path = f'{embedding_path}/embedding_passages_only_test_v0.pkl'
with open(passages_pickle_path, 'rb') as pickle_handle:
    embedding_data = pickle.load(pickle_handle)
print(f"Duration: {time.time() - start} seconds")

In [None]:
"""
Load query embeddings from pickle
Basemodel: query_embeddings
"""
# Unpickle the query embeddings and report the wall-clock time the load took.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted pickles.
embedding_path = "Data/embeddings"
start = time.time()
queries_pickle_path = f'{embedding_path}/query_embeddings_test_v0.pkl'
with open(queries_pickle_path, 'rb') as pickle_handle:
    query_data = pickle.load(pickle_handle)
print(f"Duration: {time.time() - start} seconds")

In [None]:
# Sanity check: number of passage embeddings that were loaded.
n_passage_embeddings = len(embedding_data["embeddings"])
print(n_passage_embeddings)

In [None]:
# Sanity check: number of query embeddings that were loaded.
n_query_embeddings = len(query_data["embeddings"])
print(n_query_embeddings)

In [None]:
# Distinct doc ids that appear anywhere in the qrels, and how many there are.
unique_qrel_doc_ids = df_qrels['doc_id'].unique()
relevant_doc_ids = set(unique_qrel_doc_ids)
print(len(relevant_doc_ids))

In [None]:
"""
Load BM25 Results
"""
# The BM25 results are not referenced by the cells shown in this notebook,
# so this load stays disabled and is kept for reference only.
#with open('Data/models/bm25_passages.pkl', 'rb') as f:
#    bm25_results = pickle.load(f)

# Load Top 1000 Docs for each test query.
# The evaluation cell below reads `top_1000_docs[i]` for every query, so this
# load must run; with it commented out a fresh kernel fails with a NameError.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted pickles.
with open('Data/models/queries_test_1000_passages.pkl', 'rb') as f:
    top_1000_docs = pickle.load(f)

In [None]:
def calculate_mrr(ranked_doc_ids, relevant_doc_ids, k=10):
    """Reciprocal rank of the first relevant id within the top ``k`` results.

    Args:
        ranked_doc_ids: Sliceable sequence of ids, best result first.
        relevant_doc_ids: Container supporting ``in`` with the relevant ids.
        k: Cut-off; only the first ``k`` ranked ids are inspected.

    Returns:
        ``1.0 / rank`` (rank is 1-based) of the first relevant id, or ``0.0``
        when none of the first ``k`` ids is relevant.
    """
    hit_ranks = (
        position
        for position, candidate in enumerate(ranked_doc_ids[:k], start=1)
        if candidate in relevant_doc_ids
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def calculate_ndcg(ranked_doc_ids, relevant_doc_ids, k=20):
    """Binary-relevance NDCG@k of a ranked list.

    The previous implementation called ``ndcg_score`` with an all-ones score
    vector. That makes every document tied, so the tie-averaged DCG no longer
    depended on *where* in the list a relevant id appeared. The metric is now
    computed directly from the rank positions.

    Args:
        ranked_doc_ids: Sliceable sequence of ids, best result first.
        relevant_doc_ids: Iterable of the ids judged relevant for the query.
        k: Cut-off; only the first ``k`` ranked ids contribute.

    Returns:
        A float in ``[0, 1]``: DCG of the first ``k`` results divided by the
        DCG of an ideal ranking that places ``min(#relevant, k)`` relevant ids
        at the top. ``0.0`` when there are no relevant ids or ``k <= 0``.
    """
    relevant = set(relevant_doc_ids)
    if k <= 0 or not relevant:
        return 0.0

    # Discount for rank r (1-based) is 1 / log2(r + 1).
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    gains = np.zeros(k)
    credited = set()
    for position, doc_id in enumerate(ranked_doc_ids[:k]):
        # Credit each relevant id once so a repeated id cannot push the
        # score above the ideal ranking.
        if doc_id in relevant and doc_id not in credited:
            gains[position] = 1.0
            credited.add(doc_id)

    ideal_hits = min(len(relevant), k)
    ideal_dcg = discounts[:ideal_hits].sum()
    return float((gains * discounts).sum() / ideal_dcg)

def get_top_k_embeddings(top_k_indices, embedding_data):
    """Pick the embeddings and doc ids stored at the given positions.

    Args:
        top_k_indices: Iterable of positions into the stored embeddings.
        embedding_data: Mapping with an ``"embeddings"`` entry that is indexed
            with ``top_k_indices`` as a whole, and a ``"doc_ids"`` entry that
            is indexed one position at a time.
            NOTE(review): presumably ``"embeddings"`` is a NumPy array so that
            indexing with a list of positions selects rows - confirm.

    Returns:
        Tuple ``(embeddings, doc_ids)``; ``doc_ids`` is a list in the same
        order as ``top_k_indices``.
    """
    selected_embeddings = embedding_data["embeddings"][top_k_indices]

    all_doc_ids = embedding_data["doc_ids"]
    selected_doc_ids = []
    for position in top_k_indices:
        selected_doc_ids.append(all_doc_ids[position])

    return selected_embeddings, selected_doc_ids

In [None]:
"""
Calculate Metrics with the top1000 BM25 passages.
"""

# Limit Queries: only the first M queries are evaluated (currently all of them).
M = len(query_data['query_ids'])
passage_mrr_scores = []
document_mrr_scores = []
ndcg_scores = []
# Fix: this list is appended to and averaged below but was never created,
# which raised a NameError on the first query.
document_ndcg_scores = []
index_to_query_id = query_data['query_ids']

# Group the qrels by query once instead of filtering the whole DataFrame
# inside the loop for every single query.
qrels_by_query = {}
for qrel_query_id, qrel_doc_id in zip(df_qrels['query_id'], df_qrels['doc_id']):
    qrels_by_query.setdefault(qrel_query_id, set()).add(qrel_doc_id)

print("Start Similarity Calculation")
start = time.time()

for i, (query_id, query_embedding) in enumerate(zip(query_data['query_ids'], query_data['embeddings'])):
    if i >= M:
        break

    actual_query_id = index_to_query_id[i]

    # NOTE(review): assumes top_1000_docs is aligned by position with
    # query_data['query_ids'] and holds positions into embedding_data - confirm.
    top_k_passages = top_1000_docs[i]

    # Extract top k Embeddings
    top_k_embeddings, top_k_doc_ids = get_top_k_embeddings(top_k_passages, embedding_data)

    # Dot product of the query with every candidate passage embedding.
    # NOTE(review): this equals cosine similarity only if the embeddings are
    # L2-normalised - confirm.
    similarities = np.dot(query_embedding.reshape(1, -1), top_k_embeddings.T)[0]
    ranked_indices = np.argsort(similarities)[::-1][:20]  # top 20 for NDCG@20

    ranked_passage_ids = [top_k_doc_ids[idx] for idx in ranked_indices]
    # Kept in its own name so the notebook-level `relevant_doc_ids`
    # (all ids in the qrels) is not overwritten by the last query's set.
    query_relevant_ids = qrels_by_query.get(actual_query_id, set())

    passage_mrr = calculate_mrr(ranked_passage_ids, query_relevant_ids)
    passage_mrr_scores.append(passage_mrr)

    ndcg = calculate_ndcg(ranked_passage_ids, query_relevant_ids)
    ndcg_scores.append(ndcg)

    # Best similarity per id. The default is -inf rather than 0 so that
    # negative similarities are kept as they are instead of being raised to 0.
    doc_scores = {}
    for idx in ranked_indices:
        doc_id = top_k_doc_ids[idx]
        doc_scores[doc_id] = max(doc_scores.get(doc_id, -np.inf), similarities[idx])

    ranked_doc_ids = sorted(doc_scores, key=doc_scores.get, reverse=True)
    document_mrr = calculate_mrr(ranked_doc_ids, query_relevant_ids)
    document_mrr_scores.append(document_mrr)
    document_ndcg = calculate_ndcg(ranked_doc_ids, query_relevant_ids)
    document_ndcg_scores.append(document_ndcg)

print(f"Duration for all Queries: {time.time() - start} seconds")
mean_passage_mrr = np.mean(passage_mrr_scores)
mean_document_mrr = np.mean(document_mrr_scores)
mean_ndcg = np.mean(ndcg_scores)
mean_document_ndcg = np.mean(document_ndcg_scores)

print(f"Passage MRR@10: {mean_passage_mrr:.4f}")
print(f"Document MRR@10: {mean_document_mrr:.4f}")
print(f"Passage NDCG@20: {mean_ndcg:.4f}")
print(f"Document NDCG@20: {mean_document_ndcg:.4f}")

In [None]:
"""
Results

Baseline
Mean Passage MRR@10: 0.1901
Mean Document MRR@10: 0.1939
Mean Passage NDCG@20: 0.1629

"""