<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Prerequisite: pytrec_eval must be installed and importable; it is the reference implementation that metrics_eval is compared against below.

In [2]:
import random
from math import isclose
from time import perf_counter
from time import time

import numpy as np
import pytrec_eval

from metrics_eval import ranking_metrics
from metrics_eval import utils

In [3]:
# Wrapper for pytrec_eval cut functions
def run_trec_function_cut(y_true, y_pred, metric, metric_k):
    """Average one pytrec_eval cut-off measure over every evaluated query.

    Args:
        y_true: Relevance judgements handed to ``pytrec_eval.RelevanceEvaluator``.
        y_pred: Run handed to the evaluator's ``evaluate`` method.
        metric: Measure family to request from pytrec_eval, e.g. ``"ndcg_cut"``.
        metric_k: Key of the specific cut-off to read from each query's
            result, e.g. ``"ndcg_cut_10"``.

    Returns:
        The sum of the per-query ``metric_k`` values divided by the number
        of queries present in the evaluation result.
    """
    per_query = pytrec_eval.RelevanceEvaluator(y_true, {metric}).evaluate(y_pred)
    # Only the requested cut-off is kept from each query's result mapping.
    values = [query_result[metric_k] for query_result in per_query.values()]
    return np.sum(values) / len(values)

In [4]:
# Dataset generators
def convert_to_binary(y_true):
    """Keep only the first column of every array in ``y_true``.

    Args:
        y_true: Iterable of 2-D indexable arrays (each element must support
            ``y[:, 0]``).

    Returns:
        ``utils.to_typed_list`` applied to the list of first columns, or the
        plain Python list of first columns if that conversion fails.
    """
    first_columns = [y[:, 0] for y in y_true]
    try:
        return utils.to_typed_list(first_columns)
    except Exception:
        # Best-effort fallback: the typed-list conversion is optional, so any
        # ordinary failure degrades to a plain list. KeyboardInterrupt and
        # SystemExit are deliberately NOT swallowed.
        return first_columns


def generate_y_true_dict(query_count, max_relevant_per_query):
    """Build random relevance judgements as nested dictionaries.

    Args:
        query_count: Number of queries to generate (keys ``"q0"``, ``"q1"``, ...).
        max_relevant_per_query: Exclusive upper bound on the number of judged
            documents per query; the count is drawn from
            ``range(1, max_relevant_per_query)``, so it must be at least 2
            whenever ``query_count`` is positive.

    Returns:
        ``{query_id: {doc_id: label}}`` where ``"d0"`` is always labelled 1
        and every further document is labelled 0 or 1 at random.
    """

    def random_judgements():
        # How many documents this query gets a judgement for.
        doc_count = random.choice(range(1, max_relevant_per_query))
        # The first document is always relevant, so no query is left
        # without at least one positive label.
        judgements = {"d{j}".format(j=0): 1}
        for doc_idx in range(1, doc_count):
            judgements["d{j}".format(j=doc_idx)] = random.choice([0, 1])
        return judgements

    return {
        "q{i}".format(i=query_idx): random_judgements()
        for query_idx in range(query_count)
    }


def generate_y_pred_dict(query_count, result_count):
    """Build random retrieval scores as nested dictionaries.

    Args:
        query_count: Number of queries to generate (keys ``"q0"``, ``"q1"``, ...).
        result_count: Number of scored documents per query
            (keys ``"d0"``, ``"d1"``, ...).

    Returns:
        ``{query_id: {doc_id: score}}`` with every score drawn from
        ``random.uniform(0.0, 1.0)``.
    """
    return {
        "q{i}".format(i=query_idx): {
            "d{j}".format(j=doc_idx): random.uniform(0.0, 1.0)
            for doc_idx in range(result_count)
        }
        for query_idx in range(query_count)
    }

In [5]:
def compare_ndcg(trec_eval_y_true, trec_eval_y_pred, metrics_eval_y_true, metrics_eval_y_pred, k):
    """Time NDCG@k in pytrec_eval and metrics_eval and report whether they agree.

    Prints three lines: the elapsed time of the pytrec_eval computation, the
    elapsed time of the (second) metrics_eval computation, and whether the
    two scores are equal within a relative tolerance of 1e-4.

    Args:
        trec_eval_y_true: Relevance judgements in the form expected by
            ``run_trec_function_cut``.
        trec_eval_y_pred: Run in the form expected by ``run_trec_function_cut``.
        metrics_eval_y_true: Judgements in the form expected by
            ``ranking_metrics.ndcg``.
        metrics_eval_y_pred: Predictions in the form expected by
            ``ranking_metrics.ndcg``.
        k: Cut-off; selects ``"ndcg_cut_<k>"`` on the pytrec_eval side and is
            passed as the third argument to ``ranking_metrics.ndcg``.

    Returns:
        None. Results are reported via ``print`` only.
    """
    # pytrec_eval ------------------------------------------------------------
    # perf_counter is used instead of time(): it is monotonic and has the
    # highest available resolution, which matters for sub-second intervals.
    start_time = perf_counter()
    trec_score = run_trec_function_cut(
        trec_eval_y_true, trec_eval_y_pred, "ndcg_cut", "ndcg_cut_" + str(k)
    )
    expired_time = perf_counter() - start_time
    print("pytrec_eval took: {expired_time}".format(expired_time=expired_time))

    # metrics_eval ------------------------------------------------------------
    # Untimed warm-up call so that one-off compilation cost is excluded from
    # the measurement below.
    ranking_metrics.ndcg(metrics_eval_y_true, metrics_eval_y_pred, k, None, binary=True)
    start_time = perf_counter()
    score = ranking_metrics.ndcg(metrics_eval_y_true, metrics_eval_y_pred, k, None, binary=True)
    expired_time = perf_counter() - start_time
    print("metrics_eval took: {expired_time}".format(expired_time=expired_time))

    # Check equality -----------------------------------------------------------
    print("equality: {x}".format(x=isclose(trec_score, score, rel_tol=0.0001)))

In [6]:
# Benchmark driver: build one synthetic dataset, convert it to both libraries'
# input formats, then time and compare NDCG@k.

# Fix the seed of Python's `random` module, which both dataset generators
# draw from, so the generated data is the same on every run.
random.seed(666)

query_count = 100000  # number of queries in the synthetic dataset
result_count = 100  # scored documents generated per query
max_relevant_per_query = 100  # exclusive upper bound on judged documents per query
k = 10  # NDCG cut-off

# trec_eval uses dictionaries
# (judgements are generated before predictions; this order determines which
# random draws each of them receives)
trec_eval_y_true = generate_y_true_dict(query_count, max_relevant_per_query)
trec_eval_y_pred = generate_y_pred_dict(query_count, result_count)

# metrics_eval uses numpy arrays and Numba data structures
metrics_eval_y_true = utils.convert_trec_y_true(trec_eval_y_true)
metrics_eval_y_pred = utils.convert_trec_y_pred(trec_eval_y_pred)

# Prints both timings and whether the two scores agree.
compare_ndcg(trec_eval_y_true, trec_eval_y_pred, metrics_eval_y_true, metrics_eval_y_pred, k)

pytrec_eval took: 3.180417060852051
metrics_eval took: 0.05454683303833008
equality: True
