In [1]:
import numpy as np
import random
# Metric names to keep, spelled as they appear in the first column of the
# .qeval files: read_trec_qeval drops every metric not listed here, and the
# comparison loop at the bottom of the notebook iterates over this list.
use_metrics = ["map", "P_5", "P_10", "P_20", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20"]


In [2]:
def randomization_test(l_target, l_base, total_test=5000):
    """Estimate a one-sided paired randomization-test p-value.

    The observed statistic is ``mean(l_target) - mean(l_base)``. Each trial
    asks ``random_swap`` for randomized copies of the two lists and recomputes
    the same statistic on them.

    Args:
        l_target: per-item scores of the system under test.
        l_base: per-item scores of the baseline, aligned index-by-index with
            ``l_target``.
        total_test: number of random trials to draw. Defaults to 5000, the
            value that was previously hard-coded.

    Returns:
        float: fraction of trials whose mean difference is strictly greater
        than the observed one. A small value means few randomizations beat
        the observed improvement of ``l_target`` over ``l_base``; a value
        near 1.0 means the observed difference sits at the low end of the
        randomized distribution (``l_target`` is not better).

    Raises:
        ValueError: if the two lists differ in length, are empty, or if
            ``total_test`` is not positive.
    """
    n_items = len(l_target)
    # The test is paired: scores are exchanged position by position, so the
    # two lists must line up exactly.
    if n_items != len(l_base):
        raise ValueError(
            "l_target and l_base must have the same length (got %d and %d)"
            % (n_items, len(l_base))
        )
    if n_items == 0:
        raise ValueError("cannot run a randomization test on empty score lists")
    if total_test <= 0:
        raise ValueError("total_test must be a positive number of trials")

    diff = sum(l_target) / float(n_items) - sum(l_base) / float(n_items)
    cnt = 0
    for _ in range(total_test):
        l_a, l_b = random_swap(l_target, l_base)
        this_diff = sum(l_a) / float(len(l_a)) - sum(l_b) / float(len(l_b))
        # Strict comparison: a trial that merely ties the observed difference
        # is not counted.
        if this_diff > diff:
            cnt += 1
    return cnt / float(total_test)


def random_swap(l_target, l_base):
    """Return copies of both lists with a coin-flip exchange at each index.

    For every position of ``l_target``, ``random.randint(0, 1)`` decides
    whether the two copies trade their elements at that position. The input
    sequences themselves are never modified.

    Returns:
        tuple: ``(copy_of_target, copy_of_base)`` after the random exchanges.
    """
    shuffled_target = list(l_target)
    shuffled_base = list(l_base)

    for pos in range(len(l_target)):
        if not random.randint(0, 1):
            continue  # coin came up 0: leave this pair as it is
        kept = shuffled_target[pos]
        shuffled_target[pos] = shuffled_base[pos]
        shuffled_base[pos] = kept
    return shuffled_target, shuffled_base


def win_tie_loss(l_target, l_base):
    """Count positions where the target wins, ties or loses against the base.

    Scores are rounded to three decimals before being compared, so
    differences below that precision count as ties.

    Returns:
        tuple: ``(win, tie, loss)`` as integers.
    """
    assert len(l_target) == len(l_base)

    wins = ties = losses = 0
    for raw_target, raw_base in zip(l_target, l_base):
        target_score = round(raw_target, 3)
        base_score = round(raw_base, 3)
        # Three explicit comparisons (no bare else) so a pair that is neither
        # greater, equal nor smaller is left uncounted and trips the check below.
        if target_score > base_score:
            wins += 1
        elif target_score == base_score:
            ties += 1
        elif target_score < base_score:
            losses += 1

    assert wins + ties + losses == len(l_target)
    return wins, ties, losses


In [3]:
def read_trec_qeval(file_path):
    """Parse a tab-separated per-query evaluation file.

    Every non-blank line must hold exactly three tab-separated fields:
    ``metric``, ``qid`` and ``score``. Rows whose metric is not listed in the
    module-level ``use_metrics`` are ignored.

    Args:
        file_path: path of the evaluation file to read.

    Returns:
        tuple: ``(per_query_results, average_results)`` where
        ``per_query_results[qid][metric]`` is the float score of one query
        (``qid`` converted to ``int``) and ``average_results[metric]`` is the
        float score taken from rows whose qid field contains ``'all'``.

    Raises:
        ValueError: if a non-blank line does not split into three fields, or
            a kept row has a non-numeric score or a non-integer qid.
    """
    per_query_results = {}
    average_results = {}
    # Context manager guarantees the handle is closed, even when a line
    # fails to parse.
    with open(file_path) as qeval_file:
        for line in qeval_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing the unpack
            metric, qid, score = line.split('\t')
            metric = metric.strip()  # the metric column may be space-padded
            if metric not in use_metrics:
                continue
            if 'all' in qid:
                average_results[metric] = float(score)
            else:
                per_query_results.setdefault(int(qid), {})[metric] = float(score)
    return per_query_results, average_results
        
    

In [25]:
# Run 1: per-query and averaged scores parsed from the
# bing5120k-title-first200 evaluation file.
# NOTE(review): the absolute /tmp2 path is machine-specific; consider a
# configurable data directory.
per_query_results_run1, average_results_run1 = read_trec_qeval("/tmp2/results/clueweb09/bing5120k-title-first200.trec.qeval")

In [26]:
# Run 2: per-query and averaged scores parsed from the
# bing2560k-title-first200 evaluation file.
# NOTE(review): the absolute /tmp2 path is machine-specific; consider a
# configurable data directory.
per_query_results_run2, average_results_run2 = read_trec_qeval("/tmp2/results/clueweb09/bing2560k-title-first200.trec.qeval")

In [27]:
# Keep only the query ids scored in both runs so the two score lists built
# from this set line up pair by pair.
qids = set(per_query_results_run1.keys()) & set(per_query_results_run2.keys())

In [28]:

# Compare run 1 (target) against run 2 (baseline) on every metric, using only
# the queries present in both runs.
# Printed columns: metric, mean(run1), mean(run2), p, win, tie, loss.
# p is the one-sided estimate from randomization_test: the fraction of random
# trials whose mean difference exceeds the observed run1 - run2 difference,
# so values near 1.0 mean run 1 is not better than run 2.
random.seed(0)  # fixed seed so the randomized p-values are reproducible on re-run
for metric in use_metrics:
    l_scores_run1 = [per_query_results_run1[qid][metric] for qid in qids]
    l_scores_run2 = [per_query_results_run2[qid][metric] for qid in qids]

    p = randomization_test(l_scores_run1, l_scores_run2)
    win, tie, loss = win_tie_loss(l_scores_run1, l_scores_run2)

    print(metric, round(np.mean(l_scores_run1),3), round(np.mean(l_scores_run2),3), p, win, tie, loss)


map 0.18 0.188 1.0 64 31 103
P_5 0.433 0.466 0.9918 26 132 40
P_10 0.41 0.433 0.9936 40 106 52
P_20 0.357 0.372 0.9954 34 109 55
ndcg_cut_5 0.319 0.349 0.9948 37 97 64
ndcg_cut_10 0.317 0.342 0.9958 49 67 82
ndcg_cut_20 0.314 0.333 1.0 57 43 98
