# Initialising the code

In [12]:
import csv
from pathlib import Path
import numpy as np
from ranking import Ranking
from index import Mode, InMemoryIndex
from encoder import TCTColBERTQueryEncoder as TCTColBERTQueryEncoderFF
# from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from ir_measures import read_trec_qrels, calc_aggregate, nDCG, RR, P, R, AP, MAP
import sys
from tqdm import tqdm

# define values of eta to test 
etas = [10,20,30,40,50,60,70,80,90,100,500]

# define values of alpha to test
alphas = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,-1.0]

In [13]:
# in_memory_index = InMemoryIndex(mode=Mode.PASSAGE, encoder=TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco"))
in_memory_index = InMemoryIndex.from_disk('../testing/ffindex_passage_2019_2020.pkl')
in_memory_index.encoder = TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco")

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
sparse_ranking_2019 = Ranking.from_file(Path("../testing/msmarco-passage-test2019-sparse10000.txt"))
sparse_ranking_2019.cut(5000)
sparse_ranking_2019.name = "sparse_ranking_2019"
sparse_ranking_2020 = Ranking.from_file(Path("../testing/msmarco-passage-test2020-sparse10000.txt"))
sparse_ranking_2020.cut(5000)
sparse_ranking_2020.name = "sparse_ranking_2020"
all_ids = set.union(
    *[set(sparse_ranking_2019[q_id].keys()) for q_id in sparse_ranking_2019.q_ids],
    *[set(sparse_ranking_2020[q_id].keys()) for q_id in sparse_ranking_2020.q_ids]
)
print(f"indexing {len(all_ids)} documents or passages")

indexing 440079 documents or passages


## Testing the implemented retrieval functions

In [15]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

alpha = 0.2
result = in_memory_index.get_scores(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    normalise=False
)
result2 = in_memory_index.get_scores_rrf(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
)
result3 = in_memory_index.get_scores_rrf(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    eta=60,
    eta2=10,
)
qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))
print(
    "BM25",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, sparse_ranking_2019.run)
)
print(
    f"Interpolation",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result[alpha].run)
)
print(
    f"RRF",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result2[alpha].run)
)
print(
    f"rrf with eta_lex = 60 and eta_sem = 10",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result3[alpha].run)
)

loaded 200 queries
Encoding queries for interpolation...


100%|██████████| 43/43 [00:01<00:00, 40.40it/s]


sparse_ranking_2019 None
Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 63.23it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.05it/s]


BM25 {RR(rel=2)@10: 0.7024178663713547, nDCG@10: 0.5058310024399072}
Interpolation {RR(rel=2)@10: 0.901937984496124, nDCG@10: 0.7158066715626034}
RRF {RR(rel=2)@10: 0.8775193798449613, nDCG@10: 0.6816417471378418}
rrf with eta_lex = 60 and eta_sem = 10 {RR(rel=2)@10: 0.8418604651162791, nDCG@10: 0.6989395186704049}


# Validation on 2019 set

## Validation on the convex combination with two types of normalisation

In [None]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019 = {
    'ccUnnormalized': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
    'ccNormalized_minimax': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
'ccNormalized_zscore': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
}


for i in tqdm(range(len(etas))):
    alpha = alphas[i]
    eta = etas[i]

    result = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="none"
    )
    result2 = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="minimax"
    )
    result3 = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="zscore"
    )
    qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))

    interpolationRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result[alpha].run)
    fscoreInterpolationRes = (2*interpolationRes[P(rel=2)@10]*interpolationRes[R(rel=2)@10])/(interpolationRes[P(rel=2)@10]+interpolationRes[R(rel=2)@10])

    interpolationMinMax = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result2[alpha].run)
    fscoreInterpolationMinMax = (2*interpolationMinMax[P(rel=2)@10]*interpolationMinMax[R(rel=2)@10])/(interpolationMinMax[P(rel=2)@10]+interpolationMinMax[R(rel=2)@10])

    interpolationZscore = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result3[alpha].run)
    fscoreInterpolationZscore = (2*interpolationZscore[P(rel=2)@10]*interpolationZscore[R(rel=2)@10])/(interpolationZscore[P(rel=2)@10]+interpolationZscore[R(rel=2)@10])

    # update dictionary of best parameters per method
    if interpolationRes[P(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['precision']['value']:
        bestHyperparams2019['ccUnnormalized']['precision']['value'] = interpolationRes[P(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['precision']['alpha'] = alpha

    if interpolationRes[R(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['recall']['value']:
        bestHyperparams2019['ccUnnormalized']['recall']['value'] = interpolationRes[R(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha

    if interpolationRes[nDCG@10] > bestHyperparams2019['ccUnnormalized']['ndcg']['value']:
        bestHyperparams2019['ccUnnormalized']['ndcg']['value'] = interpolationRes[nDCG@10]
        bestHyperparams2019['ccUnnormalized']['ndcg']['alpha'] = alpha

    if interpolationRes[RR(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['rr']['value']:
        bestHyperparams2019['ccUnnormalized']['rr']['value'] = interpolationRes[RR(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['rr']['alpha'] = alpha

    if interpolationRes[AP(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['map']['value']:
        bestHyperparams2019['ccUnnormalized']['map']['value'] = interpolationRes[AP(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['map']['alpha'] = alpha
    
    if fscoreInterpolationRes > bestHyperparams2019['ccUnnormalized']['fscore']['value']:
        bestHyperparams2019['ccUnnormalized']['fscore']['value'] = fscoreInterpolationRes
        bestHyperparams2019['ccUnnormalized']['fscore']['alpha'] = alpha
    
    if interpolationMinMax[P(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['precision']['value']:
        bestHyperparams2019['ccNormalized_minimax']['precision']['value'] = interpolationMinMax[P(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['precision']['alpha'] = alpha

    if interpolationMinMax[R(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['recall']['value']:
        bestHyperparams2019['ccNormalized_minimax']['recall']['value'] = interpolationMinMax[R(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['recall']['alpha'] = alpha

    if interpolationMinMax[nDCG@10] > bestHyperparams2019['ccNormalized_minimax']['ndcg']['value']:
        bestHyperparams2019['ccNormalized_minimax']['ndcg']['value'] = interpolationMinMax[nDCG@10]
        bestHyperparams2019['ccNormalized_minimax']['ndcg']['alpha'] = alpha

    if interpolationMinMax[RR(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['rr']['value']:
        bestHyperparams2019['ccNormalized_minimax']['rr']['value'] = interpolationMinMax[RR(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['rr']['alpha'] = alpha
    
    if fscoreInterpolationMinMax > bestHyperparams2019['ccNormalized_minimax']['fscore']['value']:
        bestHyperparams2019['ccNormalized_minimax']['fscore']['value'] = fscoreInterpolationMinMax
        bestHyperparams2019['ccNormalized_minimax']['fscore']['alpha'] = alpha
    
    if interpolationMinMax[AP(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['map']['value']:
        bestHyperparams2019['ccNormalized_minimax']['map']['value'] = interpolationMinMax[AP(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['map']['alpha'] = alpha

    if interpolationMinMax[P(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['precision']['value']:
        bestHyperparams2019['ccNormalized_zscore']['precision']['value'] = interpolationZscore[P(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['precision']['alpha'] = alpha

    if interpolationZscore[R(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['recall']['value']:
        bestHyperparams2019['ccNormalized_zscore']['recall']['value'] =interpolationZscore[R(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['recall']['alpha'] = alpha

    if interpolationZscore[nDCG@10] > bestHyperparams2019['ccNormalized_zscore']['ndcg']['value']:
        bestHyperparams2019['ccNormalized_zscore']['ndcg']['value'] =interpolationZscore[nDCG@10]
        bestHyperparams2019['ccNormalized_zscore']['ndcg']['alpha'] = alpha

    if interpolationZscore[RR(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['rr']['value']:
        bestHyperparams2019['ccNormalized_zscore']['rr']['value'] =interpolationZscore[RR(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['rr']['alpha'] = alpha
    
    if fscoreInterpolationZscore > bestHyperparams2019['ccNormalized_zscore']['fscore']['value']:
        bestHyperparams2019['ccNormalized_zscore']['fscore']['value'] = fscoreInterpolationZscore
        bestHyperparams2019['ccNormalized_zscore']['fscore']['alpha'] = alpha
    
    if interpolationZscore[AP(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['map']['value']:
        bestHyperparams2019['ccNormalized_zscore']['map']['value'] =interpolationZscore[AP(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['map']['alpha'] = alpha

In [16]:
print(bestHyperparams2019['ccUnnormalized'])
print(bestHyperparams2019['ccNormalized_minimax'])
print(bestHyperparams2019['ccNormalized_zscore'])

{'precision': {'value': 0.6279069767441862, 'alpha': 0.1}, 'recall': {'value': 0.27300662836997697, 'alpha': 0.2}, 'fscore': {'value': 0.37969451880260424, 'alpha': 0.2}, 'rr': {'value': 0.901937984496124, 'alpha': 0.2}, 'ndcg': {'value': 0.7165204423861107, 'alpha': 0.1}, 'map': {'value': 0.21189662633253067, 'alpha': 0.1}}
{'precision': {'value': 0.6348837209302324, 'alpha': 0.2}, 'recall': {'value': 0.27191141589433454, 'alpha': 0.1}, 'fscore': {'value': 0.38033293287532133, 'alpha': 0.1}, 'rr': {'value': 0.9085271317829459, 'alpha': 0.3}, 'ndcg': {'value': 0.7229942609361281, 'alpha': 0.2}, 'map': {'value': 0.2130699577564767, 'alpha': 0.2}}
{'precision': {'value': 0.6372093023255815, 'alpha': 0.1}, 'recall': {'value': 0.2730182100148534, 'alpha': 0.1}, 'fscore': {'value': 0.38225551473042546, 'alpha': 0.1}, 'rr': {'value': 0.8988372093023256, 'alpha': 0.3}, 'ndcg': {'value': 0.7209484744376639, 'alpha': 0.2}, 'map': {'value': 0.2155405983113744, 'alpha': 0.1}}


## Validation of the RRF


In [None]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019_rrf = {
    'rrf': {'precision': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'recall': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'fscore': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'rr': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'ndcg': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'map': {'value': -1, 'eta1': 0, 'eta2': 0}},
}

explored = set()

for i in tqdm(range(len(etas))):
    for j in range(len(etas)):
        if (etas[i],etas[j]) in explored:
            continue
        else:
            explored.add((etas[i],etas[j]))
            eta = etas[i]

        result2 = in_memory_index.get_scores_rrf(
            sparse_ranking_2019,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = etas[i],
            eta2 = etas[j],
        )

        qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))

        rrfRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result2[alpha].run)
        fscoreRRRes = (2*rrfRes[P(rel=2)@10]*rrfRes[R(rel=2)@10])/(rrfRes[P(rel=2)@10]+rrfRes[R(rel=2)@10])


        if rrfRes[P(rel=2)@10] > bestHyperparams2019_rrf['rrf']['precision']['value']:
            bestHyperparams2019_rrf['rrf']['precision']['value'] = rrfRes[P(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['precision']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['precision']['eta2'] = etas[j]


        if rrfRes[R(rel=2)@10] > bestHyperparams2019_rrf['rrf']['recall']['value']:
            bestHyperparams2019_rrf['rrf']['recall']['value'] = rrfRes[R(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['recall']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['recall']['eta2'] = etas[j]


        if rrfRes[nDCG@10] > bestHyperparams2019_rrf['rrf']['ndcg']['value']:
            bestHyperparams2019_rrf['rrf']['ndcg']['value'] = rrfRes[nDCG@10]
            bestHyperparams2019_rrf['rrf']['ndcg']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['ndcg']['eta2'] = etas[j]

        if rrfRes[RR(rel=2)@10] > bestHyperparams2019_rrf['rrf']['rr']['value']:
            bestHyperparams2019_rrf['rrf']['rr']['value'] = rrfRes[RR(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['rr']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['rr']['eta2'] = etas[j]

        if fscoreRRRes > bestHyperparams2019_rrf['rrf']['fscore']['value']:
            bestHyperparams2019_rrf['rrf']['fscore']['value'] = fscoreRRRes
            bestHyperparams2019_rrf['rrf']['fscore']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['fscore']['eta2'] = etas[j]

        if rrfRes[AP(rel=2)@10] > bestHyperparams2019_rrf['rrf']['map']['value']:
            bestHyperparams2019_rrf['rrf']['map']['value'] = rrfRes[AP(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['map']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['map']['eta2'] = etas[j]

In [17]:
for key in bestHyperparams2019_rrf['rrf']:
    print(key, bestHyperparams2019_rrf['rrf'][key])

precision {'value': 0.6348837209302326, 'eta1': 40, 'eta2': 10}
recall {'value': 0.26976473421199393, 'eta1': 40, 'eta2': 10}
fscore {'value': 0.37864263683585114, 'eta1': 40, 'eta2': 10}
rr {'value': 0.897674418604651, 'eta1': 20, 'eta2': 20}
ndcg {'value': 0.7151248552748896, 'eta1': 80, 'eta2': 40}
map {'value': 0.20941181623477223, 'eta1': 100, 'eta2': 10}


# Testing hyperparameters found from validation on 2020 set

## Testing the convex combination parameters


In [19]:
with open(
    "../testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")


for key in bestHyperparams2019.keys():
    print('\n'+'TESTING RESULTS FOR BEST VALIDATION METRICS FOR {}'.format(key))
    for metric in bestHyperparams2019[key].keys():
        # print(f"best {metric} for {key} is {bestHyperparams2019[key][metric]['value']} with alpha {bestHyperparams2019[key][metric]['alpha']}")
    
        alpha = bestHyperparams2019[key][metric]['alpha']

        result = in_memory_index.get_scores(
            sparse_ranking_2020,
            queries,
            alpha=alpha*1.0,
            cutoff=None,
            early_stopping=False,
            normalise= False if key == 'ccUnnormalized' else True
        )
        qrels = list(read_trec_qrels("../testing/2020qrels-pass.txt"))
        print('using best alpha={} for {}'.format(alpha, metric))
        interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result[alpha].run)
        fscoreInterpolation2 = (2*interpolation2[P(rel=2)@10]*interpolation2[R(rel=2)@10])/(interpolation2[P(rel=2)@10]+interpolation2[R(rel=2)@10])
        print(
            f"Interpolation",
            interpolation2,
            'f-score:{}'.format(fscoreInterpolation2)
        )

loaded 200 queries

TESTING RESULTS FOR BEST VALIDATION METRICS FOR ccUnnormalized
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 66.57it/s]


sparse_ranking_2020 None
using best alpha=0.1 for precision
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 54.56it/s]


sparse_ranking_2020 None
using best alpha=0.2 for recall
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 60.35it/s]


sparse_ranking_2020 None
using best alpha=0.2 for fscore
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 59.17it/s]


sparse_ranking_2020 None
using best alpha=0.2 for rr
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:01<00:00, 51.02it/s]


sparse_ranking_2020 None
using best alpha=0.1 for ndcg
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 60.29it/s]


sparse_ranking_2020 None
using best alpha=0.1 for map
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673

TESTING RESULTS FOR BEST VALIDATION METRICS FOR ccNormalized_minimax
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 65.00it/s]


sparse_ranking_2020 None
using best alpha=0.2 for precision
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:01<00:00, 49.60it/s]


sparse_ranking_2020 None
using best alpha=0.1 for recall
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 77.96it/s]


sparse_ranking_2020 None
using best alpha=0.1 for fscore
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 95.27it/s] 


sparse_ranking_2020 None
using best alpha=0.3 for rr
Interpolation {R(rel=2)@10: 0.35446072456610184, P(rel=2)@10: 0.4870370370370371, AP(rel=2)@10: 0.2666838650083592, RR(rel=2)@10: 0.7861111111111113, nDCG@10: 0.6418605413650978} f-score:0.41030531254126534
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 63.39it/s]


sparse_ranking_2020 None
using best alpha=0.2 for ndcg
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 56.72it/s]


sparse_ranking_2020 None
using best alpha=0.2 for map
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391

TESTING RESULTS FOR BEST VALIDATION METRICS FOR ccNormalized_zscore
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 55.98it/s]


sparse_ranking_2020 None
using best alpha=0.1 for precision
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 78.38it/s]


sparse_ranking_2020 None
using best alpha=0.1 for recall
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 57.28it/s]


sparse_ranking_2020 None
using best alpha=0.1 for fscore
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 70.96it/s]


sparse_ranking_2020 None
using best alpha=0.3 for rr
Interpolation {R(rel=2)@10: 0.35446072456610184, P(rel=2)@10: 0.4870370370370371, AP(rel=2)@10: 0.2666838650083592, RR(rel=2)@10: 0.7861111111111113, nDCG@10: 0.6418605413650978} f-score:0.41030531254126534
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 63.97it/s]


sparse_ranking_2020 None
using best alpha=0.2 for ndcg
Interpolation {R(rel=2)@10: 0.373499948435731, P(rel=2)@10: 0.5203703703703703, AP(rel=2)@10: 0.28764618362522143, RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268857} f-score:0.4348691357386391
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 62.19it/s]


sparse_ranking_2020 None
using best alpha=0.1 for map
Interpolation {R(rel=2)@10: 0.39120794049318314, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.29804239556403656, RR(rel=2)@10: 0.8148148148148148, nDCG@10: 0.6939603772252908} f-score:0.4513465019601673


## Testing RRF parameters

In [18]:
with open(
    "../testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")


for key in bestHyperparams2019_rrf.keys():
    print('\n'+'TESTING RESULTS FOR BEST VALIDATION METRICS FOR {}'.format('normalized' if key == 'ccNormalized' else 'unnormalized'))
    for metric in bestHyperparams2019_rrf[key].keys():
        # print(f"best {metric} for {key} is {bestHyperparams2019[key][metric]['value']} with alpha {bestHyperparams2019[key][metric]['alpha']}")
        eta1 = bestHyperparams2019_rrf[key][metric]['eta1']
        eta2 = bestHyperparams2019_rrf[key][metric]['eta2']
        alpha = 0.0
        result = in_memory_index.get_scores_rrf(
            sparse_ranking_2020,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = eta1,
            eta2 = eta2
        )
        qrels = list(read_trec_qrels("../testing/2020qrels-pass.txt"))
        print('using best eta1={} and eta2={} for {}'.format(eta1, eta2, metric))
        interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result[alpha].run)
        fscoreInterpolation2 = (2*interpolation2[P(rel=2)@10]*interpolation2[R(rel=2)@10])/(interpolation2[P(rel=2)@10]+interpolation2[R(rel=2)@10])
        print(
            f"RRF",
            interpolation2,
            'f-score:{}'.format(fscoreInterpolation2)
        )

loaded 200 queries

TESTING RESULTS FOR BEST VALIDATION METRICS FOR unnormalized
Encoding queries for RRF...


100%|██████████| 54/54 [00:01<00:00, 36.51it/s]


using best eta1=40 and eta2=10 for precision
RRF {R(rel=2)@10: 0.3702476168178881, P(rel=2)@10: 0.5037037037037038, AP(rel=2)@10: 0.2851841737423464, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705971} f-score:0.4267860039786561
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 59.93it/s]


using best eta1=40 and eta2=10 for recall
RRF {R(rel=2)@10: 0.3702476168178881, P(rel=2)@10: 0.5037037037037038, AP(rel=2)@10: 0.2851841737423464, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705971} f-score:0.4267860039786561
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 62.24it/s]


using best eta1=40 and eta2=10 for fscore
RRF {R(rel=2)@10: 0.3702476168178881, P(rel=2)@10: 0.5037037037037038, AP(rel=2)@10: 0.2851841737423464, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705971} f-score:0.4267860039786561
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 67.40it/s] 


using best eta1=20 and eta2=20 for rr
RRF {R(rel=2)@10: 0.3525888949678007, P(rel=2)@10: 0.4814814814814814, AP(rel=2)@10: 0.26808911791402357, RR(rel=2)@10: 0.809104938271605, nDCG@10: 0.6395099489434912} f-score:0.4070760173157598
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 58.52it/s]


using best eta1=80 and eta2=40 for ndcg
RRF {R(rel=2)@10: 0.3707075753764424, P(rel=2)@10: 0.5037037037037037, AP(rel=2)@10: 0.28120310355515377, RR(rel=2)@10: 0.807716049382716, nDCG@10: 0.6639475222194774} f-score:0.4270914229390197
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 63.86it/s]


using best eta1=100 and eta2=10 for map
RRF {R(rel=2)@10: 0.368343713885258, P(rel=2)@10: 0.5111111111111111, AP(rel=2)@10: 0.29184181741782134, RR(rel=2)@10: 0.8154320987654321, nDCG@10: 0.6829413068342426} f-score:0.4281392506442038
