# Initialising the code

In [2]:
import csv
from pathlib import Path
import numpy as np
from ranking import Ranking
from index import Mode, InMemoryIndex
from encoder import TCTColBERTQueryEncoder as TCTColBERTQueryEncoderFF
# from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from ir_measures import read_trec_qrels, calc_aggregate, nDCG, RR, P, R, AP, MAP
import sys
from tqdm import tqdm

# define values of eta to test
etas = [10,20,30,40,50,60,70,80,90,100,500]

# define values of alpha to test
alphas = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,-1.0]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# in_memory_index = InMemoryIndex(mode=Mode.PASSAGE, encoder=TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco"))
in_memory_index = InMemoryIndex.from_disk('../testing/ffindex_passage_2019_2020.pkl')
in_memory_index.encoder = TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco")

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
sparse_ranking_2019 = Ranking.from_file(Path("../testing/msmarco-passage-test2019-sparse10000.txt"))
sparse_ranking_2019.cut(5000)
sparse_ranking_2019.name = "sparse_ranking_2019"
sparse_ranking_2020 = Ranking.from_file(Path("../testing/msmarco-passage-test2020-sparse10000.txt"))
sparse_ranking_2020.cut(5000)
sparse_ranking_2020.name = "sparse_ranking_2020"
all_ids = set.union(
    *[set(sparse_ranking_2019[q_id].keys()) for q_id in sparse_ranking_2019.q_ids],
    *[set(sparse_ranking_2020[q_id].keys()) for q_id in sparse_ranking_2020.q_ids]
)
print(f"indexing {len(all_ids)} documents or passages")

indexing 440079 documents or passages


In [5]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

alpha = 0.2
result = in_memory_index.get_scores(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    normalise=False
)
result2 = in_memory_index.get_scores_rrf(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
)
result3 = in_memory_index.get_scores_rrf(
    sparse_ranking_2019,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    eta=60,
    eta2=10,
)
qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))
print(
    "BM25",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, sparse_ranking_2019.run)
)
print(
    f"Interpolation",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result[alpha].run)
)
print(
    f"RRF",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result2[alpha].run)
)
print(
    f"rrf with eta_lex = 60 and eta_sem = 10",
    calc_aggregate([nDCG@10, RR(rel=2)@10], qrels, result3[alpha].run)
)

loaded 200 queries
Encoding queries for interpolation...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 43/43 [00:01<00:00, 37.61it/s]


sparse_ranking_2019 None
Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.21it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.14it/s]


BM25 {RR(rel=2)@10: 0.7024178663713547, nDCG@10: 0.5058310024399072}
Interpolation {RR(rel=2)@10: 0.9019379844961242, nDCG@10: 0.7158066715626032}
RRF {RR(rel=2)@10: 0.8775193798449613, nDCG@10: 0.6816417471378418}
rrf with eta_lex = 60 and eta_sem = 10 {RR(rel=2)@10: 0.8418604651162791, nDCG@10: 0.6989395186704048}


# Validation on 2019 set

## Validation on the convex combination with two types of normalisation

In [9]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019 = {
    'ccUnnormalized': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
    'ccNormalized_minimax': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
'ccNormalized_zscore': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 
'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}, 'map': {'value': -1, 'alpha': 0}},
}


for i in tqdm(range(len(etas))):
    alpha = alphas[i]
    eta = etas[i]

    result = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="none"
    )
    result2 = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="minimax"
    )
    result3 = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise="zscore"
    )
    qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))

    interpolationRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result[alpha].run)
    fscoreInterpolationRes = (2*interpolationRes[P(rel=2)@10]*interpolationRes[R(rel=2)@10])/(interpolationRes[P(rel=2)@10]+interpolationRes[R(rel=2)@10])

    interpolationMinMax = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result2[alpha].run)
    fscoreInterpolationMinMax = (2*interpolationMinMax[P(rel=2)@10]*interpolationMinMax[R(rel=2)@10])/(interpolationMinMax[P(rel=2)@10]+interpolationMinMax[R(rel=2)@10])

    interpolationZscore = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10, AP(rel=2)@10], qrels, result3[alpha].run)
    fscoreInterpolationZscore = (2*interpolationZscore[P(rel=2)@10]*interpolationZscore[R(rel=2)@10])/(interpolationZscore[P(rel=2)@10]+interpolationZscore[R(rel=2)@10])

    # update dictionary of best parameters per method
    if interpolationRes[P(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['precision']['value']:
        bestHyperparams2019['ccUnnormalized']['precision']['value'] = interpolationRes[P(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['precision']['alpha'] = alpha

    if interpolationRes[R(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['recall']['value']:
        bestHyperparams2019['ccUnnormalized']['recall']['value'] = interpolationRes[R(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha

    if interpolationRes[nDCG@10] > bestHyperparams2019['ccUnnormalized']['ndcg']['value']:
        bestHyperparams2019['ccUnnormalized']['ndcg']['value'] = interpolationRes[nDCG@10]
        bestHyperparams2019['ccUnnormalized']['ndcg']['alpha'] = alpha

    if interpolationRes[RR(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['rr']['value']:
        bestHyperparams2019['ccUnnormalized']['rr']['value'] = interpolationRes[RR(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['rr']['alpha'] = alpha

    if interpolationRes[AP(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['map']['value']:
        bestHyperparams2019['ccUnnormalized']['map']['value'] = interpolationRes[AP(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['map']['alpha'] = alpha
    
    if fscoreInterpolationRes > bestHyperparams2019['ccUnnormalized']['fscore']['value']:
        bestHyperparams2019['ccUnnormalized']['fscore']['value'] = fscoreInterpolationRes
        bestHyperparams2019['ccUnnormalized']['fscore']['alpha'] = alpha
    
    if interpolationMinMax[P(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['precision']['value']:
        bestHyperparams2019['ccNormalized_minimax']['precision']['value'] = interpolationMinMax[P(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['precision']['alpha'] = alpha

    if interpolationMinMax[R(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['recall']['value']:
        bestHyperparams2019['ccNormalized_minimax']['recall']['value'] = interpolationMinMax[R(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['recall']['alpha'] = alpha

    if interpolationMinMax[nDCG@10] > bestHyperparams2019['ccNormalized_minimax']['ndcg']['value']:
        bestHyperparams2019['ccNormalized_minimax']['ndcg']['value'] = interpolationMinMax[nDCG@10]
        bestHyperparams2019['ccNormalized_minimax']['ndcg']['alpha'] = alpha

    if interpolationMinMax[RR(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['rr']['value']:
        bestHyperparams2019['ccNormalized_minimax']['rr']['value'] = interpolationMinMax[RR(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['rr']['alpha'] = alpha
    
    if fscoreInterpolationMinMax > bestHyperparams2019['ccNormalized_minimax']['fscore']['value']:
        bestHyperparams2019['ccNormalized_minimax']['fscore']['value'] = fscoreInterpolationMinMax
        bestHyperparams2019['ccNormalized_minimax']['fscore']['alpha'] = alpha
    
    if interpolationMinMax[AP(rel=2)@10] > bestHyperparams2019['ccNormalized_minimax']['map']['value']:
        bestHyperparams2019['ccNormalized_minimax']['map']['value'] = interpolationMinMax[AP(rel=2)@10]
        bestHyperparams2019['ccNormalized_minimax']['map']['alpha'] = alpha

    if interpolationMinMax[P(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['precision']['value']:
        bestHyperparams2019['ccNormalized_zscore']['precision']['value'] = interpolationZscore[P(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['precision']['alpha'] = alpha

    if interpolationZscore[R(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['recall']['value']:
        bestHyperparams2019['ccNormalized_zscore']['recall']['value'] =interpolationZscore[R(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['recall']['alpha'] = alpha

    if interpolationZscore[nDCG@10] > bestHyperparams2019['ccNormalized_zscore']['ndcg']['value']:
        bestHyperparams2019['ccNormalized_zscore']['ndcg']['value'] =interpolationZscore[nDCG@10]
        bestHyperparams2019['ccNormalized_zscore']['ndcg']['alpha'] = alpha

    if interpolationZscore[RR(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['rr']['value']:
        bestHyperparams2019['ccNormalized_zscore']['rr']['value'] =interpolationZscore[RR(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['rr']['alpha'] = alpha
    
    if fscoreInterpolationZscore > bestHyperparams2019['ccNormalized_zscore']['fscore']['value']:
        bestHyperparams2019['ccNormalized_zscore']['fscore']['value'] = fscoreInterpolationZscore
        bestHyperparams2019['ccNormalized_zscore']['fscore']['alpha'] = alpha
    
    if interpolationZscore[AP(rel=2)@10] > bestHyperparams2019['ccNormalized_zscore']['map']['value']:
        bestHyperparams2019['ccNormalized_zscore']['map']['value'] =interpolationZscore[AP(rel=2)@10]
        bestHyperparams2019['ccNormalized_zscore']['map']['alpha'] = alpha


loaded 200 queries


  0%|          | 0/11 [00:00<?, ?it/s]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 57.43it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 67.36it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 78.01it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


  9%|▉         | 1/11 [00:06<01:07,  6.78s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 85.62it/s] 


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 67.83it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 79.95it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 18%|█▊        | 2/11 [00:12<00:57,  6.39s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 61.67it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 85.66it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 78.63it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 27%|██▋       | 3/11 [00:19<00:50,  6.31s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 70.11it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 72.68it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 74.76it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 36%|███▋      | 4/11 [00:25<00:43,  6.25s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 67.00it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 66.09it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 79.47it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 45%|████▌     | 5/11 [00:31<00:37,  6.32s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 86.95it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 50.18it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 61.61it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 55%|█████▍    | 6/11 [00:38<00:31,  6.40s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 75.35it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 60.46it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 48.14it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 64%|██████▎   | 7/11 [00:44<00:25,  6.47s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 49.05it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 82.15it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 57.78it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 73%|███████▎  | 8/11 [00:51<00:19,  6.53s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 80.57it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 73.30it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 83.10it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 82%|████████▏ | 9/11 [00:57<00:12,  6.44s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 70.98it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 76.20it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 64.63it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


 91%|█████████ | 10/11 [01:04<00:06,  6.39s/it]

Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 61.65it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 58.40it/s]


minimax normalization done
minimax normalization done
sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 72.05it/s]


z-score normalization done
z-score normalization done
sparse_ranking_2019 None


100%|██████████| 11/11 [01:10<00:00,  6.41s/it]


In [10]:
print(bestHyperparams2019['ccUnnormalized'])
print(bestHyperparams2019['ccNormalized_minimax'])
print(bestHyperparams2019['ccNormalized_zscore'])

{'precision': {'value': 0.627906976744186, 'alpha': 0.1}, 'recall': {'value': 0.27300662836997697, 'alpha': 0.2}, 'fscore': {'value': 0.37969451880260424, 'alpha': 0.2}, 'rr': {'value': 0.901937984496124, 'alpha': 0.2}, 'ndcg': {'value': 0.7165204423861109, 'alpha': 0.1}, 'map': {'value': 0.21189662633253067, 'alpha': 0.1}}
{'precision': {'value': 0.6348837209302325, 'alpha': 0.2}, 'recall': {'value': 0.2719114158943346, 'alpha': 0.1}, 'fscore': {'value': 0.38033293287532133, 'alpha': 0.1}, 'rr': {'value': 0.9085271317829456, 'alpha': 0.3}, 'ndcg': {'value': 0.7229942609361284, 'alpha': 0.2}, 'map': {'value': 0.21306995775647664, 'alpha': 0.2}}
{'precision': {'value': 0.6372093023255815, 'alpha': 0.1}, 'recall': {'value': 0.2730182100148534, 'alpha': 0.1}, 'fscore': {'value': 0.38225551473042546, 'alpha': 0.1}, 'rr': {'value': 0.8988372093023256, 'alpha': 0.3}, 'ndcg': {'value': 0.7209484744376641, 'alpha': 0.2}, 'map': {'value': 0.2155405983113744, 'alpha': 0.1}}


## Validation of the RRF 

In [14]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019_rrf = {
    'rrf': {'precision': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'recall': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'fscore': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'rr': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'ndcg': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'map': {'value': -1, 'eta1': 0, 'eta2': 0}},
}

explored = set()

for i in tqdm(range(len(etas))):
    for j in range(len(etas)):
        if (etas[i],etas[j]) in explored:
            continue
        else:
            explored.add((etas[i],etas[j]))
            eta = etas[i]

        result2 = in_memory_index.get_scores_rrf(
            sparse_ranking_2019,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = etas[i],
            eta2 = etas[j],
        )

        qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))

        rrfRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result2[alpha].run)
        fscoreRRRes = (2*rrfRes[P(rel=2)@10]*rrfRes[R(rel=2)@10])/(rrfRes[P(rel=2)@10]+rrfRes[R(rel=2)@10])


        if rrfRes[P(rel=2)@10] > bestHyperparams2019_rrf['rrf']['precision']['value']:
            bestHyperparams2019_rrf['rrf']['precision']['value'] = rrfRes[P(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['precision']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['precision']['eta2'] = etas[j]


        if rrfRes[R(rel=2)@10] > bestHyperparams2019_rrf['rrf']['recall']['value']:
            bestHyperparams2019_rrf['rrf']['recall']['value'] = rrfRes[R(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['recall']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['recall']['eta2'] = etas[j]


        if rrfRes[nDCG@10] > bestHyperparams2019_rrf['rrf']['ndcg']['value']:
            bestHyperparams2019_rrf['rrf']['ndcg']['value'] = rrfRes[nDCG@10]
            bestHyperparams2019_rrf['rrf']['ndcg']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['ndcg']['eta2'] = etas[j]

        if rrfRes[RR(rel=2)@10] > bestHyperparams2019_rrf['rrf']['rr']['value']:
            bestHyperparams2019_rrf['rrf']['rr']['value'] = rrfRes[RR(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['rr']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['rr']['eta2'] = etas[j]

        if fscoreRRRes > bestHyperparams2019_rrf['rrf']['fscore']['value']:
            bestHyperparams2019_rrf['rrf']['fscore']['value'] = fscoreRRRes
            bestHyperparams2019_rrf['rrf']['fscore']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['fscore']['eta2'] = etas[j]

        if rrfRes[AP(rel=2)@10] > bestHyperparams2019_rrf['rrf']['map']['value']:
            bestHyperparams2019_rrf['rrf']['map']['value'] = rrfRes[AP(rel=2)@10]
            bestHyperparams2019_rrf['rrf']['map']['eta1'] = etas[i]
            bestHyperparams2019_rrf['rrf']['map']['eta2'] = etas[j]

loaded 200 queries


  0%|          | 0/11 [00:00<?, ?it/s]

Encoding queries for RRF...


100%|██████████| 43/43 [00:01<00:00, 28.53it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 70.13it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 94.31it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 64.69it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 43.51it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.23it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.78it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 81.53it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.24it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 72.49it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.30it/s]
  9%|▉         | 1/11 [00:22<03:41, 22.11s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.16it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.90it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 49.31it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.22it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 81.59it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 64.49it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.69it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.45it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.55it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.02it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 91.42it/s]
 18%|█▊        | 2/11 [00:42<03:07, 20.83s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 87.46it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.49it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.65it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 59.63it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.83it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.14it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.37it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.62it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.06it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.28it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 66.95it/s]
 27%|██▋       | 3/11 [01:02<02:46, 20.78s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.74it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.15it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.40it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.56it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.70it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 95.25it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.32it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.52it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.15it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.94it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.68it/s]
 36%|███▋      | 4/11 [01:22<02:21, 20.21s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.83it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.32it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 64.09it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 90.04it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.93it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.63it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 64.87it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.49it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.38it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.38it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 55.18it/s]
 45%|████▌     | 5/11 [01:42<02:02, 20.36s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 43.56it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 94.23it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.59it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.05it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 73.12it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.37it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.41it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.82it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 77.12it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 81.27it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.17it/s]
 55%|█████▍    | 6/11 [02:03<01:43, 20.62s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.11it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.51it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.21it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.72it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.03it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.74it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.95it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.87it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 70.16it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.54it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 56.23it/s]
 64%|██████▎   | 7/11 [02:24<01:22, 20.72s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 53.93it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 60.27it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.61it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.05it/s]


Encoding queries for RRF...



 47%|████▋     | 20/43 [00:00<00:00, 81.12it/s]

In [18]:
for key in bestHyperparams2019_rrf['rrf']:
    print(key, bestHyperparams2019_rrf['rrf'][key])

{'precision': {'value': 0.6348837209302325, 'eta1': 40, 'eta2': 10}, 'recall': {'value': 0.26976473421199393, 'eta1': 40, 'eta2': 10}, 'fscore': {'value': 0.3786426368358511, 'eta1': 40, 'eta2': 10}, 'rr': {'value': 0.8976744186046514, 'eta1': 20, 'eta2': 20}, 'ndcg': {'value': 0.7151248552748896, 'eta1': 80, 'eta2': 40}, 'map': {'value': 0.20941181623477217, 'eta1': 100, 'eta2': 10}}


# Testing hyperparameters found from validation on 2020 set

## Testing the convex combination parameters

In [28]:
with open(
    "../testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")


for key in bestHyperparams2019.keys():
    print('\n'+'TESTING RESULTS FOR BEST VALIDATION METRICS FOR {}'.format('normalized' if key == 'ccNormalized' else 'unnormalized'))
    for metric in bestHyperparams2019[key].keys():
        # print(f"best {metric} for {key} is {bestHyperparams2019[key][metric]['value']} with alpha {bestHyperparams2019[key][metric]['alpha']}")
    
        alpha = bestHyperparams2019[key][metric]['alpha']

        result = in_memory_index.get_scores(
            sparse_ranking_2020,
            queries,
            alpha=alpha*1.0,
            cutoff=None,
            early_stopping=False,
            normalise= False if key == 'ccUnnormalized' else True
        )
        qrels = list(read_trec_qrels("../testing/2020qrels-pass.txt"))
        print('using best alpha for {}'.format(metric))
        interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result[alpha].run)
        fscoreInterpolation2 = (2*interpolation2[P(rel=2)@10]*interpolation2[R(rel=2)@10])/(interpolation2[P(rel=2)@10]+interpolation2[R(rel=2)@10])
        print(
            f"Interpolation",
            interpolation2,
            'f-score:{}'.format(fscoreInterpolation2)
        )

loaded 200 queries

TESTING RESULTS FOR BEST VALIDATION METRICS FOR unnormalized
Encoding queries for interpolation...


100%|██████████| 54/54 [00:01<00:00, 45.47it/s]


sparse_ranking_2020 None
using best alpha for precision
Interpolation {R(rel=2)@10: 0.391207940493183, RR(rel=2)@10: 0.814814814814815, nDCG@10: 0.6939603772252907, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.2980423955640365} f-score:0.4513465019601671
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 57.84it/s]


sparse_ranking_2020 None
using best alpha for recall
Interpolation {R(rel=2)@10: 0.37349994843573076, RR(rel=2)@10: 0.797530864197531, nDCG@10: 0.6714497182268858, P(rel=2)@10: 0.5203703703703701, AP(rel=2)@10: 0.2876461836252215} f-score:0.43486913573863895
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 59.40it/s]


sparse_ranking_2020 None
using best alpha for fscore
Interpolation {R(rel=2)@10: 0.37349994843573076, RR(rel=2)@10: 0.797530864197531, nDCG@10: 0.6714497182268858, P(rel=2)@10: 0.5203703703703701, AP(rel=2)@10: 0.2876461836252215} f-score:0.43486913573863895
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 57.87it/s]


sparse_ranking_2020 None
using best alpha for rr
Interpolation {R(rel=2)@10: 0.37349994843573076, RR(rel=2)@10: 0.797530864197531, nDCG@10: 0.6714497182268858, P(rel=2)@10: 0.5203703703703701, AP(rel=2)@10: 0.2876461836252215} f-score:0.43486913573863895
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 58.99it/s]


sparse_ranking_2020 None
using best alpha for ndcg
Interpolation {R(rel=2)@10: 0.391207940493183, RR(rel=2)@10: 0.814814814814815, nDCG@10: 0.6939603772252907, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.2980423955640365} f-score:0.4513465019601671
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 65.64it/s]


sparse_ranking_2020 None
using best alpha for map
Interpolation {R(rel=2)@10: 0.391207940493183, RR(rel=2)@10: 0.814814814814815, nDCG@10: 0.6939603772252907, P(rel=2)@10: 0.5333333333333333, AP(rel=2)@10: 0.2980423955640365} f-score:0.4513465019601671

TESTING RESULTS FOR BEST VALIDATION METRICS FOR normalized
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 61.90it/s]


sparse_ranking_2020 None
using best alpha for precision
Interpolation {R(rel=2)@10: 0.3747600495688947, RR(rel=2)@10: 0.8021604938271607, nDCG@10: 0.6727571514417632, P(rel=2)@10: 0.5203703703703704, AP(rel=2)@10: 0.2869471420830232} f-score:0.43572203882293675
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 57.73it/s]


sparse_ranking_2020 None
using best alpha for recall
Interpolation {R(rel=2)@10: 0.3833530296325536, RR(rel=2)@10: 0.8271604938271606, nDCG@10: 0.6920854730231987, P(rel=2)@10: 0.525925925925926, AP(rel=2)@10: 0.2941106059928702} f-score:0.4434619229522968
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 63.51it/s]


sparse_ranking_2020 None
using best alpha for fscore
Interpolation {R(rel=2)@10: 0.3833530296325536, RR(rel=2)@10: 0.8271604938271606, nDCG@10: 0.6920854730231987, P(rel=2)@10: 0.525925925925926, AP(rel=2)@10: 0.2941106059928702} f-score:0.4434619229522968
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 67.26it/s]


sparse_ranking_2020 None
using best alpha for rr
Interpolation {R(rel=2)@10: 0.36361577768128583, RR(rel=2)@10: 0.7949294532627867, nDCG@10: 0.652755006564619, P(rel=2)@10: 0.4999999999999999, AP(rel=2)@10: 0.2759705606429033} f-score:0.4210388312468706
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 70.04it/s]


sparse_ranking_2020 None
using best alpha for ndcg
Interpolation {R(rel=2)@10: 0.3747600495688947, RR(rel=2)@10: 0.8021604938271607, nDCG@10: 0.6727571514417632, P(rel=2)@10: 0.5203703703703704, AP(rel=2)@10: 0.2869471420830232} f-score:0.43572203882293675
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 62.02it/s]


sparse_ranking_2020 None
using best alpha for map
Interpolation {R(rel=2)@10: 0.3747600495688947, RR(rel=2)@10: 0.8021604938271607, nDCG@10: 0.6727571514417632, P(rel=2)@10: 0.5203703703703704, AP(rel=2)@10: 0.2869471420830232} f-score:0.43572203882293675


## Testing RRF parameters

In [32]:
with open(
    "../testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")


for key in bestHyperparams2019_rrf.keys():
    print('\n'+'TESTING RESULTS FOR BEST VALIDATION METRICS FOR {}'.format('normalized' if key == 'ccNormalized' else 'unnormalized'))
    for metric in bestHyperparams2019_rrf[key].keys():
        # print(f"best {metric} for {key} is {bestHyperparams2019[key][metric]['value']} with alpha {bestHyperparams2019[key][metric]['alpha']}")
        eta1 = bestHyperparams2019_rrf[key][metric]['eta1']
        eta2 = bestHyperparams2019_rrf[key][metric]['eta2']
        alpha = 0.0
        result = in_memory_index.get_scores_rrf(
            sparse_ranking_2020,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = eta1,
            eta2 = eta2
        )
        qrels = list(read_trec_qrels("../testing/2020qrels-pass.txt"))
        print('using best eta1={} and eta2={} for {}'.format(eta1, eta2, metric))
        interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result[alpha].run)
        fscoreInterpolation2 = (2*interpolation2[P(rel=2)@10]*interpolation2[R(rel=2)@10])/(interpolation2[P(rel=2)@10]+interpolation2[R(rel=2)@10])
        print(
            f"RRF",
            interpolation2,
            'f-score:{}'.format(fscoreInterpolation2)
        )

loaded 200 queries

TESTING RESULTS FOR BEST VALIDATION METRICS FOR unnormalized
Encoding queries for RRF...


100%|██████████| 54/54 [00:01<00:00, 53.33it/s]


using best eta1=40 and eta2=10 for precision
RRF {R(rel=2)@10: 0.3702476168178879, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705968, P(rel=2)@10: 0.5037037037037037, AP(rel=2)@10: 0.28518417374234645} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 64.06it/s]


using best eta1=40 and eta2=10 for recall
RRF {R(rel=2)@10: 0.3702476168178879, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705968, P(rel=2)@10: 0.5037037037037037, AP(rel=2)@10: 0.28518417374234645} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 54.97it/s]


using best eta1=40 and eta2=10 for fscore
RRF {R(rel=2)@10: 0.3702476168178879, RR(rel=2)@10: 0.79320987654321, nDCG@10: 0.6793628703705968, P(rel=2)@10: 0.5037037037037037, AP(rel=2)@10: 0.28518417374234645} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 64.25it/s]


using best eta1=20 and eta2=20 for rr
RRF {R(rel=2)@10: 0.3525888949678008, RR(rel=2)@10: 0.809104938271605, nDCG@10: 0.639509948943491, P(rel=2)@10: 0.4814814814814814, AP(rel=2)@10: 0.26808911791402357} f-score:0.4070760173157598
Encoding queries for RRF...


100%|██████████| 54/54 [00:01<00:00, 45.70it/s]


using best eta1=80 and eta2=40 for ndcg
RRF {R(rel=2)@10: 0.37070757537644217, RR(rel=2)@10: 0.807716049382716, nDCG@10: 0.6639475222194774, P(rel=2)@10: 0.5037037037037038, AP(rel=2)@10: 0.28120310355515377} f-score:0.42709142293901964
Encoding queries for RRF...


100%|██████████| 54/54 [00:01<00:00, 53.52it/s]


using best eta1=100 and eta2=10 for map
RRF {R(rel=2)@10: 0.36834371388525794, RR(rel=2)@10: 0.8154320987654322, nDCG@10: 0.6829413068342425, P(rel=2)@10: 0.511111111111111, AP(rel=2)@10: 0.29184181741782134} f-score:0.42813925064420366


In [10]:
with open(
    "../testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019_rrf2 = {
    'rrf': {'precision': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'recall': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'fscore': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'rr': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'ndcg': {'value': -1, 'eta1': 0, 'eta2': 0}, 
            'map': {'value': -1, 'eta1': 0, 'eta2': 0}},
}

explored = set()

for i in tqdm(range(len(etas))):
    for j in range(len(etas)):
        if (etas[i],etas[j]) in explored:
            continue
        else:
            explored.add((etas[i],etas[j]))
            eta = etas[i]

        result2 = in_memory_index.get_scores_rrf(
            sparse_ranking_2019,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = etas[i],
            eta2 = etas[j],
        )

        qrels = list(read_trec_qrels("../testing/2019qrels-pass.txt"))

        rrfRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result2[alpha].run)
        fscoreRRRes = (2*rrfRes[P(rel=2)@10]*rrfRes[R(rel=2)@10])/(rrfRes[P(rel=2)@10]+rrfRes[R(rel=2)@10])


        if rrfRes[P(rel=2)@10] > bestHyperparams2019_rrf2['rrf']['precision']['value']:
            bestHyperparams2019_rrf2['rrf']['precision']['value'] = rrfRes[P(rel=2)@10]
            bestHyperparams2019_rrf2['rrf']['precision']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['precision']['eta2'] = etas[j]


        if rrfRes[R(rel=2)@10] > bestHyperparams2019_rrf2['rrf']['recall']['value']:
            bestHyperparams2019_rrf2['rrf']['recall']['value'] = rrfRes[R(rel=2)@10]
            bestHyperparams2019_rrf2['rrf']['recall']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['recall']['eta2'] = etas[j]


        if rrfRes[nDCG@10] > bestHyperparams2019_rrf2['rrf']['ndcg']['value']:
            bestHyperparams2019_rrf2['rrf']['ndcg']['value'] = rrfRes[nDCG@10]
            bestHyperparams2019_rrf2['rrf']['ndcg']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['ndcg']['eta2'] = etas[j]

        if rrfRes[RR(rel=2)@10] > bestHyperparams2019_rrf2['rrf']['rr']['value']:
            bestHyperparams2019_rrf2['rrf']['rr']['value'] = rrfRes[RR(rel=2)@10]
            bestHyperparams2019_rrf2['rrf']['rr']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['rr']['eta2'] = etas[j]

        if fscoreRRRes > bestHyperparams2019_rrf2['rrf']['fscore']['value']:
            bestHyperparams2019_rrf2['rrf']['fscore']['value'] = fscoreRRRes
            bestHyperparams2019_rrf2['rrf']['fscore']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['fscore']['eta2'] = etas[j]

        if rrfRes[AP(rel=2)@10] > bestHyperparams2019_rrf2['rrf']['map']['value']:
            bestHyperparams2019_rrf2['rrf']['map']['value'] = rrfRes[AP(rel=2)@10]
            bestHyperparams2019_rrf2['rrf']['map']['eta1'] = etas[i]
            bestHyperparams2019_rrf2['rrf']['map']['eta2'] = etas[j]

loaded 200 queries


  0%|          | 0/11 [00:00<?, ?it/s]

Encoding queries for RRF...


100%|██████████| 43/43 [00:01<00:00, 34.77it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.42it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.15it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 89.18it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 70.07it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.64it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 73.31it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.36it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 87.12it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.34it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 84.13it/s]
  9%|▉         | 1/11 [00:20<03:29, 20.96s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 94.46it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.72it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 72.36it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 63.99it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.82it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.24it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 93.56it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 84.93it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 72.62it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 98.95it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.24it/s]
 18%|█▊        | 2/11 [00:40<03:01, 20.22s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.82it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 85.98it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 100.83it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 93.64it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.14it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.06it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 87.68it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.58it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 52.70it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.67it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.89it/s]
 27%|██▋       | 3/11 [01:00<02:40, 20.11s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 50.66it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 73.68it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 59.05it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 93.38it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 77.14it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.17it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.06it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.94it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.84it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.29it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.85it/s]
 36%|███▋      | 4/11 [01:21<02:22, 20.39s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 87.92it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 94.13it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.14it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 90.06it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.42it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.56it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.94it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 92.30it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.63it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.89it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.92it/s]
 45%|████▌     | 5/11 [01:41<02:00, 20.11s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 63.64it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:01<00:00, 32.86it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 43.91it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.57it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 48.66it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.43it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.77it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.88it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.79it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 55.05it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 89.25it/s]
 55%|█████▍    | 6/11 [02:03<01:44, 21.00s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.96it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 70.27it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.43it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.20it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.37it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.60it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 91.48it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 70.17it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 81.01it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.89it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 100.62it/s]
 64%|██████▎   | 7/11 [02:23<01:22, 20.51s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.13it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 91.01it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 81.32it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.90it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.56it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 46.33it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 60.60it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.41it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 72.30it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 51.53it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 64.71it/s]
 73%|███████▎  | 8/11 [02:44<01:02, 20.73s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.66it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 74.16it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 79.39it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 62.54it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 60.16it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.52it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 56.29it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.73it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 76.50it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.34it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 71.02it/s]
 82%|████████▏ | 9/11 [03:05<00:41, 20.92s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.43it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 87.11it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.71it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 67.16it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 66.87it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 82.07it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.21it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 83.05it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.58it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 80.93it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.04it/s]
 91%|█████████ | 10/11 [03:26<00:20, 20.72s/it]

Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 69.97it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 75.13it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 63.03it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 59.79it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 88.15it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 65.38it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 86.43it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 84.52it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 73.14it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 78.54it/s]


Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 55.35it/s]
100%|██████████| 11/11 [03:46<00:00, 20.62s/it]


In [12]:
for key in bestHyperparams2019_rrf2['rrf']:
    print(key, bestHyperparams2019_rrf2['rrf'][key])

precision {'value': 0.6348837209302327, 'eta1': 40, 'eta2': 10}
recall {'value': 0.26976473421199393, 'eta1': 40, 'eta2': 10}
fscore {'value': 0.37864263683585114, 'eta1': 40, 'eta2': 10}
rr {'value': 0.8976744186046512, 'eta1': 20, 'eta2': 20}
ndcg {'value': 0.7151248552748896, 'eta1': 80, 'eta2': 40}
map {'value': 0.20941181623477223, 'eta1': 100, 'eta2': 10}


In [13]:
with open(
    "../testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")


for key in bestHyperparams2019_rrf2.keys():
    print('\n'+'TESTING RESULTS FOR BEST VALIDATION METRICS FOR {}'.format('normalized' if key == 'ccNormalized' else 'unnormalized'))
    for metric in bestHyperparams2019_rrf2[key].keys():
        # print(f"best {metric} for {key} is {bestHyperparams2019[key][metric]['value']} with alpha {bestHyperparams2019[key][metric]['alpha']}")
        eta1 = bestHyperparams2019_rrf2[key][metric]['eta1']
        eta2 = bestHyperparams2019_rrf2[key][metric]['eta2']
        alpha = 0.0
        result = in_memory_index.get_scores_rrf(
            sparse_ranking_2020,
            queries,
            alpha=alpha,
            cutoff=None,
            early_stopping=False,
            eta = eta1,
            eta2 = eta2
        )
        qrels = list(read_trec_qrels("../testing/2020qrels-pass.txt"))
        print('using best eta1={} and eta2={} for {}'.format(eta1, eta2, metric))
        interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P(rel=2)@10,R(rel=2)@10,AP(rel=2)@10], qrels, result[alpha].run)
        fscoreInterpolation2 = (2*interpolation2[P(rel=2)@10]*interpolation2[R(rel=2)@10])/(interpolation2[P(rel=2)@10]+interpolation2[R(rel=2)@10])
        print(
            f"RRF",
            interpolation2,
            'f-score:{}'.format(fscoreInterpolation2)
        )

loaded 200 queries

TESTING RESULTS FOR BEST VALIDATION METRICS FOR unnormalized
Encoding queries for RRF...


100%|██████████| 54/54 [00:01<00:00, 48.28it/s]


using best eta1=40 and eta2=10 for precision
RRF {RR(rel=2)@10: 0.79320987654321, P(rel=2)@10: 0.5037037037037037, nDCG@10: 0.6793628703705965, AP(rel=2)@10: 0.2851841737423465, R(rel=2)@10: 0.3702476168178879} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 82.15it/s]


using best eta1=40 and eta2=10 for recall
RRF {RR(rel=2)@10: 0.79320987654321, P(rel=2)@10: 0.5037037037037037, nDCG@10: 0.6793628703705965, AP(rel=2)@10: 0.2851841737423465, R(rel=2)@10: 0.3702476168178879} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 96.18it/s] 


using best eta1=40 and eta2=10 for fscore
RRF {RR(rel=2)@10: 0.79320987654321, P(rel=2)@10: 0.5037037037037037, nDCG@10: 0.6793628703705965, AP(rel=2)@10: 0.2851841737423465, R(rel=2)@10: 0.3702476168178879} f-score:0.42678600397865596
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 81.43it/s]


using best eta1=20 and eta2=20 for rr
RRF {RR(rel=2)@10: 0.809104938271605, P(rel=2)@10: 0.48148148148148157, nDCG@10: 0.6395099489434912, AP(rel=2)@10: 0.2680891179140236, R(rel=2)@10: 0.3525888949678009} f-score:0.40707601731575993
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 75.30it/s]


using best eta1=80 and eta2=40 for ndcg
RRF {RR(rel=2)@10: 0.807716049382716, P(rel=2)@10: 0.5037037037037038, nDCG@10: 0.6639475222194774, AP(rel=2)@10: 0.2812031035551538, R(rel=2)@10: 0.37070757537644233} f-score:0.4270914229390197
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 77.31it/s] 


using best eta1=100 and eta2=10 for map
RRF {RR(rel=2)@10: 0.8154320987654322, P(rel=2)@10: 0.511111111111111, nDCG@10: 0.6829413068342426, AP(rel=2)@10: 0.2918418174178213, R(rel=2)@10: 0.36834371388525794} f-score:0.42813925064420366
