In [9]:
import csv
from pathlib import Path
import numpy as np
from ranking import Ranking
from index import Mode, InMemoryIndex
from encoder import TCTColBERTQueryEncoder as TCTColBERTQueryEncoderFF
# from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from ir_measures import read_trec_qrels, calc_aggregate, nDCG, RR, P, R
import sys

# define values of eta to test
etas = [0,10,20,30,40,50,60,70,80,90,100,500]

# define values of alpha to test
alphas = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,-1]

In [4]:
# in_memory_index = InMemoryIndex(mode=Mode.PASSAGE, encoder=TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco"))
in_memory_index = InMemoryIndex.from_disk('../../dev/testing/ffindex_passage_2019_2020.pkl')
in_memory_index.encoder = TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco")

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
sparse_ranking_2019 = Ranking.from_file(Path("../../dev/testing/msmarco-passage-test2019-sparse10000.txt"))
sparse_ranking_2019.cut(5000)
sparse_ranking_2019.name = "sparse_ranking_2019"
sparse_ranking_2020 = Ranking.from_file(Path("../../dev/testing/msmarco-passage-test2020-sparse10000.txt"))
sparse_ranking_2020.cut(5000)
sparse_ranking_2020.name = "sparse_ranking_2020"
all_ids = set.union(
    *[set(sparse_ranking_2019[q_id].keys()) for q_id in sparse_ranking_2019.q_ids],
    *[set(sparse_ranking_2020[q_id].keys()) for q_id in sparse_ranking_2020.q_ids]
)
print(f"indexing {len(all_ids)} documents or passages")

indexing 440079 documents or passages


In [15]:
with open(
    "../../dev/testing/msmarco-test2019-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2019 = {
    'ccUnnormalized': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}},
    'rrf': {'precision': {'value': -1, 'eta': 0}, 'recall': {'value': -1, 'eta': 0}, 'fscore': {'value': -1, 'eta': 0}, 'rr': {'value': -1, 'eta': 0}, 'ndcg': {'value': -1, 'eta': 0}},
    'ccNormalized': {'precision': {'value': -1, 'alpha': 0}, 'recall': {'value': -1, 'alpha': 0}, 'fscore': {'value': -1, 'alpha': 0}, 'rr': {'value': -1, 'alpha': 0}, 'ndcg': {'value': -1, 'alpha': 0}},
}


for i in range(len(etas)):
    if alphas[i] == -1:
        alpha = 1
    eta = etas[i]

    result = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise=False
    )
    result2 = in_memory_index.get_scores_rrf(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False
        eta = eta
    )
    result3 = in_memory_index.get_scores(
        sparse_ranking_2019,
        queries,
        alpha=alpha,
        cutoff=None,
        early_stopping=False,
        normalise=True
    )
    qrels = list(read_trec_qrels("../../dev/testing/2019qrels-pass.txt"))
    bm25Res = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, sparse_ranking_2019.run)
    fscoreBM25Res = (2*bm25Res[P@10]*bm25Res[R@10])/(bm25Res[P@10]+bm25Res[R@10])
    print(list(bm25Res.keys())[0])
    print(
        "BM25",
        bm25Res,
        'f-score:{}'.format()
    )

    interpolationRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result[alpha].run)
    fscoreInterpolationRes = (2*interpolationRes[P@10]*interpolationRes[R@10])/(interpolationRes[P@10]+interpolationRes[R@10])
    print(
        f"Interpolation",
        interpolationRes,
        'f-score:{}'.format(fscoreInterpolationRes)
    )

    rrfRes = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result2[alpha].run)
    fscoreRRRes = (2*rrfRes[P@10]*rrfRes[R@10])/(rrfRes[P@10]+rrfRes[R@10])
    print(
        f"RRF",
        rrfRes,
        'f-score:{}'.format()
    )

    interpolationMinMax = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result3[alpha].run)
    fscoreInterpolationMinMax = (2*interpolationMinMax[P@10]*interpolationMinMax[R@10])/(interpolationMinMax[P@10]+interpolationMinMax[R@10])
    print(
        f"Interpolation w/ minimax",
        interpolationMinMax,
        'f-score:{}'.format(fscoreInterpolationMinMax)
    )

    # update dictionary of best parameters per method
    if interpolationRes[P@10] > bestHyperparams2019['ccUnnormalized']['precision']:
        bestHyperparams2019['ccUnnormalized']['precision']['value'] = interpolationRes[P@10]
        bestHyperparams2019['ccUnnormalized']['precision']['alpha'] = alpha

    if interpolationRes[R@10] > bestHyperparams2019['ccUnnormalized']['recall']:
        bestHyperparams2019['ccUnnormalized']['recall']['value'] = interpolationRes[R@10]
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha

    if interpolationRes[nDCG@10] > bestHyperparams2019['ccUnnormalized']['ndcg']:
        bestHyperparams2019['ccUnnormalized']['ndcg'] = interpolationRes[nDCG@10]
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha

    if interpolationRes[RR(rel=2)@10] > bestHyperparams2019['ccUnnormalized']['rr']:
        bestHyperparams2019['ccUnnormalized']['rr']['value'] = interpolationRes[RR(rel=2)@10]
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha
    
    if fscoreInterpolationRes > bestHyperparams2019['ccUnnormalized']['fscore']:
        bestHyperparams2019['ccUnnormalized']['fscore']['value'] = fscoreInterpolationRes
        bestHyperparams2019['ccUnnormalized']['recall']['alpha'] = alpha

    if rrfRes[P@10] > bestHyperparams2019['rrf']['precision']:
        bestHyperparams2019['rrf']['precision']['value'] = rrfRes[P@10]
        bestHyperparams2019['rrf']['precision']['eta'] = eta


    if rrfRes[R@10] > bestHyperparams2019['rrf']['recall']:
        bestHyperparams2019['rrf']['recall']['value'] = rrfRes[R@10]
        bestHyperparams2019['rrf']['recall']['eta'] = eta


    if rrfRes[nDCG@10] > bestHyperparams2019['rrf']['ndcg']:
        bestHyperparams2019['rrf']['ndcg']['value'] = rrfRes[nDCG@10]
        bestHyperparams2019['rrf']['ndcg']['eta'] = eta

    if rrfRes[RR(rel=2)@10] > bestHyperparams2019['rrf']['rr']:
        bestHyperparams2019['rrf']['rr']['value'] = rrfRes[RR(rel=2)@10]
        bestHyperparams2019['rrf']['rr']['eta'] = eta
    
    if fscoreRRRes > bestHyperparams2019['rrf']['fscore']:
        bestHyperparams2019['rrf']['fscore']['value'] = fscoreRRRes
        bestHyperparams2019['rrf']['fscore']['eta'] = eta
    
    if interpolationMinMax[P@10] > bestHyperparams2019['ccNormalized']['precision']:
        bestHyperparams2019['ccNormalized']['precision']['value'] = interpolationMinMax[P@10]
        bestHyperparams2019['ccNormalized']['precision']['alpha'] = alpha

    if interpolationMinMax[R@10] > bestHyperparams2019['ccNormalized']['recall']:
        bestHyperparams2019['ccNormalized']['recall']['value'] = interpolationMinMax[R@10]
        bestHyperparams2019['ccNormalized']['recall']['alpha'] = alpha

    if interpolationMinMax[nDCG@10] > bestHyperparams2019['ccNormalized']['ndcg']:
        bestHyperparams2019['ccNormalized']['ndcg']['value'] = interpolationMinMax[nDCG@10]
        bestHyperparams2019['ccNormalized']['ndcg']['alpha'] = alpha

    if interpolationMinMax[RR(rel=2)@10] > bestHyperparams2019['ccNormalized']['rr']:
        bestHyperparams2019['ccNormalized']['rr']['value'] = interpolationMinMax[RR(rel=2)@10]
        bestHyperparams2019['ccNormalized']['rr']['alpha'] = alpha
    
    if fscoreInterpolationMinMax > bestHyperparams2019['ccNormalized']['fscore']:
        bestHyperparams2019['ccNormalized']['fscore']['value'] = fscoreInterpolationMinMax
        bestHyperparams2019['ccNormalized']['fscore']['alpha'] = alpha

loaded 200 queries
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 66.25it/s]


sparse_ranking_2019 None
Encoding queries for RRF...


100%|██████████| 43/43 [00:00<00:00, 68.16it/s]


sparse_ranking_2019 None
Encoding queries for interpolation...


100%|██████████| 43/43 [00:00<00:00, 70.20it/s]


sparse_ranking_2019 None
P@10
BM25 {P@10: 0.6186046511627907, R@10: 0.12847703679948552, nDCG@10: 0.5058310024399072, RR(rel=2)@10: 0.7024178663713547} f-score:0.2127652004121614
Interpolation {P@10: 0.8069767441860466, R@10: 0.17310336499562765, nDCG@10: 0.7158066715626032, RR(rel=2)@10: 0.901937984496124} f-score:0.28505912645948106
RRF {P@10: 0.7767441860465117, R@10: 0.16565014746118, nDCG@10: 0.6816417471378419, RR(rel=2)@10: 0.8775193798449613} f-score:0.27306571014556874
Interpolation w/ minimax {P@10: 0.8162790697674419, R@10: 0.17355258664965964, nDCG@10: 0.7229942609361284, RR(rel=2)@10: 0.8949612403100775} f-score:0.2862453288247246


In [17]:
with open(
    "../../dev/testing/msmarco-test2020-queries.tsv",
    encoding="utf-8",
    newline=""
) as fp:
    queries = {q_id: q for q_id, q in csv.reader(fp, delimiter="\t")}
print(f"loaded {len(queries)} queries")

# define dictionary to keep track of best results for tested hyperparameters
bestHyperparams2020 = {
    'ccUnnormalized': {'precision': -1, 'recall': -1, 'fscore': -1, 'rr': -1, 'ndcg': -1},
    'rrf': {'precision': -1, 'recall': -1, 'fscore': -1, 'rr': -1, 'ndcg': -1},
    'ccNormalized': {'precision': -1, 'recall': -1, 'fscore': -1, 'rr': -1, 'ndcg': -1}
}

alpha = 0.2
eta = 60
# for i in range(len(etas)):
#     if alphas[i] == -1:
#         alpha = 1
#     eta = etas[i]

result = in_memory_index.get_scores(
    sparse_ranking_2020,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    normalise=False
)
result2 = in_memory_index.get_scores_rrf(
    sparse_ranking_2020,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False
    eta = eta
)
result3 = in_memory_index.get_scores(
    sparse_ranking_2020,
    queries,
    alpha=alpha,
    cutoff=None,
    early_stopping=False,
    normalise=True
)
qrels = list(read_trec_qrels("../../dev/testing/2020qrels-pass.txt"))
bm25Res2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, sparse_ranking_2020.run)
fscoreBM25Res2 = (2*bm25Res2[P@10]*bm25Res2[R@10])/(bm25Res2[P@10]+bm25Res2[R@10])
print(
    "BM25",
    bm25Res2,
    'f-score:{}'.format(fscoreBM25Res2)
)


interpolation2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result[alpha].run)
fscoreInterpolation2 = (2*interpolation2[P@10]*interpolation2[R@10])/(interpolation2[P@10]+interpolation2[R@10])
print(
    f"Interpolation",
    interpolation2,
    'f-score:{}'.format(fscoreInterpolation2)
)

rrf2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result2[alpha].run)
fscoreRRF2 = (2*rrf2[P@10]*rrf2[R@10])/(rrf2[P@10]+rrf2[R@10])
print(
    f"RRF",
    rrf2,
    'f-score:{}'.format(fscoreRRF2)
)


interpolationMinmax2 = calc_aggregate([nDCG@10, RR(rel=2)@10,P@10,R@10], qrels, result3[alpha].run)
fscoreInterpolationMinMax2 = (2*interpolationMinmax2[P@10]*interpolationMinmax2[R@10])/(interpolationMinmax2[P@10]+interpolationMinmax2[R@10])
print(
    f"Interpolation w/ minimax",
    interpolationMinmax2,
    'f-score:{}'.format(fscoreInterpolationMinMax2)
)

   # update dictionary of best parameters per method
    # if interpolation2[P@10] > bestHyperparams2020['ccUnnormalized']['precision']:
    #     bestHyperparams2020['ccUnnormalized']['precision'] = interpolation2[P@10]

    # if interpolation2[R@10] > bestHyperparams2020['ccUnnormalized']['recall']:
    #     bestHyperparams2020['ccUnnormalized']['recall'] = interpolation2[R@10]

    # if interpolation2[nDCG@10] > bestHyperparams2020['ccUnnormalized']['ndcg']:
    #     bestHyperparams2020['ccUnnormalized']['ndcg'] = interpolation2[nDCG@10]

    # if interpolation2[RR(rel=2)@10] > bestHyperparams2020['ccUnnormalized']['rr']:
    #     bestHyperparams2020['ccUnnormalized']['rr'] = interpolation2[RR(rel=2)@10]
    
    # if fscoreInterpolation2 > bestHyperparams2020['ccUnnormalized']['fscore']:
    #     bestHyperparams2020['ccUnnormalized']['fscore'] = fscoreInterpolation2

    # if rrf2[P@10] > bestHyperparams2020['rrf']['precision']:
    #     bestHyperparams2020['rrf']['precision'] = rrf2[P@10]

    # if rrf2[R@10] > bestHyperparams2020['rrf']['recall']:
    #     bestHyperparams2020['rrf']['recall'] = rrf2[R@10]

    # if rrf2[nDCG@10] > bestHyperparams2020['rrf']['ndcg']:
    #     bestHyperparams2020['rrf']['ndcg'] = rrf2[nDCG@10]

    # if rrf2[RR(rel=2)@10] > bestHyperparams2020['rrf']['rr']:
    #     bestHyperparams2020['rrf']['rr'] = rrf2[RR(rel=2)@10]
    
    # if fscoreRRF2 > bestHyperparams2020['rrf']['fscore']:
    #     bestHyperparams2020['rrf']['fscore'] = fscoreRRF2
    
    # if interpolationMinmax2[P@10] > bestHyperparams2020['ccNormalized']['precision']:
    #     bestHyperparams2020['ccNormalized']['precision'] = interpolationMinmax2[P@10]

    # if interpolationMinmax2[R@10] > bestHyperparams2020['ccNormalized']['recall']:
    #     bestHyperparams2020['ccNormalized']['recall'] = interpolationMinmax2[R@10]

    # if interpolationMinmax2[nDCG@10] > bestHyperparams2020['ccNormalized']['ndcg']:
    #     bestHyperparams2020['ccNormalized']['ndcg'] = interpolationMinmax2[nDCG@10]

    # if interpolationMinmax2[RR(rel=2)@10] > bestHyperparams2020['ccNormalized']['rr']:
    #     bestHyperparams2020['ccNormalized']['rr'] = interpolationMinmax2[RR(rel=2)@10]
    
    # if fscoreInterpolationMinMax2 > bestHyperparams2020['ccNormalized']['fscore']:
    #     bestHyperparams2020['ccNormalized']['fscore'] = fscoreInterpolationMinMax2

    

loaded 200 queries
Encoding queries for interpolation...


100%|██████████| 54/54 [00:01<00:00, 45.76it/s]


sparse_ranking_2020 None
Encoding queries for RRF...


100%|██████████| 54/54 [00:00<00:00, 60.40it/s]


sparse_ranking_2020 None
Encoding queries for interpolation...


100%|██████████| 54/54 [00:00<00:00, 68.46it/s]


sparse_ranking_2020 None
BM25 {P@10: 0.5425925925925926, R@10: 0.16942969387735296, nDCG@10: 0.4875508583120806, RR(rel=2)@10: 0.6554012345679013} f-score:0.25822589716638783
Interpolation {P@10: 0.7166666666666665, R@10: 0.22541387306514282, nDCG@10: 0.6714497182268858, RR(rel=2)@10: 0.7975308641975309} f-score:0.3429571086905328
RRF {P@10: 0.6574074074074073, R@10: 0.20925932337121278, nDCG@10: 0.628069774120618, RR(rel=2)@10: 0.8151455026455026} f-score:0.3174660440229534
Interpolation w/ minimax {P@10: 0.7166666666666663, R@10: 0.22401114929417368, nDCG@10: 0.6727571514417631, RR(rel=2)@10: 0.8021604938271606} f-score:0.34133115703774114
