In [1]:
import pandas as pd
import bm25s
from pathlib import Path
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from mxbai_rerank import MxbaiRerankV2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [3]:
data_path = Path('data/')

collection_data_path = data_path / 'subtask4b_collection_data.pkl' 
query_dev_data_path = data_path / 'subtask4b_query_tweets_dev.tsv'
query_train_data_path = data_path / 'subtask4b_query_tweets_train.tsv'

In [4]:
df_collection = pd.read_pickle(collection_data_path)
df_collection.head(5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000
993,ycxyn2a2,PMC,What was the primary mode of smallpox transmis...,10.3389/fcimb.2012.00150,PMC3509329,23226686,cc-by,The mode of infection transmission has profoun...,2012-11-29,"Milton, Donald K.",Front Cell Infect Microbiol,,,,ycxyn2a2,2012-11-29,1354147200
1053,zxe95qy9,PMC,"Lessons from the History of Quarantine, from P...",10.3201/eid1902.120312,PMC3559034,23343512,no-cc,"In the new millennium, the centuries-old strat...",2013-02-03,"Tognotti, Eugenia",Emerg Infect Dis,,,,zxe95qy9,2013-02-03,1359849600


In [5]:
df_query_dev = pd.read_csv(query_dev_data_path, sep='\t')
df_query_train = pd.read_csv(query_train_data_path, sep='\t')
display(df_query_dev)
display(df_query_train)

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
1395,14193,Residents at high risk of covid-19: effectiven...,0gn3b98n
1396,14196,"61% of teenagers hospitalized for covid were ""...",25bdifv6
1397,14203,"""fresh evidence backing melatonin against covi...",qn6wawxk
1398,14233,"the vaccine doesn't halt the spread, it is pro...",3u3i5myh


Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69
...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [6]:
df_query_combined = pd.concat([df_query_dev, df_query_train], ignore_index=True)
df_query_combined

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
14248,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
14249,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
14250,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
14251,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [7]:
df_collection['last_names'] = df_collection['authors'].str.split(';').apply(
    lambda authors: [author.split(',')[0].strip() for author in authors] if isinstance(authors, list) else authors
)
df_collection['last_names'] = df_collection['last_names'].apply(
    lambda x: '; '.join(x) if isinstance(x, list) else x
)
df_collection['publish_year'] = df_collection['publish_time'].str.split('-').str[0]

In [8]:
def initialize_bm25(corpus: list[str], cord_uids:list[str], k1=1.5, b=0.75, method='lucene', stemmer=None):
    tokenized_corpus = bm25s.tokenize(corpus, stemmer=stemmer)
    bm25 = bm25s.BM25(corpus=cord_uids, k1=k1, b=b, method=method)
    bm25.index(tokenized_corpus)
    return bm25

In [9]:
def experiment_single_bm25(df_collection, df_query, k1=1.5, b=0.75, stemmer=None, k=10):
    corpus = df_collection.apply(
        lambda x: f"{x['title']} {x['abstract']} {x['last_names']} {x['journal']} {x['publish_year']}", axis=1
    ).tolist()
    bm25 = initialize_bm25(corpus, df_collection['cord_uid'].tolist(), k1, b, stemmer=stemmer)
    tokenized_queries = bm25s.tokenize(df_query['tweet_text'], stemmer=stemmer)
    doc_scores = bm25.retrieve(tokenized_queries, n_threads=-1, k=k)
    df_query['bm25_topk'] = doc_scores.documents.tolist()
    
    return df_query

In [10]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
        print(f"{k = }")
        in_topx = data["in_topx"] > 0
        print(f"Number of queries in top {k}: {in_topx.sum()}")
        print(f"Number of queries not in top {k}: {len(data) - in_topx.sum()}")
    return d_performance

In [11]:
def evaluate_experiment(df_query_train, df_query_dev, experiment_name, list_k=[1, 5, 10]):
    results = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, train: {results}")
    results = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, dev: {results}")
    return results

In [12]:

stemmer = None
df_query_train_single = experiment_single_bm25(df_collection, df_query_train, stemmer=stemmer, k=1000)
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=1000)
evaluate_experiment(df_query_train_single, df_query_dev_single, "Single BM25 with all features", list_k=[1, 5, 10, 100, 200, 500, 700, 1000, 2000, 5000])

                                                                            

k = 1
Number of queries in top 1: 7542
Number of queries not in top 1: 5311
k = 5
Number of queries in top 5: 9094
Number of queries not in top 5: 3759
k = 10
Number of queries in top 10: 9630
Number of queries not in top 10: 3223
k = 100
Number of queries in top 100: 11156
Number of queries not in top 100: 1697
k = 200
Number of queries in top 200: 11512
Number of queries not in top 200: 1341
k = 500
Number of queries in top 500: 11956
Number of queries not in top 500: 897
k = 700
Number of queries in top 700: 12089
Number of queries not in top 700: 764
k = 1000
Number of queries in top 1000: 12210
Number of queries not in top 1000: 643
k = 2000
Number of queries in top 2000: 12210
Number of queries not in top 2000: 643
k = 5000
Number of queries in top 5000: 12210
Number of queries not in top 5000: 643
Results for Single BM25 with all features, train: {1: 0.5867890764801992, 5: 0.6347999170102959, 10: 0.6403735648153294, 100: 0.6449850219400036, 200: 0.6451818481542875, 500: 0.645294

{1: 0.5921428571428572,
 5: 0.6397619047619048,
 10: 0.6450986394557824,
 100: 0.6495240324689373,
 200: 0.6497567832248164,
 500: 0.6498352020281756,
 700: 0.6498638803520272,
 1000: 0.6498750349577517,
 2000: 0.6498750349577517,
 5000: 0.6498750349577517}

We will use BM25 as a baseline ranking model. The experiment above shouws us that there is no improvement after the first 1k (doesnt matter whether we take top 1k or top 2k etc.) -> so for each query we will prefilter the documents with BM25 to the top 1k and then the reranker will take over.

In [13]:
def initialize_reranker(model_name, torch_dtype=torch.float16):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True, torch_dtype=torch.float16
    )
    if torch.cuda.is_available():
        print("Using GPU")
        model = model.to('cuda')
    model.eval()
    return tokenizer, model

In [14]:
def rerank_with_alibaba(df_query, df_collection, tokenizer, model, k=10, max_length=512, batch_size=32):
    reranked_results = []
    reranked_scores = []
    reranked_docs = []

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk'] 

        pairs = [
            [
                query_text, 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]
        
        batch_scores = []
        batch_docs = []

        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            with torch.no_grad():
                inputs = tokenizer(batch_pairs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
                if torch.cuda.is_available():
                    inputs = {key: value.to('cuda') for key, value in inputs.items()}
                scores = model(**inputs, return_dict=True).logits.view(-1).float()

            batch_scores.extend(scores.tolist())
            batch_docs.extend(topk_docs[i:i+batch_size])


        reranked = sorted(zip(batch_docs, batch_scores), key=lambda x: x[1], reverse=True)
        reranked_results.append([doc_id for doc_id, _ in reranked[:k]])
        reranked_scores.append([score for _, score in reranked])
        reranked_docs.append([doc_id for doc_id, _ in reranked])

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_docs
    df_query['reranked_scores'] = reranked_scores
    return df_query

In [15]:
def rerank_mxbai(df_query, df_collection, model_name, k=10, batch_size=1):
    reranked_results = []    
    reranked_scores = []

    model = MxbaiRerankV2(model_name, torch_dtype=torch.float16)

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk']  # Get top-k BM25 results for the query

        documents = [
            [
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]

        results = model.rank(query_text, documents, return_documents=True, top_k=k, batch_size=batch_size)
        reranked = [topk_docs[result.index] for result in results]
        reranked_score = [result.score for result in results]

        reranked_results.append(reranked)
        reranked_scores.append(reranked_score)

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_results
    df_query['reranked_scores'] = reranked_scores
    
    return df_query

In [16]:
def evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10]):
    return get_performance_mrr(df_query, col_gold, col_pred, list_k)

## Experiments

In [None]:
df_query_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-reranker-modernbert-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Exception ignored in:                                                       <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fd9e42dfa10>>
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
                                                                       

Using GPU


Reranking:   0%|          | 12/14253 [00:06<2:06:33,  1.88it/s]

In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-multilingual-reranker-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_multilingual.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:49:21<00:00,  2.17it/s]


k = 1
Number of queries in top 1: 9026
Number of queries not in top 1: 5227
k = 5
Number of queries in top 5: 10949
Number of queries not in top 5: 3304
k = 10
Number of queries in top 10: 11361
Number of queries not in top 10: 2892
Evaluation results: {1: np.float64(0.6332701887321968), 5: np.float64(0.6880668397296476), 10: np.float64(0.6920471490825101)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=200)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-TinyBERT-L2-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_tinyBERT.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[1.2587890625, 0.71435546875, 0.4404296875, 0....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[3.333984375, 0.349853515625, 0.17138671875, 0...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[0.309814453125, -0.06854248046875, -0.1201171...",1.0
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[0.83642578125, 0.22509765625, 0.1030883789062...",1.0
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[0.22314453125, 0.08758544921875, -0.127197265...",1.0


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:21<00:00,  2.63it/s]


k = 1
Number of queries in top 1: 7438
Number of queries not in top 1: 6815
k = 5
Number of queries in top 5: 9521
Number of queries not in top 5: 4732
k = 10
Number of queries in top 10: 10215
Number of queries not in top 10: 4038
Evaluation results: {1: np.float64(0.5218550480600576), 5: np.float64(0.5785320049580205), 10: np.float64(0.5850188876972712)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=100)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[7.078125, 6.046875, 5.74609375, 5.4140625, 5....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[6.75390625, 1.7626953125, 1.494140625, 1.3847...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[0.5537109375, -0.328857421875, -0.48095703125...",0.333333
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[-0.0164794921875, -1.2412109375, -1.251953125...",0.333333
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[4.51171875, 4.36328125, 4.16015625, 2.578125,...",0.5


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:00<00:00,  2.64it/s]


k = 1
Number of queries in top 1: 7613
Number of queries not in top 1: 6640
k = 5
Number of queries in top 5: 9648
Number of queries not in top 5: 4605
k = 10
Number of queries in top 10: 10355
Number of queries not in top 10: 3898
Evaluation results: {1: np.float64(0.5341331649477303), 5: np.float64(0.5887462288640988), 10: np.float64(0.5954533659858855)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=40)
model_name = "cross-encoder/ms-marco-electra-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet("data/reranked_results_electra.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:20:58<00:00,  2.93it/s]


k = 1
Number of queries in top 1: 7032
Number of queries not in top 1: 7221
k = 5
Number of queries in top 5: 9334
Number of queries not in top 5: 4919
k = 10
Number of queries in top 10: 10228
Number of queries not in top 10: 4025
Evaluation results: {1: np.float64(0.49336981688065673), 5: np.float64(0.5558163193713604), 10: np.float64(0.5641402422659)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=25)
df_query = rerank_mxbai(df_query_dev_single, df_collection, "mixedbread-ai/mxbai-rerank-base-v2", batch_size=1)
df_query.to_parquet("data/reranked_results_mxbai.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Reranking:   0%|          | 0/14253 [00:00<?, ?it/s]                        You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Reranking: 100%|██████████| 14253/14253 [4:14:34<00:00,  1.07s/it]  


k = 1
Number of queries in top 1: 9784
Number of queries not in top 1: 4469
k = 5
Number of queries in top 5: 11050
Number of queries not in top 5: 3203
k = 10
Number of queries in top 10: 11274
Number of queries not in top 10: 2979
Evaluation results: {1: np.float64(0.6864519750228022), 5: np.float64(0.7231974555064431), 10: np.float64(0.7253506362904384)}


## Finetune

In [17]:
from datasets import Dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss, LambdaLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer


In [18]:
df_collection['answer'] = df_collection.apply(
    lambda row: f"{row['title']} {row['abstract']} {row['last_names']} {row['journal']}", axis=1
)
df_collection.head(3)   

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet,last_names,publish_year,answer
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600,van der Sande; Teunis; Sabel,2008,Professional and Home-Made Face Masks Reduce E...
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800,Li; Blakeley; Smith?,2011,The Failure of R (0) The basic reproductive ra...
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000,Singh; Sharma; Patel,2012,Pulmonary sequelae in a patient recovered from...


In [19]:
df_train = df_query_train.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_dev = df_query_dev.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_full = pd.concat([df_train, df_dev], ignore_index=True)
df_full.head(3)

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,answer
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,"[htlvpvz5, h7hj64q5, 4aps0kvp, 5tkyir3r, 32z7b...",1.0,Oral Management in Rehabilitation Medicine: Or...
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,"[maj8r6ti, bjvg2ivr, 7tto4hr7, 2cwvga0k, 46je8...",0.003145,Variation in racial/ethnic disparities in COVI...
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"[jtwb17u8, veeavho5, jbpmbm9m, 8hkxbxz9, 32v44...",1.0,Effect of non-pharmaceutical interventions for...


In [20]:
train_dataset = Dataset.from_dict({
    "query": df_train['tweet_text'].tolist(),
    "document": df_train['answer'].tolist(),
})

dev_dataset = Dataset.from_dict({
    "query": df_dev['tweet_text'].tolist(),
    "document": df_dev['answer'].tolist(),
})
full_dataset = Dataset.from_dict({
    "query": df_full['tweet_text'].tolist(),
    "document": df_full['answer'].tolist(),
})

In [21]:
def get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives, num_hard_negatives_eval, train_dataset, dev_dataset, df_full, batch_size=4096, loss='crossentropy'):

    embedding_model = SentenceTransformer(embedding_model_hard_negatives, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    # hard_train_dataset = mine_hard_negatives(
    #     train_dataset,
    #     embedding_model,
    #     num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
    #     range_min=3,  # Skip the x most similar samples
    #     range_max=100,  # Consider only the x most similar samples
    #     max_score=0.95,  # Only consider samples with a similarity score of at most x
    #     margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
    #     sampling_strategy="top",  # Randomly sample negatives from the range
    #     batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
    #     output_format="labeled-pair",  # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
    # )

    # hard_eval_dataset = mine_hard_negatives(
    #     dev_dataset,
    #     embedding_model,
    #     corpus=df_full["answer"],  # Use the full dataset as the corpus
    #     num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
    #     batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
    #     output_format="n-tuple",  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
    #     include_positives=True,  # Key: Include the positive answer in the list of negatives
    #     range_min=3,  # Skip the x most similar samples
    #     range_max=100,  # Consider only the x most similar samples
    #     max_score=0.95,  # Only consider samples with a similarity score of at most x
    #     margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
    #     sampling_strategy="top",  # Randomly sample negatives from the range
    # )
    if loss == 'crossentropy':
        output_format = "labeled-pair"
    elif loss == 'lambda':
        output_format = "labeled-list"
    else:
        raise ValueError("Invalid loss type. Use 'crossentropy' or 'lambda'.")
    hard_train_dataset = mine_hard_negatives(
        train_dataset,
        embedding_model,
        num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
        range_min=5,  # Skip the x most similar samples
        range_max=100,  # Consider only the x most similar samples
        max_score=0.8,  # Only consider samples with a similarity score of at most x
        margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
        sampling_strategy="top",  # Randomly sample negatives from the range
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        output_format=output_format,  # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
    )

    print("======================")
    hard_eval_dataset = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        output_format=output_format,  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
    )

    print("======================")
    hard_eval_dataset_evaluator = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        include_positives=True,  # Key: Include the positive answer in the list of negatives
    )

    return hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator


class ExperimentRunner:
    def __init__(self, model_to_finetune, train_batch_size, num_epochs, max_length, experiment_name):
        self.experiment_name = experiment_name
        self.model_to_finetune = model_to_finetune
        self.train_batch_size = train_batch_size
        self.num_epochs = num_epochs
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def run_experiment(self, hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, trust_remote_code=False, eval_steps=None, loss='crossentropy'):

        if eval_steps is None:
            eval_steps = int(3000/self.train_batch_size)
        
        print(f"Eval steps: {eval_steps}")
        model = CrossEncoder(self.model_to_finetune, max_length=self.max_length, device=self.device, trust_remote_code=trust_remote_code)

        args = CrossEncoderTrainingArguments(
            # Required parameter:
            output_dir=f"model/{self.model_to_finetune.split("/")[-1]}{self.experiment_name}",
            # Optional training parameters:
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.train_batch_size,
            per_device_eval_batch_size=self.train_batch_size,
            learning_rate=2e-6,
            warmup_ratio=0.1,
            fp16=True,  
            bf16=False, 
            batch_sampler=BatchSamplers.NO_DUPLICATES, 
            eval_strategy="steps",
            eval_steps=eval_steps,
            save_strategy="steps",
            save_steps=eval_steps,
            save_total_limit=10,
            logging_steps=eval_steps,
            run_name=self.model_to_finetune.split("/")[-1],
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            eval_on_start=True,
        )
        
        reranking_evaluator = CrossEncoderRerankingEvaluator(
            samples=[
                {
                    "query": sample["query"],
                    "positive": [sample["document"]],
                    "documents": [sample[column_name] for column_name in hard_eval_dataset_evaluator.column_names if 'negative' in column_name],
                }
                for sample in hard_eval_dataset_evaluator
            ],
            batch_size=self.train_batch_size,
            name="dev_set",
            show_progress_bar=True,
        )

        if loss == 'crossentropy':
            loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))
        elif loss == 'lambda':
            loss = LambdaLoss(model=model, k=5)
        else:
            raise ValueError("Invalid loss type. Use 'crossentropy' or 'lambda'.")
        
        trainer = CrossEncoderTrainer(
            model=model,
            args=args,
            train_dataset=hard_train_dataset,
            eval_dataset=hard_eval_dataset,
            loss=loss,
            evaluator=reranking_evaluator,
    
        )

        # Train the model
        trainer.train()

        model.save(f"models/{self.model_to_finetune.split('/')[-1]}-finetuned{self.experiment_name}")

In [22]:
num_epochs = 2
# embedding_model_hard_negatives = "sentence-transformers/static-retrieval-mrl-en-v1"
# embedding_model_hard_negatives = "sentence-transformers/all-MiniLM-L6-v2"
# embedding_model_hard_negatives = "sentence-transformers/all-MiniLM-L12-v2"
embedding_model_hard_negatives = "BAAI/bge-small-en-v1.5"
loss = 'lambda'


experiment_name = f"_{embedding_model_hard_negatives.split('/')[-1]}_epochs{num_epochs}_{loss}loss"

In [30]:
num_hard_negatives = 5
hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives=num_hard_negatives, num_hard_negatives_eval=5, train_dataset=train_dataset, dev_dataset=dev_dataset, df_full=df_full, batch_size=64, loss=loss)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:42<00:00,  2.57it/s]
Batches: 100%|██████████| 201/201 [00:10<00:00, 20.10it/s]


Metric       Positive       Negative     Difference
Count          12,853         39,144               
Mean           0.8195         0.7622         0.0930
Median         0.8329         0.7814         0.0832
Std            0.0826         0.0433         0.0395
Min            0.3809         0.5027        -0.0655
25%            0.7696         0.7406         0.0584
50%            0.8329         0.7814         0.0832
75%            0.8834         0.7947         0.1173
Max            0.9881         0.7998         0.3004
Skipped 561,268 potential negatives (42.43%) due to the absolute_margin of 0.05.
Skipped 146,715 potential negatives (19.27%) due to the max_score of 0.8.
Could not find enough negatives for 25121 samples (39.09%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001

Batches: 100%|██████████| 121/121 [00:48<00:00,  2.52it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 21.80it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8033         0.0186
Median         0.8323         0.8101         0.0168
Std            0.0783         0.0505         0.0640
Min            0.4977         0.5529        -0.1856
25%            0.7717         0.7739        -0.0220
50%            0.8323         0.8101         0.0168
75%            0.8846         0.8388         0.0593
Max            0.9681         0.9451         0.2385
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:44<00:00,  2.73it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.31it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8117         0.0102
Median         0.8323         0.8156         0.0000
Std            0.0783         0.0543         0.0593
Min            0.4977         0.5593        -0.1856
25%            0.7717         0.7790        -0.0220
50%            0.8323         0.8156         0.0000
75%            0.8846         0.8466         0.0443
Max            0.9681         0.9681         0.2377


In [31]:
hard_train_dataset

Dataset({
    features: ['query', 'document', 'labels'],
    num_rows: 7885
})

In [32]:
hard_eval_dataset

Dataset({
    features: ['query', 'document', 'labels'],
    num_rows: 1400
})

In [33]:
hard_eval_dataset_evaluator

Dataset({
    features: ['query', 'document', 'negative_1', 'negative_2', 'negative_3', 'negative_4', 'negative_5'],
    num_rows: 1400
})

In [None]:
train_batch_size = 128
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-TinyBERT-L2-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss)

Eval steps: 22


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.365992,0.756345,0.756345,0.815318,0.568988,0.568988,0.595057
22,0.688500,1.36159,0.754798,0.754798,0.814166,0.568988,0.568988,0.595057
44,0.677600,1.346526,0.753988,0.753988,0.813541,0.568988,0.568988,0.595057
66,0.616600,1.34266,0.754012,0.754012,0.813548,0.568988,0.568988,0.595057
88,0.584200,1.340421,0.754702,0.754702,0.814057,0.568988,0.568988,0.595057
110,0.602200,1.327426,0.753988,0.753988,0.813503,0.568988,0.568988,0.595057


                                                                        

In [28]:
train_batch_size = 4
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L12-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss)

Eval steps: 750


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.317779,0.740036,0.740036,0.802646,0.568988,0.568988,0.595057
750,0.237300,1.212988,0.78375,0.78375,0.835814,0.568988,0.568988,0.595057
1500,0.088200,1.292978,0.788357,0.788357,0.839327,0.568988,0.568988,0.595057
2250,0.082200,1.354321,0.788595,0.788595,0.839466,0.568988,0.568988,0.595057
3000,0.071900,1.403692,0.78844,0.78844,0.839341,0.568988,0.568988,0.595057
3750,0.078000,1.422006,0.790524,0.790524,0.840919,0.568988,0.568988,0.595057


                                                                       

In [29]:
train_batch_size = 8
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L6-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss)

Eval steps: 375


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.235787,0.759298,0.759298,0.817261,0.568988,0.568988,0.595057
375,0.250400,1.146599,0.777667,0.777667,0.831213,0.568988,0.568988,0.595057
750,0.143700,1.216257,0.782012,0.782012,0.834577,0.568988,0.568988,0.595057
1125,0.120500,1.275048,0.78419,0.78419,0.836246,0.568988,0.568988,0.595057
1500,0.097000,1.318414,0.78556,0.78556,0.837306,0.568988,0.568988,0.595057
1875,0.108400,1.317869,0.785512,0.785512,0.837261,0.568988,0.568988,0.595057


                                                                       

In [34]:

train_batch_size = 1
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-electra-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss)

Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,,0.716131,0.716131,0.784809,0.568988,0.568988,0.595057
3000,0.275200,,0.774929,0.774929,0.829386,0.568988,0.568988,0.595057
6000,0.087500,,0.775321,0.775321,0.829746,0.568988,0.568988,0.595057
9000,0.075400,,0.775964,0.775964,0.830208,0.568988,0.568988,0.595057
12000,0.042900,,0.783214,0.783214,0.835638,0.568988,0.568988,0.595057
15000,0.040800,,0.775857,0.776214,0.83043,0.568988,0.568988,0.595057


                                                                       

In [35]:
num_epochs = 2

In [36]:
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-reranker-modernbert-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss)

Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,,0.838726,0.838726,0.878283,0.568988,0.568988,0.595057
3000,0.081700,,0.83031,0.832214,0.872487,0.568988,0.568988,0.595057
6000,0.015500,,0.647488,0.696476,0.798137,0.568988,0.568988,0.595057
9000,0.020400,,0.7075,0.754286,0.823683,0.568988,0.568988,0.595057
12000,0.007500,,0.717286,0.761036,0.828197,0.568988,0.568988,0.595057
15000,0.010500,,0.726417,0.768083,0.83095,0.568988,0.568988,0.595057


                                                                       

In [23]:
num_hard_negatives = 2
num_hard_negatives_eval = 2

hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives=num_hard_negatives, num_hard_negatives_eval=num_hard_negatives_eval, train_dataset=train_dataset, dev_dataset=dev_dataset, df_full=df_full, loss=loss, batch_size=64)

train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-multilingual-reranker-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, trust_remote_code=True, loss=loss)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:41<00:00,  2.65it/s]
Batches: 100%|██████████| 201/201 [00:09<00:00, 21.41it/s]


Metric       Positive       Negative     Difference
Count          12,853         15,746               
Mean           0.8195         0.7637         0.0914
Median         0.8329         0.7830         0.0814
Std            0.0826         0.0427         0.0388
Min            0.3809         0.5047        -0.0924
25%            0.7696         0.7428         0.0570
50%            0.8329         0.7830         0.0814
75%            0.8834         0.7956         0.1156
Max            0.9881         0.7998         0.2959
Skipped 561,379 potential negatives (42.44%) due to the absolute_margin of 0.05.
Skipped 146,681 potential negatives (19.27%) due to the max_score of 0.8.
Could not find enough negatives for 9960 samples (38.75%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 4 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 

Batches: 100%|██████████| 121/121 [00:45<00:00,  2.64it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.23it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          2,800               
Mean           0.8219         0.8132         0.0087
Median         0.8323         0.8191         0.0069
Std            0.0783         0.0501         0.0625
Min            0.4977         0.5707        -0.1856
25%            0.7717         0.7844        -0.0299
50%            0.8323         0.8192         0.0069
75%            0.8846         0.8478         0.0478
Max            0.9681         0.9451         0.2158
Setting range_max to 4 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:45<00:00,  2.64it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.19it/s]


Metric       Positive       Negative     Difference
Count           1,400          2,800               
Mean           0.8219         0.8294        -0.0075
Median         0.8323         0.8332        -0.0000
Std            0.0783         0.0559         0.0501
Min            0.4977         0.5787        -0.1856
25%            0.7717         0.7953        -0.0299
50%            0.8323         0.8332        -0.0000
75%            0.8846         0.8678         0.0004
Max            0.9681         0.9681         0.1977
Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.872254,0.874524,0.874524,0.906698,0.543929,0.543929,0.554684
3000,0.226200,2.880633,0.858929,0.858929,0.895107,0.543929,0.543929,0.554684
6000,0.095300,3.151024,0.857024,0.857024,0.893594,0.543929,0.543929,0.554684
9000,0.051500,3.20767,0.863929,0.863929,0.898713,0.543929,0.543929,0.554684
12000,0.037800,3.293346,0.86631,0.86631,0.900465,0.543929,0.543929,0.554684
15000,0.018200,3.282988,0.867024,0.867024,0.901009,0.543929,0.543929,0.554684


                                                                       

In [24]:
df_query_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-reranker-modernbert-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:55<00:00,  3.37it/s]

k = 1
Number of queries in top 1: 951
Number of queries not in top 1: 449
k = 5
Number of queries in top 5: 1094
Number of queries not in top 5: 306
k = 10
Number of queries in top 10: 1120
Number of queries not in top 10: 280
Evaluation results: {1: 0.6792857142857143, 5: 0.7203571428571428, 10: 0.7227089002267574}





In [25]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-multilingual-reranker-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_multilingual_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:03<00:00,  3.86it/s]

k = 1
Number of queries in top 1: 890
Number of queries not in top 1: 510
k = 5
Number of queries in top 5: 1088
Number of queries not in top 5: 312
k = 10
Number of queries in top 10: 1118
Number of queries not in top 10: 282
Evaluation results: {1: 0.6357142857142857, 5: 0.693952380952381, 10: 0.6968236961451246}





In [26]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L12-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM12_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [02:27<00:00,  9.48it/s]

k = 1
Number of queries in top 1: 874
Number of queries not in top 1: 526
k = 5
Number of queries in top 5: 1036
Number of queries not in top 5: 364
k = 10
Number of queries in top 10: 1088
Number of queries not in top 10: 312
Evaluation results: {1: 0.6242857142857143, 5: 0.6715, 10: 0.6765413832199546}





In [27]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L6-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM6_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [01:45<00:00, 13.24it/s]

k = 1
Number of queries in top 1: 861
Number of queries not in top 1: 539
k = 5
Number of queries in top 5: 1037
Number of queries not in top 5: 363
k = 10
Number of queries in top 10: 1090
Number of queries not in top 10: 310
Evaluation results: {1: 0.615, 5: 0.6652380952380952, 10: 0.6704767573696144}





In [28]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-TinyBERT-L2-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_tinyBERT_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [01:09<00:00, 20.11it/s]

k = 1
Number of queries in top 1: 794
Number of queries not in top 1: 606
k = 5
Number of queries in top 5: 995
Number of queries not in top 5: 405
k = 10
Number of queries in top 10: 1059
Number of queries not in top 10: 341
Evaluation results: {1: 0.5671428571428572, 5: 0.6222619047619047, 10: 0.6285807823129252}





In [29]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-electra-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet(f"data/reranked_results_electra_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:38<00:00,  4.14it/s]

k = 1
Number of queries in top 1: 872
Number of queries not in top 1: 528
k = 5
Number of queries in top 5: 1055
Number of queries not in top 5: 345
k = 10
Number of queries in top 10: 1106
Number of queries not in top 10: 294
Evaluation results: {1: 0.6228571428571429, 5: 0.6750357142857143, 10: 0.6799353741496599}



