In [1]:
import pandas as pd
import bm25s
from pathlib import Path
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from mxbai_rerank import MxbaiRerankV2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [3]:
data_path = Path('data/')

collection_data_path = data_path / 'subtask4b_collection_data.pkl' 
query_dev_data_path = data_path / 'subtask4b_query_tweets_dev.tsv'
query_train_data_path = data_path / 'subtask4b_query_tweets_train.tsv'

In [4]:
df_collection = pd.read_pickle(collection_data_path)
df_collection.head(5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000
993,ycxyn2a2,PMC,What was the primary mode of smallpox transmis...,10.3389/fcimb.2012.00150,PMC3509329,23226686,cc-by,The mode of infection transmission has profoun...,2012-11-29,"Milton, Donald K.",Front Cell Infect Microbiol,,,,ycxyn2a2,2012-11-29,1354147200
1053,zxe95qy9,PMC,"Lessons from the History of Quarantine, from P...",10.3201/eid1902.120312,PMC3559034,23343512,no-cc,"In the new millennium, the centuries-old strat...",2013-02-03,"Tognotti, Eugenia",Emerg Infect Dis,,,,zxe95qy9,2013-02-03,1359849600


In [5]:
df_query_dev = pd.read_csv(query_dev_data_path, sep='\t')
df_query_train = pd.read_csv(query_train_data_path, sep='\t')
display(df_query_dev)
display(df_query_train)

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
1395,14193,Residents at high risk of covid-19: effectiven...,0gn3b98n
1396,14196,"61% of teenagers hospitalized for covid were ""...",25bdifv6
1397,14203,"""fresh evidence backing melatonin against covi...",qn6wawxk
1398,14233,"the vaccine doesn't halt the spread, it is pro...",3u3i5myh


Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69
...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [6]:
df_query_combined = pd.concat([df_query_dev, df_query_train], ignore_index=True)
df_query_combined

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
14248,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
14249,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
14250,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
14251,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [7]:
df_collection['last_names'] = df_collection['authors'].str.split(';').apply(
    lambda authors: [author.split(',')[0].strip() for author in authors] if isinstance(authors, list) else authors
)
df_collection['last_names'] = df_collection['last_names'].apply(
    lambda x: '; '.join(x) if isinstance(x, list) else x
)
df_collection['publish_year'] = df_collection['publish_time'].str.split('-').str[0]

In [8]:
def initialize_bm25(corpus: list[str], cord_uids:list[str], k1=1.5, b=0.75, method='lucene', stemmer=None):
    tokenized_corpus = bm25s.tokenize(corpus, stemmer=stemmer)
    bm25 = bm25s.BM25(corpus=cord_uids, k1=k1, b=b, method=method)
    bm25.index(tokenized_corpus)
    return bm25

In [9]:
def experiment_single_bm25(df_collection, df_query, k1=1.5, b=0.75, stemmer=None, k=10):
    corpus = df_collection.apply(
        lambda x: f"{x['title']} {x['abstract']} {x['last_names']} {x['journal']} {x['publish_year']}", axis=1
    ).tolist()
    bm25 = initialize_bm25(corpus, df_collection['cord_uid'].tolist(), k1, b, stemmer=stemmer)
    tokenized_queries = bm25s.tokenize(df_query['tweet_text'], stemmer=stemmer)
    doc_scores = bm25.retrieve(tokenized_queries, n_threads=-1, k=k)
    df_query['bm25_topk'] = doc_scores.documents.tolist()
    
    return df_query

In [10]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
        print(f"{k = }")
        in_topx = data["in_topx"] > 0
        print(f"Number of queries in top {k}: {in_topx.sum()}")
        print(f"Number of queries not in top {k}: {len(data) - in_topx.sum()}")
    return d_performance

In [11]:
def evaluate_experiment(df_query_train, df_query_dev, experiment_name, list_k=[1, 5, 10]):
    results = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, train: {results}")
    results = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, dev: {results}")
    return results

In [12]:

stemmer = None
df_query_train_single = experiment_single_bm25(df_collection, df_query_train, stemmer=stemmer, k=1000)
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=1000)
evaluate_experiment(df_query_train_single, df_query_dev_single, "Single BM25 with all features", list_k=[1, 5, 10, 100, 200, 500, 700, 1000, 2000, 5000])

                                                                            

k = 1
Number of queries in top 1: 7542
Number of queries not in top 1: 5311
k = 5
Number of queries in top 5: 9094
Number of queries not in top 5: 3759
k = 10
Number of queries in top 10: 9630
Number of queries not in top 10: 3223
k = 100
Number of queries in top 100: 11156
Number of queries not in top 100: 1697
k = 200
Number of queries in top 200: 11512
Number of queries not in top 200: 1341
k = 500
Number of queries in top 500: 11956
Number of queries not in top 500: 897
k = 700
Number of queries in top 700: 12089
Number of queries not in top 700: 764
k = 1000
Number of queries in top 1000: 12210
Number of queries not in top 1000: 643
k = 2000
Number of queries in top 2000: 12210
Number of queries not in top 2000: 643
k = 5000
Number of queries in top 5000: 12210
Number of queries not in top 5000: 643
Results for Single BM25 with all features, train: {1: 0.5867890764801992, 5: 0.6347999170102959, 10: 0.6403735648153294, 100: 0.6449850219400036, 200: 0.6451818481542875, 500: 0.645294

{1: 0.5921428571428572,
 5: 0.6397619047619048,
 10: 0.6450986394557824,
 100: 0.6495240324689373,
 200: 0.6497567832248164,
 500: 0.6498352020281756,
 700: 0.6498638803520272,
 1000: 0.6498750349577517,
 2000: 0.6498750349577517,
 5000: 0.6498750349577517}

We will use BM25 as a baseline ranking model. The experiment above shouws us that there is no improvement after the first 1k (doesnt matter whether we take top 1k or top 2k etc.) -> so for each query we will prefilter the documents with BM25 to the top 1k and then the reranker will take over.

In [13]:
def initialize_reranker(model_name, torch_dtype=torch.float16):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True, torch_dtype=torch.float16
    )
    if torch.cuda.is_available():
        print("Using GPU")
        model = model.to('cuda')
    model.eval()
    return tokenizer, model

In [14]:
def rerank_with_alibaba(df_query, df_collection, tokenizer, model, k=10, max_length=512, batch_size=32):
    reranked_results = []
    reranked_scores = []
    reranked_docs = []

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk'] 

        pairs = [
            [
                query_text, 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]
        
        batch_scores = []
        batch_docs = []

        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            with torch.no_grad():
                inputs = tokenizer(batch_pairs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
                if torch.cuda.is_available():
                    inputs = {key: value.to('cuda') for key, value in inputs.items()}
                scores = model(**inputs, return_dict=True).logits.view(-1).float()

            batch_scores.extend(scores.tolist())
            batch_docs.extend(topk_docs[i:i+batch_size])


        reranked = sorted(zip(batch_docs, batch_scores), key=lambda x: x[1], reverse=True)
        reranked_results.append([doc_id for doc_id, _ in reranked[:k]])
        reranked_scores.append([score for _, score in reranked])
        reranked_docs.append([doc_id for doc_id, _ in reranked])

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_docs
    df_query['reranked_scores'] = reranked_scores
    return df_query

In [15]:
def rerank_mxbai(df_query, df_collection, model_name, k=10, batch_size=1):
    reranked_results = []    
    reranked_scores = []

    model = MxbaiRerankV2(model_name, torch_dtype=torch.float16)

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk']  # Get top-k BM25 results for the query

        documents = [
            [
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]

        results = model.rank(query_text, documents, return_documents=True, top_k=k, batch_size=batch_size)
        reranked = [topk_docs[result.index] for result in results]
        reranked_score = [result.score for result in results]

        reranked_results.append(reranked)
        reranked_scores.append(reranked_score)

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_results
    df_query['reranked_scores'] = reranked_scores
    
    return df_query

In [16]:
def evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10]):
    return get_performance_mrr(df_query, col_gold, col_pred, list_k)

## Experiments

In [17]:
df_query_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-reranker-modernbert-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [2:04:03<00:00,  1.91it/s] 


k = 1
Number of queries in top 1: 9658
Number of queries not in top 1: 4595
k = 5
Number of queries in top 5: 11240
Number of queries not in top 5: 3013
k = 10
Number of queries in top 10: 11565
Number of queries not in top 10: 2688
Evaluation results: {1: np.float64(0.6776117308636779), 5: np.float64(0.7225660094950771), 10: np.float64(0.7256685030943082)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-multilingual-reranker-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_multilingual.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:49:21<00:00,  2.17it/s]


k = 1
Number of queries in top 1: 9026
Number of queries not in top 1: 5227
k = 5
Number of queries in top 5: 10949
Number of queries not in top 5: 3304
k = 10
Number of queries in top 10: 11361
Number of queries not in top 10: 2892
Evaluation results: {1: np.float64(0.6332701887321968), 5: np.float64(0.6880668397296476), 10: np.float64(0.6920471490825101)}


In [19]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=200)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-TinyBERT-L2-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_tinyBERT.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[1.2587890625, 0.71435546875, 0.4404296875, 0....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[3.333984375, 0.349853515625, 0.17138671875, 0...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[0.309814453125, -0.06854248046875, -0.1201171...",1.0
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[0.83642578125, 0.22509765625, 0.1030883789062...",1.0
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[0.22314453125, 0.08758544921875, -0.127197265...",1.0


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:21<00:00,  2.63it/s]


k = 1
Number of queries in top 1: 7438
Number of queries not in top 1: 6815
k = 5
Number of queries in top 5: 9521
Number of queries not in top 5: 4732
k = 10
Number of queries in top 10: 10215
Number of queries not in top 10: 4038
Evaluation results: {1: np.float64(0.5218550480600576), 5: np.float64(0.5785320049580205), 10: np.float64(0.5850188876972712)}


In [20]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=100)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[7.078125, 6.046875, 5.74609375, 5.4140625, 5....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[6.75390625, 1.7626953125, 1.494140625, 1.3847...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[0.5537109375, -0.328857421875, -0.48095703125...",0.333333
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[-0.0164794921875, -1.2412109375, -1.251953125...",0.333333
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[4.51171875, 4.36328125, 4.16015625, 2.578125,...",0.5


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:00<00:00,  2.64it/s]


k = 1
Number of queries in top 1: 7613
Number of queries not in top 1: 6640
k = 5
Number of queries in top 5: 9648
Number of queries not in top 5: 4605
k = 10
Number of queries in top 10: 10355
Number of queries not in top 10: 3898
Evaluation results: {1: np.float64(0.5341331649477303), 5: np.float64(0.5887462288640988), 10: np.float64(0.5954533659858855)}


In [21]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=40)
model_name = "cross-encoder/ms-marco-electra-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet("data/reranked_results_electra.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:20:58<00:00,  2.93it/s]


k = 1
Number of queries in top 1: 7032
Number of queries not in top 1: 7221
k = 5
Number of queries in top 5: 9334
Number of queries not in top 5: 4919
k = 10
Number of queries in top 10: 10228
Number of queries not in top 10: 4025
Evaluation results: {1: np.float64(0.49336981688065673), 5: np.float64(0.5558163193713604), 10: np.float64(0.5641402422659)}


In [18]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=25)
df_query = rerank_mxbai(df_query_dev_single, df_collection, "mixedbread-ai/mxbai-rerank-base-v2", batch_size=1)
df_query.to_parquet("data/reranked_results_mxbai.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Reranking:   0%|          | 0/14253 [00:00<?, ?it/s]                        You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Reranking: 100%|██████████| 14253/14253 [4:14:34<00:00,  1.07s/it]  


k = 1
Number of queries in top 1: 9784
Number of queries not in top 1: 4469
k = 5
Number of queries in top 5: 11050
Number of queries not in top 5: 3203
k = 10
Number of queries in top 10: 11274
Number of queries not in top 10: 2979
Evaluation results: {1: np.float64(0.6864519750228022), 5: np.float64(0.7231974555064431), 10: np.float64(0.7253506362904384)}


## Finetune

In [17]:
from datasets import Dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer


In [18]:
df_collection['answer'] = df_collection.apply(
    lambda row: f"{row['title']} {row['abstract']} {row['last_names']} {row['journal']}", axis=1
)
df_collection.head(3)   

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet,last_names,publish_year,answer
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600,van der Sande; Teunis; Sabel,2008,Professional and Home-Made Face Masks Reduce E...
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800,Li; Blakeley; Smith?,2011,The Failure of R (0) The basic reproductive ra...
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000,Singh; Sharma; Patel,2012,Pulmonary sequelae in a patient recovered from...


In [19]:
df_train = df_query_train.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_dev = df_query_dev.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_full = pd.concat([df_train, df_dev], ignore_index=True)
df_full.head(3)

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,answer
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,"[htlvpvz5, h7hj64q5, 4aps0kvp, 5tkyir3r, 32z7b...",1.0,Oral Management in Rehabilitation Medicine: Or...
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,"[maj8r6ti, bjvg2ivr, 7tto4hr7, 2cwvga0k, 46je8...",0.003145,Variation in racial/ethnic disparities in COVI...
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"[jtwb17u8, veeavho5, jbpmbm9m, 8hkxbxz9, 32v44...",1.0,Effect of non-pharmaceutical interventions for...


In [20]:
train_dataset = Dataset.from_dict({
    "query": df_train['tweet_text'].tolist(),
    "document": df_train['answer'].tolist(),
})

dev_dataset = Dataset.from_dict({
    "query": df_dev['tweet_text'].tolist(),
    "document": df_dev['answer'].tolist(),
})
full_dataset = Dataset.from_dict({
    "query": df_full['tweet_text'].tolist(),
    "document": df_full['answer'].tolist(),
})

In [24]:
class ExperimentRunner:
    def __init__(self, embedding_model_hard_negatives, model_to_finetune, train_batch_size, num_epochs, max_length, num_hard_negatives, num_hard_negatives_eval):
        self.embedding_model_hard_negatives = embedding_model_hard_negatives
        self.model_to_finetune = model_to_finetune
        self.train_batch_size = train_batch_size
        self.num_epochs = num_epochs
        self.max_length = max_length
        self.num_hard_negatives = num_hard_negatives
        self.num_hard_negatives_eval = num_hard_negatives_eval
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def run_experiment(self, train_dataset, dev_dataset, df_full, trust_remote_code=False):
        embedding_model = SentenceTransformer(self.embedding_model_hard_negatives, device=self.device)

        hard_train_dataset = mine_hard_negatives(
            train_dataset,
            embedding_model,
            num_negatives=self.num_hard_negatives,  # How many negatives per question-answer pair
            range_min=5,  # Skip the x most similar samples
            range_max=100,  # Consider only the x most similar samples
            max_score=0.8,  # Only consider samples with a similarity score of at most x
            margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
            sampling_strategy="top",  # Randomly sample negatives from the range
            batch_size=4096,  # Use a batch size of 4096 for the embedding model
            output_format="labeled-pair",  # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
        )

        hard_eval_dataset = mine_hard_negatives(
            dev_dataset,
            embedding_model,
            corpus=df_full["answer"],  # Use the full dataset as the corpus
            num_negatives=self.num_hard_negatives_eval,  # How many negatives per question-answer pair
            batch_size=4096,  # Use a batch size of 4096 for the embedding model
            output_format="n-tuple",  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
            include_positives=True,  # Key: Include the positive answer in the list of negatives
        )
        
        model = CrossEncoder(self.model_to_finetune, max_length=self.max_length, device=self.device, trust_remote_code=trust_remote_code)

        args = CrossEncoderTrainingArguments(
            # Required parameter:
            output_dir=f"model/{self.model_to_finetune.split("/")[-1]}",
            # Optional training parameters:
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.train_batch_size,
            per_device_eval_batch_size=self.train_batch_size,
            learning_rate=2e-5,
            warmup_ratio=0.1,
            fp16=True,  
            bf16=False, 
            batch_sampler=BatchSamplers.NO_DUPLICATES, 
            eval_strategy="steps",
            eval_steps=20000/self.train_batch_size,
            save_strategy="steps",
            save_steps=20000/self.train_batch_size,
            save_total_limit=10,
            # logging_steps=2000/self.train_batch_size,
            run_name=self.model_to_finetune.split("/")[-1],
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
        )
        
        reranking_evaluator = CrossEncoderRerankingEvaluator(
            samples=[
                {
                    "query": sample["query"],
                    "positive": [sample["document"]],
                    "documents": [sample[column_name] for column_name in hard_eval_dataset.column_names if 'negative' in column_name],
                }
                for sample in hard_eval_dataset
            ],
            batch_size=self.train_batch_size,
            name="dev_set",
            show_progress_bar=True,
        )
        # loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(self.num_hard_negatives))
        loss = MultipleNegativesRankingLoss(model=model, num_negatives=self.num_hard_negatives)

        trainer = CrossEncoderTrainer(
            model=model,
            args=args,
            train_dataset=hard_train_dataset,
            eval_dataset=dev_dataset,
            loss=loss,
            evaluator=reranking_evaluator,
    
        )

        # Train the model
        trainer.train()

        model.save(f"models/{self.model_to_finetune.split('/')[-1]}-finetuned")

In [25]:
num_epochs = 2
embedding_model_hard_negatives = "sentence-transformers/static-retrieval-mrl-en-v1"

In [26]:
num_hard_negatives = 5
num_hard_negatives_eval = 5
train_batch_size = 32
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-TinyBERT-L2-v2"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.59it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 25.29it/s]


Metric       Positive       Negative     Difference
Count          12,853         45,823               
Mean           0.5263         0.4702         0.1262
Median         0.5440         0.4759         0.0969
Std            0.1639         0.1055         0.0809
Min           -0.0783         0.1255        -0.2308
25%            0.4171         0.3980         0.0648
50%            0.5440         0.4759         0.0969
75%            0.6514         0.5476         0.1625
Max            0.9287         0.7643         0.6667
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 18442 samples (28.70%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.84it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.5248         0.5402        -0.0154
Median         0.5435         0.5493        -0.0000
Std            0.1603         0.1151         0.1361
Min            0.0295         0.1618        -0.6029
25%            0.4099         0.4616        -0.0925
50%            0.5438         0.5493        -0.0000
75%            0.6468         0.6234         0.0614
Max            0.8905         0.8905         0.5734


Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
625,1.625,0.338719,0.764464,0.764464,0.821146,0.465655,0.465655,0.491332
1250,1.5489,0.261719,0.767857,0.767857,0.823813,0.465655,0.465655,0.491332
1875,1.5338,0.21333,0.77575,0.77575,0.829801,0.465655,0.465655,0.491332
2500,1.5132,0.208573,0.778143,0.778143,0.831634,0.465655,0.465655,0.491332
3125,1.511,0.206474,0.777893,0.777893,0.831443,0.465655,0.465655,0.491332


                                                                        

In [27]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=20)
display(df_query_dev_single.head(5))
model_name = "models/ms-marco-TinyBERT-L2-v2-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Split strings:   0%|          | 0/7718 [00:00<?, ?it/s]

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.012658
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",0.5
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",1.0
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",0.2


Using GPU


Reranking: 100%|██████████| 1400/1400 [00:47<00:00, 29.42it/s]

k = 1
Number of queries in top 1: 831
Number of queries not in top 1: 569
k = 5
Number of queries in top 5: 1023
Number of queries not in top 5: 377
k = 10
Number of queries in top 10: 1072
Number of queries not in top 10: 328
Evaluation results: {1: 0.5935714285714285, 5: 0.6484761904761904, 10: 0.6532862811791383}





In [28]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=20)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-TinyBERT-L2-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[atji1xge, 8t2tic9n, 59up4v56, o4vvlmr4, styav...","[atji1xge, 8t2tic9n, 59up4v56, o4vvlmr4, styav...","[2.712890625, 2.529296875, 2.423828125, 1.1914..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, 8n4zf9oo, f0kcf2y0, yrowv62k, jswlq...","[r58aohnu, 8n4zf9oo, f0kcf2y0, yrowv62k, jswlq...","[3.83984375, -0.86962890625, -1.142578125, -1...."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, o47v5vgw, gruir7aw, cgc0v1dg, o877u...","[sts48u9i, o47v5vgw, gruir7aw, cgc0v1dg, o877u...","[0.96044921875, 0.75927734375, 0.7197265625, 0..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",1.0,"[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, z795y...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, z795y...","[3.14453125, 1.6796875, 0.9404296875, 0.323730..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",1.0,"[ybwwmyqy, vabb2f26, sxx3yid9, ouvq2wpq, 3l6ip...","[ybwwmyqy, vabb2f26, sxx3yid9, ouvq2wpq, 3l6ip...","[4.60546875, 3.138671875, 2.51953125, 2.341796..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [00:47<00:00, 29.57it/s]


k = 1
Number of queries in top 1: 794
Number of queries not in top 1: 606
k = 5
Number of queries in top 5: 1007
Number of queries not in top 5: 393
k = 10
Number of queries in top 10: 1061
Number of queries not in top 10: 339
Evaluation results: {1: 0.5671428571428572, 5: 0.6254047619047619, 10: 0.6306621315192744}


In [29]:
num_hard_negatives = 5
num_hard_negatives_eval = 5
train_batch_size = 4
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L12-v2"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.66it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 25.92it/s]


Metric       Positive       Negative     Difference
Count          12,853         45,823               
Mean           0.5263         0.4702         0.1262
Median         0.5440         0.4759         0.0969
Std            0.1639         0.1055         0.0809
Min           -0.0783         0.1255        -0.2308
25%            0.4171         0.3980         0.0648
50%            0.5440         0.4759         0.0969
75%            0.6514         0.5476         0.1625
Max            0.9287         0.7643         0.6667
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 18442 samples (28.70%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.14it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.5248         0.5402        -0.0154
Median         0.5435         0.5493        -0.0000
Std            0.1603         0.1151         0.1361
Min            0.0295         0.1618        -0.6029
25%            0.4099         0.4616        -0.0925
50%            0.5438         0.5493        -0.0000
75%            0.6468         0.6234         0.0614
Max            0.8905         0.8905         0.5734


Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
5000,1.196,0.166516,0.814619,0.814619,0.859518,0.465655,0.465655,0.491332
10000,1.1742,0.185563,0.824905,0.824905,0.867175,0.465655,0.465655,0.491332
15000,1.1725,0.127704,0.828429,0.828429,0.869995,0.465655,0.465655,0.491332
20000,1.1609,0.12567,0.831214,0.831214,0.87205,0.465655,0.465655,0.491332
25000,1.1115,0.134499,0.838464,0.838464,0.877534,0.465655,0.465655,0.491332


                                                                       

In [30]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=20)
display(df_query_dev_single.head(5))
model_name = "models/ms-marco-MiniLM-L12-v2-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM12_finetuned.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[atji1xge, trrg1mnw, o4vvlmr4, 8t2tic9n, dcfqv...","[atji1xge, trrg1mnw, o4vvlmr4, 8t2tic9n, dcfqv...","[5.74609375, 5.33203125, 5.13671875, 5.0078125..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, tu1vevx9, yrowv62k, f0kcf2y0, jswlq...","[r58aohnu, tu1vevx9, yrowv62k, f0kcf2y0, jswlq...","[6.75390625, 1.083984375, 0.7802734375, 0.1711..."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, o47v5vgw, gruir7aw, 3xw4qjoy, o877u...","[sts48u9i, o47v5vgw, gruir7aw, 3xw4qjoy, o877u...","[-0.48095703125, -1.2470703125, -1.6162109375,..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",0.5,"[bn22k0p3, 3sr2exq9, k0f4cwig, sv48gjkk, jbmzv...","[bn22k0p3, 3sr2exq9, k0f4cwig, sv48gjkk, jbmzv...","[-1.2412109375, -1.251953125, -1.4345703125, -..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",0.5,"[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, sxx3y...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, sxx3y...","[4.51171875, 4.36328125, 4.16015625, 2.578125,..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [01:41<00:00, 13.86it/s]

k = 1
Number of queries in top 1: 931
Number of queries not in top 1: 469
k = 5
Number of queries in top 5: 1066
Number of queries not in top 5: 334
k = 10
Number of queries in top 10: 1094
Number of queries not in top 10: 306
Evaluation results: {1: 0.665, 5: 0.7031785714285714, 10: 0.7058784013605442}





In [31]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=20)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[59up4v56, 8t2tic9n, o4vvlmr4, mb18fj8a, 5hxsa...","[59up4v56, 8t2tic9n, o4vvlmr4, mb18fj8a, 5hxsa...","[1.8076171875, 1.6572265625, 1.6083984375, 1.0..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, 8n4zf9oo, yrowv62k, j1ucrhd7, f0kcf...","[r58aohnu, 8n4zf9oo, yrowv62k, j1ucrhd7, f0kcf...","[3.283203125, -3.474609375, -3.501953125, -3.7..."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, o47v5vgw, gruir7aw, o877uul1, vtcq6...","[sts48u9i, o47v5vgw, gruir7aw, o877uul1, vtcq6...","[3.40234375, 1.32421875, 1.22265625, 0.5068359..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",1.0,"[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, z795y...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, z795y...","[3.103515625, 1.1416015625, 0.91064453125, 0.3..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",1.0,"[ybwwmyqy, vabb2f26, 3l6ipiwk, lzddnb8j, ouvq2...","[ybwwmyqy, vabb2f26, 3l6ipiwk, lzddnb8j, ouvq2...","[3.017578125, 1.578125, 1.1640625, 0.990234375..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [01:41<00:00, 13.78it/s]

k = 1
Number of queries in top 1: 785
Number of queries not in top 1: 615
k = 5
Number of queries in top 5: 987
Number of queries not in top 5: 413
k = 10
Number of queries in top 10: 1045
Number of queries not in top 10: 355
Evaluation results: {1: 0.5607142857142857, 5: 0.6167857142857143, 10: 0.6224671201814059}





In [32]:
num_hard_negatives = 5
num_hard_negatives_eval = 5
train_batch_size = 8
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L6-v2"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.65it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 26.53it/s]


Metric       Positive       Negative     Difference
Count          12,853         45,823               
Mean           0.5263         0.4702         0.1262
Median         0.5440         0.4759         0.0969
Std            0.1639         0.1055         0.0809
Min           -0.0783         0.1255        -0.2308
25%            0.4171         0.3980         0.0648
50%            0.5440         0.4759         0.0969
75%            0.6514         0.5476         0.1625
Max            0.9287         0.7643         0.6667
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 18442 samples (28.70%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.06it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.5248         0.5402        -0.0154
Median         0.5435         0.5493        -0.0000
Std            0.1603         0.1151         0.1361
Min            0.0295         0.1618        -0.6029
25%            0.4099         0.4616        -0.0925
50%            0.5438         0.5493        -0.0000
75%            0.6468         0.6234         0.0614
Max            0.8905         0.8905         0.5734


Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
2500,1.4923,0.175127,0.808929,0.808929,0.8553,0.465655,0.465655,0.491332
5000,1.4856,0.313195,0.822571,0.822571,0.8655,0.465655,0.465655,0.491332
7500,1.4883,0.253402,0.827405,0.827405,0.86921,0.465655,0.465655,0.491332
10000,1.4719,0.19662,0.834333,0.834333,0.874425,0.465655,0.465655,0.491332
12500,1.4405,0.217848,0.838333,0.838333,0.877416,0.465655,0.465655,0.491332


                                                                       

In [33]:
num_hard_negatives = 5
num_hard_negatives_eval = 5
train_batch_size = 2
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-electra-base"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.73it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 28.02it/s]


Metric       Positive       Negative     Difference
Count          12,853         45,823               
Mean           0.5263         0.4702         0.1262
Median         0.5440         0.4759         0.0969
Std            0.1639         0.1055         0.0809
Min           -0.0783         0.1255        -0.2308
25%            0.4171         0.3980         0.0648
50%            0.5440         0.4759         0.0969
75%            0.6514         0.5476         0.1625
Max            0.9287         0.7643         0.6667
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 18442 samples (28.70%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 61.82it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.5248         0.5402        -0.0154
Median         0.5435         0.5493        -0.0000
Std            0.1603         0.1151         0.1361
Min            0.0295         0.1618        -0.6029
25%            0.4099         0.4616        -0.0925
50%            0.5438         0.5493        -0.0000
75%            0.6468         0.6234         0.0614
Max            0.8905         0.8905         0.5734


Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
10000,0.6495,0.072873,0.775345,0.775345,0.829819,0.465655,0.465655,0.491332
20000,0.6827,0.687636,0.64494,0.64494,0.73156,0.465655,0.465655,0.491332
30000,0.6618,0.398979,0.413202,0.413321,0.554605,0.465655,0.465655,0.491332
40000,0.6219,0.06923,0.749655,0.749655,0.81018,0.465655,0.465655,0.491332
50000,0.6015,0.054134,0.762869,0.762869,0.82024,0.465655,0.465655,0.491332


                                                                       

In [34]:
num_hard_negatives = 5
num_hard_negatives_eval = 5
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-reranker-modernbert-base"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.69it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 28.15it/s]


Metric       Positive       Negative     Difference
Count          12,853         45,823               
Mean           0.5263         0.4702         0.1262
Median         0.5440         0.4759         0.0969
Std            0.1639         0.1055         0.0809
Min           -0.0783         0.1255        -0.2308
25%            0.4171         0.3980         0.0648
50%            0.5440         0.4759         0.0969
75%            0.6514         0.5476         0.1625
Max            0.9287         0.7643         0.6667
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 18442 samples (28.70%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.83it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.5248         0.5402        -0.0154
Median         0.5435         0.5493        -0.0000
Std            0.1603         0.1151         0.1361
Min            0.0295         0.1618        -0.6029
25%            0.4099         0.4616        -0.0925
50%            0.5438         0.5493        -0.0000
75%            0.6468         0.6234         0.0614
Max            0.8905         0.8905         0.5734


Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
20000,0.0,0.0,0.867274,0.867274,0.899855,0.465655,0.465655,0.491332
40000,0.0,0.0,0.867274,0.867274,0.899855,0.465655,0.465655,0.491332
60000,0.0,0.0,0.867274,0.867274,0.899855,0.465655,0.465655,0.491332
80000,0.0,0.0,0.867274,0.867274,0.899855,0.465655,0.465655,0.491332
100000,0.0,0.0,0.867274,0.867274,0.899855,0.465655,0.465655,0.491332


                                                                       

In [35]:
num_hard_negatives = 3
num_hard_negatives_eval = 3
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-multilingual-reranker-base"

exp_runner = ExperimentRunner(
    embedding_model_hard_negatives,
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    num_hard_negatives,
    num_hard_negatives_eval
)
exp_runner.run_experiment(train_dataset, dev_dataset, df_full, trust_remote_code=True)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.62it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 26.32it/s]


Metric       Positive       Negative     Difference
Count          12,853         27,530               
Mean           0.5263         0.4731         0.1231
Median         0.5440         0.4789         0.0933
Std            0.1639         0.1057         0.0797
Min           -0.0783         0.1277        -0.2308
25%            0.4171         0.4007         0.0631
50%            0.5440         0.4789         0.0933
75%            0.6514         0.5507         0.1581
Max            0.9287         0.7643         0.6512
Skipped 493,639 potential negatives (37.32%) due to the absolute_margin of 0.05.
Skipped 5 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 11029 samples (28.60%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 5 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positi

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.89it/s]


Metric       Positive       Negative     Difference
Count           1,400          4,200               
Mean           0.5248         0.5580        -0.0332
Median         0.5435         0.5668        -0.0094
Std            0.1603         0.1147         0.1281
Min            0.0295         0.1662        -0.6029
25%            0.4099         0.4813        -0.1031
50%            0.5438         0.5669        -0.0094
75%            0.6468         0.6411         0.0238
Max            0.8905         0.8905         0.5467


Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
20000,0.0,0.0,0.876369,0.876369,0.907534,0.456548,0.456548,0.474888
40000,0.0,0.0,0.876369,0.876369,0.907534,0.456548,0.456548,0.474888
60000,0.0,0.0,0.876369,0.876369,0.907534,0.456548,0.456548,0.474888
80000,0.0,0.0,0.876369,0.876369,0.907534,0.456548,0.456548,0.474888


                                                                       

Note that the metrics in the following cells are overly optimistic, since we evaluate on the whole data, which includes the training data. We do this on the whole data so that we have enough data for the ensemble learning part.

In [36]:
df_query_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/gte-reranker-modernbert-base-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_finetuned.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:12:52<00:00,  3.26it/s]


k = 1
Number of queries in top 1: 9609
Number of queries not in top 1: 4644
k = 5
Number of queries in top 5: 11089
Number of queries not in top 5: 3164
k = 10
Number of queries in top 10: 11349
Number of queries not in top 10: 2904
Evaluation results: {1: 0.6741738581351294, 5: 0.7166327556771674, 10: 0.7191217053719685}


In [37]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/gte-multilingual-reranker-base-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_multilingual_finetuned.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:02:52<00:00,  3.78it/s]


k = 1
Number of queries in top 1: 9021
Number of queries not in top 1: 5232
k = 5
Number of queries in top 5: 10842
Number of queries not in top 5: 3411
k = 10
Number of queries in top 10: 11232
Number of queries not in top 10: 3021
Evaluation results: {1: 0.6329193853925489, 5: 0.6854755723941158, 10: 0.6892080085618287}


In [38]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/ms-marco-MiniLM-L12-v2-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM12_finetuned.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [25:38<00:00,  9.26it/s]


k = 1
Number of queries in top 1: 9429
Number of queries not in top 1: 4824
k = 5
Number of queries in top 5: 11000
Number of queries not in top 5: 3253
k = 10
Number of queries in top 10: 11312
Number of queries not in top 10: 2941
Evaluation results: {1: 0.6615449379078089, 5: 0.7066407072195328, 10: 0.7096312778484952}


In [39]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/ms-marco-MiniLM-L6-v2-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM6_finetuned.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [18:15<00:00, 13.01it/s]


k = 1
Number of queries in top 1: 8935
Number of queries not in top 1: 5318
k = 5
Number of queries in top 5: 10804
Number of queries not in top 5: 3449
k = 10
Number of queries in top 10: 11201
Number of queries not in top 10: 3052
Evaluation results: {1: 0.6268855679506069, 5: 0.6795516733319301, 10: 0.6833366186344506}


In [40]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/ms-marco-TinyBERT-L2-v2-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_tinyBERT_finetuned.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [11:47<00:00, 20.15it/s]


k = 1
Number of queries in top 1: 8159
Number of queries not in top 1: 6094
k = 5
Number of queries in top 5: 10348
Number of queries not in top 5: 3905
k = 10
Number of queries in top 10: 10970
Number of queries not in top 10: 3283
Evaluation results: {1: 0.5724408896372694, 5: 0.6330105942608573, 10: 0.6389655923175181}


In [41]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=30)
model_name = "models/ms-marco-electra-base-finetuned"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet("data/reranked_results_electra_finetuned.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [59:01<00:00,  4.02it/s]


k = 1
Number of queries in top 1: 7957
Number of queries not in top 1: 6296
k = 5
Number of queries in top 5: 10281
Number of queries not in top 5: 3972
k = 10
Number of queries in top 10: 10933
Number of queries not in top 10: 3320
Evaluation results: {1: 0.5582684347154985, 5: 0.6218129516592997, 10: 0.6280638495488}
