In [14]:
import pandas as pd
import bm25s
from pathlib import Path
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from mxbai_rerank import MxbaiRerankV2

In [15]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070'

In [16]:
data_path = Path('data/')

collection_data_path = data_path / 'subtask4b_collection_data.pkl' 
query_dev_data_path = data_path / 'subtask4b_query_tweets_dev.tsv'
query_train_data_path = data_path / 'subtask4b_query_tweets_train.tsv'

In [17]:
df_collection = pd.read_pickle(collection_data_path)
df_collection.head(5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000
993,ycxyn2a2,PMC,What was the primary mode of smallpox transmis...,10.3389/fcimb.2012.00150,PMC3509329,23226686,cc-by,The mode of infection transmission has profoun...,2012-11-29,"Milton, Donald K.",Front Cell Infect Microbiol,,,,ycxyn2a2,2012-11-29,1354147200
1053,zxe95qy9,PMC,"Lessons from the History of Quarantine, from P...",10.3201/eid1902.120312,PMC3559034,23343512,no-cc,"In the new millennium, the centuries-old strat...",2013-02-03,"Tognotti, Eugenia",Emerg Infect Dis,,,,zxe95qy9,2013-02-03,1359849600


In [18]:
df_query_dev = pd.read_csv(query_dev_data_path, sep='\t')
df_query_train = pd.read_csv(query_train_data_path, sep='\t')
display(df_query_dev)
display(df_query_train)

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
1395,14193,Residents at high risk of covid-19: effectiven...,0gn3b98n
1396,14196,"61% of teenagers hospitalized for covid were ""...",25bdifv6
1397,14203,"""fresh evidence backing melatonin against covi...",qn6wawxk
1398,14233,"the vaccine doesn't halt the spread, it is pro...",3u3i5myh


Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69
...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [19]:
df_query_combined = pd.concat([df_query_dev, df_query_train], ignore_index=True)
df_query_combined

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
14248,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
14249,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
14250,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
14251,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [20]:
df_collection['last_names'] = df_collection['authors'].str.split(';').apply(
    lambda authors: [author.split(',')[0].strip() for author in authors] if isinstance(authors, list) else authors
)
df_collection['last_names'] = df_collection['last_names'].apply(
    lambda x: '; '.join(x) if isinstance(x, list) else x
)
df_collection['publish_year'] = df_collection['publish_time'].str.split('-').str[0]

In [21]:
def initialize_bm25(corpus: list[str], cord_uids:list[str], k1=1.5, b=0.75, method='lucene', stemmer=None):
    tokenized_corpus = bm25s.tokenize(corpus, stemmer=stemmer)
    bm25 = bm25s.BM25(corpus=cord_uids, k1=k1, b=b, method=method)
    bm25.index(tokenized_corpus)
    return bm25

In [22]:
def experiment_single_bm25(df_collection, df_query, k1=1.5, b=0.75, stemmer=None, k=10):
    corpus = df_collection.apply(
        lambda x: f"{x['title']} {x['abstract']} {x['last_names']} {x['journal']} {x['publish_year']}", axis=1
    ).tolist()
    bm25 = initialize_bm25(corpus, df_collection['cord_uid'].tolist(), k1, b, stemmer=stemmer)
    tokenized_queries = bm25s.tokenize(df_query['tweet_text'], stemmer=stemmer)
    doc_scores = bm25.retrieve(tokenized_queries, n_threads=-1, k=k)
    df_query['bm25_topk'] = doc_scores.documents.tolist()
    
    return df_query

In [23]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
        print(f"{k = }")
        in_topx = data["in_topx"] > 0
        print(f"Number of queries in top {k}: {in_topx.sum()}")
        print(f"Number of queries not in top {k}: {len(data) - in_topx.sum()}")
    return d_performance

In [24]:
def evaluate_experiment(df_query_train, df_query_dev, experiment_name, list_k=[1, 5, 10]):
    results = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, train: {results}")
    results = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, dev: {results}")
    return results

In [25]:

stemmer = None
df_query_train_single = experiment_single_bm25(df_collection, df_query_train, stemmer=stemmer, k=1000)
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=1000)
evaluate_experiment(df_query_train_single, df_query_dev_single, "Single BM25 with all features", list_k=[1, 5, 10, 100, 200, 500, 700, 1000, 2000, 5000])

                                                                            

k = 1
Number of queries in top 1: 7540
Number of queries not in top 1: 5313
k = 5
Number of queries in top 5: 9096
Number of queries not in top 5: 3757
k = 10
Number of queries in top 10: 9630
Number of queries not in top 10: 3223
k = 100
Number of queries in top 100: 11157
Number of queries not in top 100: 1696
k = 200
Number of queries in top 200: 11513
Number of queries not in top 200: 1340
k = 500
Number of queries in top 500: 11957
Number of queries not in top 500: 896
k = 700
Number of queries in top 700: 12090
Number of queries not in top 700: 763
k = 1000
Number of queries in top 1000: 12211
Number of queries not in top 1000: 642
k = 2000
Number of queries in top 2000: 12211
Number of queries not in top 2000: 642
k = 5000
Number of queries in top 5000: 12211
Number of queries not in top 5000: 642
Results for Single BM25 with all features, train: {1: np.float64(0.5866334707850307), 5: np.float64(0.634745455016987), 10: np.float64(0.6402931685394925), 100: np.float64(0.6449074311

{1: np.float64(0.5921428571428572),
 5: np.float64(0.6397619047619048),
 10: np.float64(0.6450986394557824),
 100: np.float64(0.6495240324689373),
 200: np.float64(0.6497567832248164),
 500: np.float64(0.6498352020281756),
 700: np.float64(0.6498638803520272),
 1000: np.float64(0.6498750349577517),
 2000: np.float64(0.6498750349577517),
 5000: np.float64(0.6498750349577517)}

We will use BM25 as a baseline ranking model. The experiment above shouws us that there is no improvement after the first 1k (doesnt matter whether we take top 1k or top 2k etc.) -> so for each query we will prefilter the documents with BM25 to the top 1k and then the reranker will take over.

In [26]:
def initialize_reranker(model_name, torch_dtype=torch.float16):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True, torch_dtype=torch.float16
    )
    if torch.cuda.is_available():
        print("Using GPU")
        model = model.to('cuda')
    model.eval()
    return tokenizer, model

In [27]:
def rerank_with_alibaba(df_query, df_collection, tokenizer, model, k=10, max_length=512, batch_size=32):
    reranked_results = []
    reranked_scores = []
    reranked_docs = []

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk'] 

        pairs = [
            [
                query_text, 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]
        
        batch_scores = []
        batch_docs = []

        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            with torch.no_grad():
                inputs = tokenizer(batch_pairs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
                if torch.cuda.is_available():
                    inputs = {key: value.to('cuda') for key, value in inputs.items()}
                scores = model(**inputs, return_dict=True).logits.view(-1).float()

            batch_scores.extend(scores.tolist())
            batch_docs.extend(topk_docs[i:i+batch_size])


        reranked = sorted(zip(batch_docs, batch_scores), key=lambda x: x[1], reverse=True)
        reranked_results.append([doc_id for doc_id, _ in reranked[:k]])
        reranked_scores.append([score for _, score in reranked])
        reranked_docs.append([doc_id for doc_id, _ in reranked])

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_docs
    df_query['reranked_scores'] = reranked_scores
    return df_query

In [28]:
def rerank_mxbai(df_query, df_collection, model_name, k=10, batch_size=1):
    reranked_results = []    
    reranked_scores = []

    model = MxbaiRerankV2(model_name, torch_dtype=torch.float16)

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk']  # Get top-k BM25 results for the query

        documents = [
            [
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]

        results = model.rank(query_text, documents, return_documents=True, top_k=k, batch_size=batch_size)
        reranked = [topk_docs[result.index] for result in results]
        reranked_score = [result.score for result in results]

        reranked_results.append(reranked)
        reranked_scores.append(reranked_score)

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_results
    df_query['reranked_scores'] = reranked_scores
    
    return df_query

In [29]:
def evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10]):
    return get_performance_mrr(df_query, col_gold, col_pred, list_k)

## Experiments

In [None]:
df_query_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-reranker-modernbert-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Exception ignored in:                                                       <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fd9e42dfa10>>
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
                                                                       

Using GPU


Reranking:   0%|          | 12/14253 [00:06<2:06:33,  1.88it/s]

In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=50)
model_name = "Alibaba-NLP/gte-multilingual-reranker-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_multilingual.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:49:21<00:00,  2.17it/s]


k = 1
Number of queries in top 1: 9026
Number of queries not in top 1: 5227
k = 5
Number of queries in top 5: 10949
Number of queries not in top 5: 3304
k = 10
Number of queries in top 10: 11361
Number of queries not in top 10: 2892
Evaluation results: {1: np.float64(0.6332701887321968), 5: np.float64(0.6880668397296476), 10: np.float64(0.6920471490825101)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=200)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-TinyBERT-L2-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_tinyBERT.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[hg3xpej0, 59up4v56, 82y56t7d, 86xwnpde, 8t2ti...","[1.2587890625, 0.71435546875, 0.4404296875, 0....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[r58aohnu, kiq6xb6k, s2vckt2w, icgsbelo, eay6q...","[3.333984375, 0.349853515625, 0.17138671875, 0...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[sts48u9i, o47v5vgw, a7frertc, o877uul1, u5nxm...","[0.309814453125, -0.06854248046875, -0.1201171...",1.0
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, pq3n1...","[0.83642578125, 0.22509765625, 0.1030883789062...",1.0
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[0.22314453125, 0.08758544921875, -0.127197265...",1.0


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:21<00:00,  2.63it/s]


k = 1
Number of queries in top 1: 7438
Number of queries not in top 1: 6815
k = 5
Number of queries in top 5: 9521
Number of queries not in top 5: 4732
k = 10
Number of queries in top 10: 10215
Number of queries not in top 10: 4038
Evaluation results: {1: np.float64(0.5218550480600576), 5: np.float64(0.5785320049580205), 10: np.float64(0.5850188876972712)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=100)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,reranked_topk,reranked_docs,reranked_scores,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[nksd3wuw, es8l29ub, atji1xge, 82y56t7d, sqxdw...","[7.078125, 6.046875, 5.74609375, 5.4140625, 5....",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[r58aohnu, 6zfpcm4j, wk61uyrt, xsqgrd5l, icgsb...","[6.75390625, 1.7626953125, 1.494140625, 1.3847...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[u5nxm9tu, qkg8fwbp, sts48u9i, 4aps0kvp, ujq9m...","[0.5537109375, -0.328857421875, -0.48095703125...",0.333333
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, wbw7g...","[-0.0164794921875, -1.2412109375, -1.251953125...",0.333333
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[4.51171875, 4.36328125, 4.16015625, 2.578125,...",0.5


Using GPU


Reranking: 100%|██████████| 14253/14253 [1:30:00<00:00,  2.64it/s]


k = 1
Number of queries in top 1: 7613
Number of queries not in top 1: 6640
k = 5
Number of queries in top 5: 9648
Number of queries not in top 5: 4605
k = 10
Number of queries in top 10: 10355
Number of queries not in top 10: 3898
Evaluation results: {1: np.float64(0.5341331649477303), 5: np.float64(0.5887462288640988), 10: np.float64(0.5954533659858855)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=40)
model_name = "cross-encoder/ms-marco-electra-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet("data/reranked_results_electra.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 14253/14253 [1:20:58<00:00,  2.93it/s]


k = 1
Number of queries in top 1: 7032
Number of queries not in top 1: 7221
k = 5
Number of queries in top 5: 9334
Number of queries not in top 5: 4919
k = 10
Number of queries in top 10: 10228
Number of queries not in top 10: 4025
Evaluation results: {1: np.float64(0.49336981688065673), 5: np.float64(0.5558163193713604), 10: np.float64(0.5641402422659)}


In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_combined, stemmer=stemmer, k=25)
df_query = rerank_mxbai(df_query_dev_single, df_collection, "mixedbread-ai/mxbai-rerank-base-v2", batch_size=1)
df_query.to_parquet("data/reranked_results_mxbai.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Reranking:   0%|          | 0/14253 [00:00<?, ?it/s]                        You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Reranking: 100%|██████████| 14253/14253 [4:14:34<00:00,  1.07s/it]  


k = 1
Number of queries in top 1: 9784
Number of queries not in top 1: 4469
k = 5
Number of queries in top 5: 11050
Number of queries not in top 5: 3203
k = 10
Number of queries in top 10: 11274
Number of queries not in top 10: 2979
Evaluation results: {1: np.float64(0.6864519750228022), 5: np.float64(0.7231974555064431), 10: np.float64(0.7253506362904384)}


## Finetune

In [30]:
from datasets import Dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss, LambdaLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer


In [31]:
df_collection['answer'] = df_collection.apply(
    lambda row: f"{row['title']} {row['abstract']} {row['last_names']} {row['journal']}", axis=1
)
df_collection.head(3)   

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet,last_names,publish_year,answer
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600,van der Sande; Teunis; Sabel,2008,Professional and Home-Made Face Masks Reduce E...
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800,Li; Blakeley; Smith?,2011,The Failure of R (0) The basic reproductive ra...
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000,Singh; Sharma; Patel,2012,Pulmonary sequelae in a patient recovered from...


In [None]:
df_train = df_query_train.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_dev = df_query_dev.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_full = pd.concat([df_train, df_dev], ignore_index=True)
df_full.head(3)

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,answer
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,"[htlvpvz5, h7hj64q5, 4aps0kvp, 5tkyir3r, 32z7b...",1.0,Oral Management in Rehabilitation Medicine: Or...
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,"[maj8r6ti, bjvg2ivr, 7tto4hr7, 2cwvga0k, 46je8...",0.003145,Variation in racial/ethnic disparities in COVI...
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"[jtwb17u8, veeavho5, jbpmbm9m, 8hkxbxz9, 32v44...",1.0,Effect of non-pharmaceutical interventions for...


In [None]:
train_dataset = Dataset.from_dict({
    "query": df_train['tweet_text'].tolist(),
    "document": df_train['answer'].tolist(),
})

dev_dataset = Dataset.from_dict({
    "query": df_dev['tweet_text'].tolist(),
    "document": df_dev['answer'].tolist(),
})
full_dataset = Dataset.from_dict({
    "query": df_full['tweet_text'].tolist(),
    "document": df_full['answer'].tolist(),
})

In [32]:
def get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives, num_hard_negatives_eval, train_dataset, dev_dataset, df_full, batch_size=4096):

    embedding_model = SentenceTransformer(embedding_model_hard_negatives, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    # hard_train_dataset = mine_hard_negatives(
    #     train_dataset,
    #     embedding_model,
    #     num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
    #     range_min=3,  # Skip the x most similar samples
    #     range_max=100,  # Consider only the x most similar samples
    #     max_score=0.95,  # Only consider samples with a similarity score of at most x
    #     margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
    #     sampling_strategy="top",  # Randomly sample negatives from the range
    #     batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
    #     output_format="labeled-pair",  # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
    # )

    # hard_eval_dataset = mine_hard_negatives(
    #     dev_dataset,
    #     embedding_model,
    #     corpus=df_full["answer"],  # Use the full dataset as the corpus
    #     num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
    #     batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
    #     output_format="n-tuple",  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
    #     include_positives=True,  # Key: Include the positive answer in the list of negatives
    #     range_min=3,  # Skip the x most similar samples
    #     range_max=100,  # Consider only the x most similar samples
    #     max_score=0.95,  # Only consider samples with a similarity score of at most x
    #     margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
    #     sampling_strategy="top",  # Randomly sample negatives from the range
    # )

    hard_train_dataset = mine_hard_negatives(
        train_dataset,
        embedding_model,
        num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
        range_min=3,  # Skip the x most similar samples
        range_max=100,  # Consider only the x most similar samples
        max_score=0.95,  # Only consider samples with a similarity score of at most x
        margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
        sampling_strategy="top",  # Randomly sample negatives from the range
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        output_format="labeled-pair",  # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
    )

    print("======================")
    hard_eval_dataset = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        output_format="labeled-pair",  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
    )

    print("======================")
    hard_eval_dataset_evaluator = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,  # Use a batch size of 4096 for the embedding model
        include_positives=True,  # Key: Include the positive answer in the list of negatives
    )

    return hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator


class ExperimentRunner:
    def __init__(self, model_to_finetune, train_batch_size, num_epochs, max_length, experiment_name):
        self.experiment_name = experiment_name
        self.model_to_finetune = model_to_finetune
        self.train_batch_size = train_batch_size
        self.num_epochs = num_epochs
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def run_experiment(self, hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, trust_remote_code=False, eval_steps=None):

        if eval_steps is None:
            eval_steps = int(20000/self.train_batch_size)
        
        print(f"Eval steps: {eval_steps}")
        model = CrossEncoder(self.model_to_finetune, max_length=self.max_length, device=self.device, trust_remote_code=trust_remote_code)

        args = CrossEncoderTrainingArguments(
            # Required parameter:
            output_dir=f"model/{self.model_to_finetune.split("/")[-1]}{self.experiment_name}",
            # Optional training parameters:
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.train_batch_size,
            per_device_eval_batch_size=self.train_batch_size,
            learning_rate=2e-5,
            warmup_ratio=0.1,
            fp16=True,  
            bf16=False, 
            batch_sampler=BatchSamplers.NO_DUPLICATES, 
            eval_strategy="steps",
            eval_steps=eval_steps,
            save_strategy="steps",
            save_steps=eval_steps,
            save_total_limit=10,
            logging_steps=eval_steps,
            run_name=self.model_to_finetune.split("/")[-1],
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
        )
        
        reranking_evaluator = CrossEncoderRerankingEvaluator(
            samples=[
                {
                    "query": sample["query"],
                    "positive": [sample["document"]],
                    "documents": [sample[column_name] for column_name in hard_eval_dataset_evaluator.column_names if 'negative' in column_name],
                }
                for sample in hard_eval_dataset_evaluator
            ],
            batch_size=self.train_batch_size,
            name="dev_set",
            show_progress_bar=True,
        )

        loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))
        # loss = MultipleNegativesRankingLoss(model=model, num_negatives=num_hard_negatives)
        # loss = LambdaLoss(model=model, k=5)

        trainer = CrossEncoderTrainer(
            model=model,
            args=args,
            train_dataset=hard_train_dataset,
            eval_dataset=hard_eval_dataset,
            loss=loss,
            evaluator=reranking_evaluator,
    
        )

        # Train the model
        trainer.train()

        model.save(f"models/{self.model_to_finetune.split('/')[-1]}-finetuned{self.experiment_name}")

In [33]:
num_epochs = 2
# embedding_model_hard_negatives = "sentence-transformers/static-retrieval-mrl-en-v1"
# embedding_model_hard_negatives = "sentence-transformers/all-MiniLM-L6-v2"
# embedding_model_hard_negatives = "sentence-transformers/all-MiniLM-L12-v2"
embedding_model_hard_negatives = "BAAI/bge-small-en-v1.5"


experiment_name = f"_{embedding_model_hard_negatives.split('/')[-1]}_epochs{num_epochs}_crossentropyloss_stricter embedding"

In [23]:
num_hard_negatives = 5
hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives=num_hard_negatives, num_hard_negatives_eval=5, train_dataset=train_dataset, dev_dataset=dev_dataset, df_full=df_full, batch_size=64)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:25<00:00,  4.32it/s]
Batches: 100%|██████████| 201/201 [00:05<00:00, 33.62it/s]


Metric       Positive       Negative     Difference
Count          12,853         42,838               
Mean           0.8195         0.7795         0.0796
Median         0.8329         0.7911         0.0650
Std            0.0826         0.0525         0.0355
Min            0.3809         0.5041        -0.0251
25%            0.7696         0.7488         0.0544
50%            0.8329         0.7911         0.0650
75%            0.8834         0.8169         0.0924
Max            0.9881         0.8966         0.2992
Skipped 561,618 potential negatives (42.46%) due to the absolute_margin of 0.05.
Could not find enough negatives for 21427 samples (33.34%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:28<00:00,  4.32it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 34.62it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8033         0.0186
Median         0.8323         0.8101         0.0168
Std            0.0783         0.0505         0.0640
Min            0.4977         0.5529        -0.1856
25%            0.7717         0.7739        -0.0220
50%            0.8323         0.8101         0.0168
75%            0.8846         0.8388         0.0593
Max            0.9681         0.9451         0.2385
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:28<00:00,  4.31it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 34.69it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8117         0.0102
Median         0.8323         0.8156         0.0000
Std            0.0783         0.0543         0.0593
Min            0.4977         0.5593        -0.1856
25%            0.7717         0.7790        -0.0220
50%            0.8323         0.8156         0.0000
75%            0.8846         0.8466         0.0443
Max            0.9681         0.9681         0.2377


In [24]:
train_batch_size = 128
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-TinyBERT-L2-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives)

Eval steps: 156


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
156,1.2389,0.797737,0.762452,0.762452,0.820094,0.568988,0.568988,0.595057
312,0.5475,0.650652,0.768679,0.768679,0.824927,0.568988,0.568988,0.595057
468,0.4547,0.594442,0.768143,0.768143,0.824545,0.568988,0.568988,0.595057
624,0.4424,0.636502,0.768929,0.768929,0.82524,0.568988,0.568988,0.595057
780,0.4406,0.589987,0.770226,0.770226,0.826212,0.568988,0.568988,0.595057


                                                                        

In [25]:
train_batch_size = 4
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L12-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives)

Eval steps: 5000


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
5000,0.8341,0.595259,0.822738,0.822738,0.865859,0.568988,0.568988,0.595057
10000,0.6113,0.613535,0.839667,0.839667,0.878715,0.568988,0.568988,0.595057
15000,0.5032,0.673446,0.826536,0.826536,0.869118,0.568988,0.568988,0.595057
20000,0.3133,0.654007,0.838845,0.838845,0.878276,0.568988,0.568988,0.595057
25000,0.3222,0.663786,0.80681,0.80681,0.854552,0.568988,0.568988,0.595057


                                                                       

In [26]:
train_batch_size = 8
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L6-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives)

Eval steps: 2500


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
2500,0.7176,0.572058,0.816583,0.816583,0.861199,0.568988,0.568988,0.595057
5000,0.5353,0.584391,0.828524,0.828524,0.870347,0.568988,0.568988,0.595057
7500,0.4584,0.649886,0.825321,0.825321,0.868094,0.568988,0.568988,0.595057
10000,0.3458,0.628598,0.832,0.832,0.872891,0.568988,0.568988,0.595057
12500,0.3463,0.602163,0.835476,0.835476,0.875564,0.568988,0.568988,0.595057


                                                                       

In [27]:

train_batch_size = 2
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-electra-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives)

Eval steps: 10000


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
10000,1.1264,0.896045,0.82844,0.828798,0.870594,0.568988,0.568988,0.595057
20000,0.8893,1.002589,0.806786,0.806786,0.854178,0.568988,0.568988,0.595057
30000,0.6362,0.830475,0.770869,0.77444,0.830385,0.568988,0.568988,0.595057
40000,0.4254,0.968997,0.824286,0.825833,0.868394,0.568988,0.568988,0.595057
50000,0.4046,0.935117,0.758476,0.76556,0.823031,0.568988,0.568988,0.595057


                                                                       

In [28]:
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-reranker-modernbert-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives)

Eval steps: 20000


Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
20000,0.8532,0.792676,0.831083,0.831083,0.872434,0.568988,0.568988,0.595057
40000,0.762,0.719055,0.853679,0.853679,0.889632,0.568988,0.568988,0.595057
60000,0.5456,0.808621,0.852714,0.854143,0.889363,0.568988,0.568988,0.595057
80000,0.2311,0.742888,0.851798,0.851798,0.888238,0.568988,0.568988,0.595057
100000,0.1905,0.733588,0.859536,0.859893,0.894252,0.568988,0.568988,0.595057


                                                                       

In [29]:
num_hard_negatives = 3
num_hard_negatives_eval = 3
hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives=num_hard_negatives, num_hard_negatives_eval=num_hard_negatives_eval, train_dataset=train_dataset, dev_dataset=dev_dataset, df_full=df_full, batch_size=64)



The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:24<00:00,  4.39it/s]
Batches: 100%|██████████| 201/201 [00:05<00:00, 33.99it/s]


Metric       Positive       Negative     Difference
Count          12,853         25,740               
Mean           0.8195         0.7811         0.0779
Median         0.8329         0.7925         0.0631
Std            0.0826         0.0525         0.0346
Min            0.3809         0.5051        -0.0251
25%            0.7696         0.7506         0.0537
50%            0.8329         0.7925         0.0631
75%            0.8834         0.8186         0.0897
Max            0.9881         0.8966         0.2938
Skipped 561,618 potential negatives (42.46%) due to the absolute_margin of 0.05.
Could not find enough negatives for 12819 samples (33.25%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 5 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:27<00:00,  4.38it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 34.80it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          4,200               
Mean           0.8219         0.8090         0.0129
Median         0.8323         0.8151         0.0112
Std            0.0783         0.0501         0.0631
Min            0.4977         0.5685        -0.1856
25%            0.7717         0.7798        -0.0264
50%            0.8323         0.8151         0.0113
75%            0.8846         0.8437         0.0524
Max            0.9681         0.9451         0.2224
Setting range_max to 5 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:27<00:00,  4.33it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 35.06it/s]


Metric       Positive       Negative     Difference
Count           1,400          4,200               
Mean           0.8219         0.8213         0.0006
Median         0.8323         0.8252        -0.0000
Std            0.0783         0.0552         0.0550
Min            0.4977         0.5707        -0.1856
25%            0.7717         0.7878        -0.0264
50%            0.8323         0.8252        -0.0000
75%            0.8846         0.8577         0.0252
Max            0.9681         0.9681         0.2158


In [None]:
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-multilingual-reranker-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, trust_remote_code=True)

Eval steps: 20000


Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
20000,0.7319,0.803203,0.817083,0.819583,0.872392,0.559881,0.559881,0.578612
40000,0.6213,0.703173,0.792976,0.794405,0.852109,0.559881,0.559881,0.578612
60000,0.412,0.794674,0.838869,0.838869,0.879841,0.559881,0.559881,0.578612


                                                                       

Note that the metrics in the following cells are overly optimistic, since we evaluate on the whole data, which includes the training data. We do this on the whole data so that we have enough data for the ensemble learning part.

In [34]:
df_query_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-reranker-modernbert-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:55<00:00,  3.94it/s]

k = 1
Number of queries in top 1: 940
Number of queries not in top 1: 460
k = 5
Number of queries in top 5: 1088
Number of queries not in top 5: 312
k = 10
Number of queries in top 10: 1117
Number of queries not in top 10: 283
Evaluation results: {1: np.float64(0.6714285714285714), 5: np.float64(0.715642857142857), 10: np.float64(0.7184308390022676)}





In [None]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-multilingual-reranker-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_multilingual_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:33<00:00,  4.20it/s]

k = 1
Number of queries in top 1: 905
Number of queries not in top 1: 495
k = 5
Number of queries in top 5: 1065
Number of queries not in top 5: 335
k = 10
Number of queries in top 10: 1107
Number of queries not in top 10: 293
Evaluation results: {1: np.float64(0.6464285714285715), 5: np.float64(0.6916547619047618), 10: np.float64(0.695531179138322)}





In [35]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L12-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM12_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [02:57<00:00,  7.87it/s]

k = 1
Number of queries in top 1: 916
Number of queries not in top 1: 484
k = 5
Number of queries in top 5: 1073
Number of queries not in top 5: 327
k = 10
Number of queries in top 10: 1105
Number of queries not in top 10: 295
Evaluation results: {1: np.float64(0.6542857142857142), 5: np.float64(0.6985357142857144), 10: np.float64(0.7015266439909298)}





In [36]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L6-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM6_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [02:32<00:00,  9.15it/s]

k = 1
Number of queries in top 1: 902
Number of queries not in top 1: 498
k = 5
Number of queries in top 5: 1068
Number of queries not in top 5: 332
k = 10
Number of queries in top 10: 1102
Number of queries not in top 10: 298
Evaluation results: {1: np.float64(0.6442857142857142), 5: np.float64(0.6916785714285715), 10: np.float64(0.6949016439909298)}





In [37]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-TinyBERT-L2-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_tinyBERT_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [02:06<00:00, 11.08it/s]

k = 1
Number of queries in top 1: 789
Number of queries not in top 1: 611
k = 5
Number of queries in top 5: 1003
Number of queries not in top 5: 397
k = 10
Number of queries in top 10: 1070
Number of queries not in top 10: 330
Evaluation results: {1: np.float64(0.5635714285714286), 5: np.float64(0.6248452380952381), 10: np.float64(0.6315274943310658)}





In [38]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-electra-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet(f"data/reranked_results_electra_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:22<00:00,  4.34it/s]

k = 1
Number of queries in top 1: 734
Number of queries not in top 1: 666
k = 5
Number of queries in top 5: 1055
Number of queries not in top 5: 345
k = 10
Number of queries in top 10: 1105
Number of queries not in top 10: 295
Evaluation results: {1: np.float64(0.5242857142857142), 5: np.float64(0.6165119047619046), 10: np.float64(0.6216179138321996)}



