In [1]:
import pandas as pd
import bm25s
from pathlib import Path
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from mxbai_rerank import MxbaiRerankV2

  from .autonotebook import tqdm as notebook_tqdm


# Neural Reranking
This notebooks houses our finetuning and evaluation code for the neural reranking approach. A diverse set of pretrained reranker models were tested, including Alibaba’s gte-reranker-modernbert-base, multilingual versions, and others like Electra and MiniLM. This diversity allowed us to investigate which architectures and training schemes work best for our target dataset. We also investigated how an ensemble of these models perform in reranking_ensemble.ipynb.

## Data loading

In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [3]:
data_path = Path('data/')

collection_data_path = data_path / 'subtask4b_collection_data.pkl' 
query_dev_data_path = data_path / 'subtask4b_query_tweets_dev.tsv'
query_train_data_path = data_path / 'subtask4b_query_tweets_train.tsv'

In [4]:
df_collection = pd.read_pickle(collection_data_path)
df_collection.head(5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000
993,ycxyn2a2,PMC,What was the primary mode of smallpox transmis...,10.3389/fcimb.2012.00150,PMC3509329,23226686,cc-by,The mode of infection transmission has profoun...,2012-11-29,"Milton, Donald K.",Front Cell Infect Microbiol,,,,ycxyn2a2,2012-11-29,1354147200
1053,zxe95qy9,PMC,"Lessons from the History of Quarantine, from P...",10.3201/eid1902.120312,PMC3559034,23343512,no-cc,"In the new millennium, the centuries-old strat...",2013-02-03,"Tognotti, Eugenia",Emerg Infect Dis,,,,zxe95qy9,2013-02-03,1359849600


In [5]:
df_query_dev = pd.read_csv(query_dev_data_path, sep='\t')
df_query_train = pd.read_csv(query_train_data_path, sep='\t')
display(df_query_dev)
display(df_query_train)

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
1395,14193,Residents at high risk of covid-19: effectiven...,0gn3b98n
1396,14196,"61% of teenagers hospitalized for covid were ""...",25bdifv6
1397,14203,"""fresh evidence backing melatonin against covi...",qn6wawxk
1398,14233,"the vaccine doesn't halt the spread, it is pro...",3u3i5myh


Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69
...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


In [6]:
df_query_test = pd.read_csv(data_path / 'subtask4b_query_tweets_test.tsv', sep='\t')

In [7]:
df_query_combined = pd.concat([df_query_dev, df_query_train], ignore_index=True)
df_query_combined

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy
...,...,...,...
14248,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
14249,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
14250,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
14251,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


Based on the basic_data_analysis.ipynb author last names and publication year might also carry important information so we extract this information from the raw data.

In [8]:
df_collection['last_names'] = df_collection['authors'].str.split(';').apply(
    lambda authors: [author.split(',')[0].strip() for author in authors] if isinstance(authors, list) else authors
)
df_collection['last_names'] = df_collection['last_names'].apply(
    lambda x: '; '.join(x) if isinstance(x, list) else x
)
df_collection['publish_year'] = df_collection['publish_time'].str.split('-').str[0]

We investigate a neural reranking approach. This means we need a solid and very efficient baseline that we can then rerank. We used the best performing BM25 model from the traditional_approach.ipynb for this case.

In [9]:
def initialize_bm25(corpus: list[str], cord_uids:list[str], k1=1.5, b=0.75, method='lucene', stemmer=None):
    tokenized_corpus = bm25s.tokenize(corpus, stemmer=stemmer)
    bm25 = bm25s.BM25(corpus=cord_uids, k1=k1, b=b, method=method)
    bm25.index(tokenized_corpus)
    return bm25

In [10]:
def experiment_single_bm25(df_collection, df_query, k1=1.5, b=0.75, stemmer=None, k=10):
    corpus = df_collection.apply(
        lambda x: f"{x['title']} {x['abstract']} {x['last_names']} {x['journal']} {x['publish_year']}", axis=1
    ).tolist()
    bm25 = initialize_bm25(corpus, df_collection['cord_uid'].tolist(), k1, b, stemmer=stemmer)
    tokenized_queries = bm25s.tokenize(df_query['tweet_text'], stemmer=stemmer)
    doc_scores = bm25.retrieve(tokenized_queries, n_threads=-1, k=k)
    df_query['bm25_topk'] = doc_scores.documents.tolist()
    
    return df_query

In [11]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
        print(f"{k = }")
        in_topx = data["in_topx"] > 0
        print(f"Number of queries in top {k}: {in_topx.sum()}")
        print(f"Number of queries not in top {k}: {len(data) - in_topx.sum()}")
    return d_performance

In [12]:
def evaluate_experiment(df_query_train, df_query_dev, experiment_name, list_k=[1, 5, 10]):
    results = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, train: {results}")
    results = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk', list_k)
    print(f"Results for {experiment_name}, dev: {results}")
    return results

We also wanted to know how many documents are necessary from BM25 to get the best possible reranking results. This is why we investigated how many documents the reranker should rerank for the final submission.

In [None]:
stemmer = None
df_query_train_single = experiment_single_bm25(df_collection, df_query_train, stemmer=stemmer, k=1000)
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=1000)
evaluate_experiment(df_query_train_single, df_query_dev_single, "Single BM25 with all features", list_k=[1, 5, 10, 100, 200, 500, 700, 1000, 2000, 5000])

                                                                            

k = 1
Number of queries in top 1: 7542
Number of queries not in top 1: 5311
k = 5
Number of queries in top 5: 9094
Number of queries not in top 5: 3759
k = 10
Number of queries in top 10: 9630
Number of queries not in top 10: 3223
k = 100
Number of queries in top 100: 11156
Number of queries not in top 100: 1697
k = 200
Number of queries in top 200: 11512
Number of queries not in top 200: 1341
k = 500
Number of queries in top 500: 11956
Number of queries not in top 500: 897
k = 700
Number of queries in top 700: 12089
Number of queries not in top 700: 764
k = 1000
Number of queries in top 1000: 12210
Number of queries not in top 1000: 643
k = 2000
Number of queries in top 2000: 12210
Number of queries not in top 2000: 643
k = 5000
Number of queries in top 5000: 12210
Number of queries not in top 5000: 643
Results for Single BM25 with all features, train: {1: 0.5867890764801992, 5: 0.6347999170102959, 10: 0.6403735648153294, 100: 0.6449850219400036, 200: 0.6451818481542875, 500: 0.645294

{1: 0.5921428571428572,
 5: 0.6397619047619048,
 10: 0.6450986394557824,
 100: 0.6495240324689373,
 200: 0.6497567832248164,
 500: 0.6498352020281756,
 700: 0.6498638803520272,
 1000: 0.6498750349577517,
 2000: 0.6498750349577517,
 5000: 0.6498750349577517}

The experiment above shows us that there is no improvement after the first 1k (doesnt matter whether we take top 1k or top 2k etc.) -> so for each query we will prefilter the documents with BM25 to the top 1k and then the reranker will take over. As reranking 1k documents is very expensive for the more computationlly intensive methods, we only do this for the final submission.

## Reranking Inference code

We use the transformers library to do the inference.

In [14]:
def initialize_reranker(model_name, torch_dtype=torch.float16):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True, torch_dtype=torch.float16
    )
    if torch.cuda.is_available():
        print("Using GPU")
        model = model.to('cuda')
    model.eval()
    return tokenizer, model

We feed (query, document) pairs into the model. For each pair we compute the score and use this to rerank the documents based on the score. Our documents are represented by the title, abstract, journal and author last names.

In [15]:
def rerank_with_alibaba(df_query, df_collection, tokenizer, model, k=10, max_length=512, batch_size=32):
    reranked_results = []
    reranked_scores = []
    reranked_docs = []

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk'] 

        pairs = [
            [
                query_text, 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]
        
        batch_scores = []
        batch_docs = []

        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            with torch.no_grad():
                inputs = tokenizer(batch_pairs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
                if torch.cuda.is_available():
                    inputs = {key: value.to('cuda') for key, value in inputs.items()}
                scores = model(**inputs, return_dict=True).logits.view(-1).float()

            batch_scores.extend(scores.tolist())
            batch_docs.extend(topk_docs[i:i+batch_size])


        reranked = sorted(zip(batch_docs, batch_scores), key=lambda x: x[1], reverse=True)
        reranked_results.append([doc_id for doc_id, _ in reranked[:k]])
        reranked_scores.append([score for _, score in reranked])
        reranked_docs.append([doc_id for doc_id, _ in reranked])

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_docs
    df_query['reranked_scores'] = reranked_scores
    return df_query

The Mxbai model comes with its own library and API. The code does the same as the one above, but its rewritten according to the API. Unfortunately the library offers no entry point for finetuning the model, so we can only run inference with it (relevant for later).

In [16]:
def rerank_mxbai(df_query, df_collection, model_name, k=10, batch_size=1):
    reranked_results = []    
    reranked_scores = []

    model = MxbaiRerankV2(model_name, torch_dtype=torch.float16)

    for _, row in tqdm(df_query.iterrows(), total=len(df_query), desc="Reranking"):
        query_text = row['tweet_text']
        topk_docs = row['bm25_topk']  # Get top-k BM25 results for the query

        documents = [
            [
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'title'].values[0] + " " + 
                df_collection.loc[df_collection['cord_uid'] == doc_id, 'abstract'].values[0] + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'journal'].values[0]) + " " +
                str(df_collection.loc[df_collection['cord_uid'] == doc_id, 'last_names'].values[0]) 
            ]
            for doc_id in topk_docs
        ]

        results = model.rank(query_text, documents, return_documents=True, top_k=k, batch_size=batch_size)
        reranked = [topk_docs[result.index] for result in results]
        reranked_score = [result.score for result in results]

        reranked_results.append(reranked)
        reranked_scores.append(reranked_score)

    df_query['reranked_topk'] = reranked_results
    df_query['reranked_docs'] = reranked_results
    df_query['reranked_scores'] = reranked_scores
    
    return df_query

In [17]:
def evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10]):
    return get_performance_mrr(df_query, col_gold, col_pred, list_k)

## Experiments with the Baseline
To see whether our finetuning works we need a baseline first. For this reason we evaluate the performance of the pretrained models first (before doing any finetuning).
We chose the following models for our experiments:

- Alibaba-NLP/gte-reranker-modernbert-base (referred to as alibaba): This model is based on ModernBERT, a variant of BERT that improves performance through architectural tweaks and training optimizations. It combines a text embedding component with a reranking head, offering state-of-the-art results for its parameter size and has been shown to perform well on retrieval benchmarks.
- Alibaba-NLP/gte-multilingual-reranker-base, (alibaba multilingual): This model is trained on multilingual data and designed to handle queries and documents in multiple languages. We included it mainly to compare its performance against the English-only alibaba model and to check whether our dataset might contain any non-English content, as the dataset documentation does not specify the language.
- cross-encoder/ms-marco-TinyBERT-L2-v2 (tinyBERT): A compact reranking model based on TinyBERT, which is a distilled version of BERT designed to keep most of BERT’s accuracy while significantly reducing model size and inference time. It was trained on the MS MARCO Passage dataset. We mainly used this model during development and coding because it’s efficient and fast to train, yet still delivers decent results—making it ideal for quick experiments and debugging.
- cross-encoder/ms-marco-MiniLM-L6-v2 and cross-encoder/ms-marco-MiniLM-L12-v2 (miniLMx): These models use the MiniLM architecture, which employs deep self- attention distillation to deliver a lightweight transformer model. With 6 and 12 layers respectively, they offer a good trade-off between speed and accuracy. Both are pretrained and fine-tuned on MS MARCO.
- cross-encoder/ms-marco-electra-base (electra): This model uses ELECTRA as its backbone, which replaces traditional masked language modeling with a more sample-efficient pretraining method based on a discriminator network. ELECTRA models often outperform BERT in downstream tasks while being faster to train.
- mixedbread-ai/mxbai-rerank-base-v2 (mxbai): This was the best-performing model we could run on our hardware, but unfortunately, it doesn’t support fine-tuning through its API. It uses a sophisticated three-stage reinforcement learning approach inspired by DeepSeek: first, it learns binary relevance classification; next, it learns to capture semantic relationships via contrastive learning; and finally, it learns to optimize the ranking order.This multi-step training makes it especially effective for retrieval tasks.

In [18]:
df_query_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = "Alibaba-NLP/gte-reranker-modernbert-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:42<00:00,  3.48it/s]

k = 1
Number of queries in top 1: 945
Number of queries not in top 1: 455
k = 5
Number of queries in top 5: 1092
Number of queries not in top 5: 308
k = 10
Number of queries in top 10: 1120
Number of queries not in top 10: 280
Evaluation results: {1: 0.675, 5: 0.7175714285714285, 10: 0.7203741496598639}





In [20]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = "Alibaba-NLP/gte-multilingual-reranker-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet('data/reranked_results_alibaba_multilingual.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:05<00:00,  3.83it/s]

k = 1
Number of queries in top 1: 898
Number of queries not in top 1: 502
k = 5
Number of queries in top 5: 1068
Number of queries not in top 5: 332
k = 10
Number of queries in top 10: 1100
Number of queries not in top 10: 300
Evaluation results: {1: 0.6414285714285715, 5: 0.6917500000000001, 10: 0.6949143990929705}





In [21]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-TinyBERT-L2-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_tinyBERT.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[59up4v56, 82y56t7d, 8t2tic9n, n2kn7o67, 6mfd3...","[59up4v56, 82y56t7d, 8t2tic9n, n2kn7o67, 6mfd3...","[0.71435546875, 0.4404296875, 0.345458984375, ..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, kiq6xb6k, s2vckt2w, eay6qfhz, 70ine...","[r58aohnu, kiq6xb6k, s2vckt2w, eay6qfhz, 70ine...","[3.333984375, 0.349853515625, 0.17138671875, 0..."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, o47v5vgw, o877uul1, tz2shoso, gruir...","[sts48u9i, o47v5vgw, o877uul1, tz2shoso, gruir...","[0.309814453125, -0.06854248046875, -0.1237792..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",1.0,"[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, tx8yp...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, tx8yp...","[0.83642578125, 0.22509765625, 0.1030883789062..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",1.0,"[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[ybwwmyqy, rs3umc1x, ouvq2wpq, 3l6ipiwk, ierqf...","[0.22314453125, 0.08758544921875, -0.127197265..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [01:10<00:00, 19.94it/s]

k = 1
Number of queries in top 1: 784
Number of queries not in top 1: 616
k = 5
Number of queries in top 5: 993
Number of queries not in top 5: 407
k = 10
Number of queries in top 10: 1057
Number of queries not in top 10: 343
Evaluation results: {1: 0.56, 5: 0.6177857142857143, 10: 0.6241026077097507}





In [22]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L6-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM6.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[atji1xge, 82y56t7d, trrg1mnw, o4vvlmr4, 8t2ti...","[atji1xge, 82y56t7d, trrg1mnw, o4vvlmr4, 8t2ti...","[5.74609375, 5.4140625, 5.33203125, 5.13671875..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, 6zfpcm4j, tu1vevx9, yrowv62k, kiq6x...","[r58aohnu, 6zfpcm4j, tu1vevx9, yrowv62k, kiq6x...","[6.75390625, 1.7626953125, 1.083984375, 0.7802..."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, o47v5vgw, gruir7aw, 3xw4qjoy, o877u...","[sts48u9i, o47v5vgw, gruir7aw, 3xw4qjoy, o877u...","[-0.48095703125, -1.2470703125, -1.6162109375,..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",0.333333,"[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, sv48g...","[kdegnr6i, bn22k0p3, 3sr2exq9, k0f4cwig, sv48g...","[-0.0164794921875, -1.2412109375, -1.251953125..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",0.5,"[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[vabb2f26, ybwwmyqy, 3l6ipiwk, ouvq2wpq, buswb...","[4.51171875, 4.36328125, 4.16015625, 2.578125,..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [01:44<00:00, 13.43it/s]

k = 1
Number of queries in top 1: 807
Number of queries not in top 1: 593
k = 5
Number of queries in top 5: 1008
Number of queries not in top 5: 392
k = 10
Number of queries in top 10: 1068
Number of queries not in top 10: 332
Evaluation results: {1: 0.5764285714285714, 5: 0.630452380952381, 10: 0.6363630952380952}





In [25]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
display(df_query_dev_single.head(5))
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet("data/reranked_results_miniLM12.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,reranked_topk,reranked_docs,reranked_scores
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[atji1xge, mb18fj8a, 66g5lpm6, 59up4v56, gatxu...",0.0,"[8t2tic9n, 6mfd3n4s, o4vvlmr4, 5hxsagx6, styav...","[8t2tic9n, 6mfd3n4s, o4vvlmr4, 5hxsagx6, styav...","[7.15625, 7.0546875, 7.0234375, 6.7734375, 6.5..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, yrowv62k, s2vckt2w, j1ucr...",1.0,"[r58aohnu, yrowv62k, tgd6gy3z, tu1vevx9, pdiyq...","[r58aohnu, yrowv62k, tgd6gy3z, tu1vevx9, pdiyq...","[9.3984375, 3.0859375, 2.765625, 2.6796875, 2...."
2,73,I recall early on reading that researchers who...,sts48u9i,"[sgo76prc, sts48u9i, tz2shoso, gruir7aw, 3xw4q...",1.0,"[sts48u9i, gruir7aw, o47v5vgw, o877uul1, mbam5...","[sts48u9i, gruir7aw, o47v5vgw, o877uul1, mbam5...","[6.203125, 6.0390625, 5.0625, 4.8046875, 3.953..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, 1cpjqav4, k0f4c...",1.0,"[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, kdegn...","[3sr2exq9, k0f4cwig, sv48gjkk, 8j3bb6zx, kdegn...","[9.6796875, 5.7578125, 5.171875, 4.1875, 2.546..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[3l6ipiwk, vabb2f26, ouvq2wpq, lzddnb8j, ybwwm...",1.0,"[ybwwmyqy, ierqfgo5, ouvq2wpq, 3l6ipiwk, vabb2...","[ybwwmyqy, ierqfgo5, ouvq2wpq, 3l6ipiwk, vabb2...","[7.421875, 6.25, 5.9765625, 5.46875, 5.109375,..."


Using GPU


Reranking: 100%|██████████| 1400/1400 [02:27<00:00,  9.48it/s]

k = 1
Number of queries in top 1: 778
Number of queries not in top 1: 622
k = 5
Number of queries in top 5: 978
Number of queries not in top 5: 422
k = 10
Number of queries in top 10: 1047
Number of queries not in top 10: 353
Evaluation results: {1: 0.5557142857142857, 5: 0.61075, 10: 0.6173647959183672}





In [23]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = "cross-encoder/ms-marco-electra-base"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet("data/reranked_results_electra.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:27<00:00,  4.27it/s]

k = 1
Number of queries in top 1: 717
Number of queries not in top 1: 683
k = 5
Number of queries in top 5: 932
Number of queries not in top 5: 468
k = 10
Number of queries in top 10: 1031
Number of queries not in top 10: 369
Evaluation results: {1: 0.5121428571428571, 5: 0.5711309523809525, 10: 0.58031179138322}





In [24]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
df_query = rerank_mxbai(df_query_dev_single, df_collection, "mixedbread-ai/mxbai-rerank-base-v2", batch_size=1)
df_query.to_parquet("data/reranked_results_mxbai.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Reranking:   0%|          | 0/1400 [00:00<?, ?it/s]You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Reranking: 100%|██████████| 1400/1400 [27:41<00:00,  1.19s/it]

k = 1
Number of queries in top 1: 963
Number of queries not in top 1: 437
k = 5
Number of queries in top 5: 1100
Number of queries not in top 5: 300
k = 10
Number of queries in top 10: 1123
Number of queries not in top 10: 277
Evaluation results: {1: 0.6878571428571428, 5: 0.7266190476190477, 10: 0.7287573696145124}





## Finetuning

This section shows the code and our experiments for the reranker finetuning. We used the sentence_transformers library to write the finetuning code.

In [26]:
from datasets import Dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss, LambdaLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer


In [27]:
df_collection['answer'] = df_collection.apply(
    lambda row: f"{row['title']} {row['abstract']} {row['last_names']} {row['journal']}", axis=1
)
df_collection.head(3)   

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet,last_names,publish_year,answer
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600,van der Sande; Teunis; Sabel,2008,Professional and Home-Made Face Masks Reduce E...
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800,Li; Blakeley; Smith?,2011,The Failure of R (0) The basic reproductive ra...
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000,Singh; Sharma; Patel,2012,Pulmonary sequelae in a patient recovered from...


In [28]:
df_train = df_query_train.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_dev = df_query_dev.merge(df_collection[['cord_uid', 'answer']], on='cord_uid', how='left')
df_full = pd.concat([df_train, df_dev], ignore_index=True)
df_full.head(3)

Unnamed: 0,post_id,tweet_text,cord_uid,bm25_topk,in_topx,answer
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,"[htlvpvz5, h7hj64q5, 4aps0kvp, 5tkyir3r, 32z7b...",1.0,Oral Management in Rehabilitation Medicine: Or...
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,"[maj8r6ti, bjvg2ivr, 7tto4hr7, 2cwvga0k, 46je8...",0.003145,Variation in racial/ethnic disparities in COVI...
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"[jtwb17u8, veeavho5, jbpmbm9m, 8hkxbxz9, 32v44...",1.0,Effect of non-pharmaceutical interventions for...


In [29]:
train_dataset = Dataset.from_dict({
    "query": df_train['tweet_text'].tolist(),
    "document": df_train['answer'].tolist(),
})

dev_dataset = Dataset.from_dict({
    "query": df_dev['tweet_text'].tolist(),
    "document": df_dev['answer'].tolist(),
})
full_dataset = Dataset.from_dict({
    "query": df_full['tweet_text'].tolist(),
    "document": df_full['answer'].tolist(),
})

We use hard negative mining during training to give the model highly related but irrelevant documents such that the model is able to learn the differences between them.
Depending on our loss function we either feed pairs of (query, document, relevancy-label) or (query, [doc1, ...], [relevancy-label1, ...]) triplets.

The code for the training is relatively straightforward - its mostly just setting parameters. We save the best model according to the validation dataset. 

In [None]:
def get_hard_negatives(embedding_model_hard_negatives, num_hard_negatives, num_hard_negatives_eval, train_dataset, dev_dataset, df_full, batch_size=4096, loss='crossentropy', stricter_hard_negatives=False):

    embedding_model = SentenceTransformer(embedding_model_hard_negatives, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    if loss == 'crossentropy':
        output_format = "labeled-pair"
    elif loss == 'lambda':
        output_format = "labeled-list"
    else:
        raise ValueError("Invalid loss type. Use 'crossentropy' or 'lambda'.")
    
    if stricter_hard_negatives:
        hard_train_dataset = mine_hard_negatives(
            train_dataset,
            embedding_model,
            num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
            range_min=3,  # Skip the x most similar samples
            range_max=100,  # Consider only the x most similar samples
            max_score=0.95,  # Only consider samples with a similarity score of at most x
            margin=0.0,  # Similarity between query and negative samples should be x lower than query-positive similarity
            sampling_strategy="top",  # Randomly sample negatives from the range
            batch_size=batch_size,
            output_format=output_format, 
        )
    else:
        hard_train_dataset = mine_hard_negatives(
            train_dataset,
            embedding_model,
            num_negatives=num_hard_negatives,  # How many negatives per question-answer pair
            range_min=5,  # Skip the x most similar samples
            range_max=100,  # Consider only the x most similar samples
            max_score=0.8,  # Only consider samples with a similarity score of at most x
            margin=0.05,  # Similarity between query and negative samples should be x lower than query-positive similarity
            sampling_strategy="top",  # Randomly sample negatives from the range
            batch_size=batch_size,  
            output_format=output_format,
        )


    print("======================")
    hard_eval_dataset = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,  
        output_format=output_format,
    )

    print("======================")
    hard_eval_dataset_evaluator = mine_hard_negatives(
        dev_dataset,
        embedding_model,
        corpus=df_full["answer"],  # Use the full dataset as the corpus
        num_negatives=num_hard_negatives_eval,  # How many negatives per question-answer pair
        batch_size=batch_size,
        include_positives=True,  # Key: Include the positive answer in the list of negatives, needed for the evaluater to work correctly
    )

    return hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator


class ExperimentRunner:
    def __init__(self, model_to_finetune, train_batch_size, num_epochs, max_length, experiment_name):
        self.experiment_name = experiment_name
        self.model_to_finetune = model_to_finetune
        self.train_batch_size = train_batch_size
        self.num_epochs = num_epochs
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def run_experiment(
            self, 
            hard_train_dataset, 
            hard_eval_dataset, 
            hard_eval_dataset_evaluator, 
            num_hard_negatives, 
            trust_remote_code=False, 
            eval_steps=None, 
            loss='crossentropy',
            adaptive_learning_rate=False,
        ):

        if eval_steps is None:
            eval_steps = int(3000/self.train_batch_size)
        
        print(f"Eval steps: {eval_steps}")
        model = CrossEncoder(self.model_to_finetune, max_length=self.max_length, device=self.device, trust_remote_code=trust_remote_code)

        if adaptive_learning_rate:
            learning_rate = 2e-6
            weight_decay = 1e-7
            adam_epsilon = 1e-7
            warmup_ratio = 0.05
        else:
            learning_rate = 2e-5
            weight_decay = 0.0
            adam_epsilon = 1e-8
            warmup_ratio = 0.1

        args = CrossEncoderTrainingArguments(
            output_dir=f"model/{self.model_to_finetune.split("/")[-1]}{self.experiment_name}",
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.train_batch_size,
            per_device_eval_batch_size=self.train_batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            adam_epsilon=adam_epsilon,
            warmup_ratio=warmup_ratio,
            fp16=True,  
            bf16=False, 
            batch_sampler=BatchSamplers.NO_DUPLICATES, 
            eval_strategy="steps",
            eval_steps=eval_steps,
            save_strategy="steps",
            save_steps=eval_steps,
            save_total_limit=10,
            logging_steps=eval_steps,
            run_name=self.model_to_finetune.split("/")[-1],
            load_best_model_at_end=True,
            metric_for_best_model="eval_dev_set_mrr@10",
            greater_is_better=False,
            eval_on_start=True,
        )
        
        reranking_evaluator = CrossEncoderRerankingEvaluator(
            samples=[
                {
                    "query": sample["query"],
                    "positive": [sample["document"]],
                    "documents": [sample[column_name] for column_name in hard_eval_dataset_evaluator.column_names if 'negative' in column_name],
                }
                for sample in hard_eval_dataset_evaluator
            ],
            batch_size=self.train_batch_size,
            name="dev_set",
            show_progress_bar=True,
        )

        if loss == 'crossentropy':
            loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))
        elif loss == 'lambda':
            loss = LambdaLoss(model=model, k=5)
        else:
            raise ValueError("Invalid loss type. Use 'crossentropy' or 'lambda'.")
        
        trainer = CrossEncoderTrainer(
            model=model,
            args=args,
            train_dataset=hard_train_dataset,
            eval_dataset=hard_eval_dataset,
            loss=loss,
            evaluator=reranking_evaluator,
    
        )

        trainer.train()

        model.save(f"models/{self.model_to_finetune.split('/')[-1]}-finetuned{self.experiment_name}")

In [None]:
num_epochs = 2
# embedding_model_hard_negatives = "sentence-transformers/static-retrieval-mrl-en-v1"
embedding_model_hard_negatives = "BAAI/bge-small-en-v1.5"
loss = 'lambda'
stricter_hard_negatives = True
adaptive_learning_rate = True
experiment_name = f"_{embedding_model_hard_negatives.split('/')[-1]}_epochs{num_epochs}_{loss}loss_learningrate"

This code actually mines the hard negatives.

In [None]:
num_hard_negatives = 5
hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(
    embedding_model_hard_negatives, 
    num_hard_negatives=num_hard_negatives, 
    num_hard_negatives_eval=5, 
    train_dataset=train_dataset, 
    dev_dataset=dev_dataset, 
    df_full=df_full, 
    batch_size=64, 
    loss=loss,
    stricter_hard_negatives=stricter_hard_negatives
    )

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.0`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:41<00:00,  2.60it/s]
Batches: 100%|██████████| 201/201 [00:09<00:00, 20.55it/s]


Metric       Positive       Negative     Difference
Count          12,853         56,528               
Mean           0.8195         0.7853         0.0519
Median         0.8329         0.7960         0.0405
Std            0.0826         0.0539         0.0473
Min            0.3809         0.5096        -0.0630
25%            0.7696         0.7537         0.0122
50%            0.8329         0.7960         0.0405
75%            0.8834         0.8233         0.0781
Max            0.9881         0.9207         0.2992
Skipped 237,587 potential negatives (17.96%) due to the absolute_margin of 0.0.
Skipped 4 potential negatives (0.00%) due to the max_score of 0.95.
Could not find enough negatives for 7737 samples (12.04%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positiv

Batches: 100%|██████████| 121/121 [00:46<00:00,  2.60it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 21.45it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8033         0.0186
Median         0.8323         0.8101         0.0168
Std            0.0783         0.0505         0.0640
Min            0.4977         0.5529        -0.1856
25%            0.7717         0.7739        -0.0220
50%            0.8323         0.8101         0.0168
75%            0.8846         0.8388         0.0593
Max            0.9681         0.9451         0.2385
Setting range_max to 7 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:47<00:00,  2.56it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 21.51it/s]


Metric       Positive       Negative     Difference
Count           1,400          7,000               
Mean           0.8219         0.8117         0.0102
Median         0.8323         0.8156         0.0000
Std            0.0783         0.0543         0.0593
Min            0.4977         0.5593        -0.1856
25%            0.7717         0.7790        -0.0220
50%            0.8323         0.8156         0.0000
75%            0.8846         0.8466         0.0443
Max            0.9681         0.9681         0.2377


In [24]:
hard_train_dataset

Dataset({
    features: ['query', 'document', 'labels'],
    num_rows: 11322
})

In [25]:
hard_eval_dataset

Dataset({
    features: ['query', 'document', 'labels'],
    num_rows: 1400
})

In [26]:
hard_eval_dataset_evaluator

Dataset({
    features: ['query', 'document', 'negative_1', 'negative_2', 'negative_3', 'negative_4', 'negative_5'],
    num_rows: 1400
})

### Experiments
In the following cells we finetune each of the models based on the parameters defined above. We set the batch size as high as possible such that we (just barely) dont run into OOM errors.

In [None]:
train_batch_size = 128
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-TinyBERT-L2-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

Eval steps: 22


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.365992,0.756345,0.756345,0.815318,0.568988,0.568988,0.595057
22,0.688500,1.36159,0.754798,0.754798,0.814166,0.568988,0.568988,0.595057
44,0.677600,1.346526,0.753988,0.753988,0.813541,0.568988,0.568988,0.595057
66,0.616600,1.34266,0.754012,0.754012,0.813548,0.568988,0.568988,0.595057
88,0.584200,1.340421,0.754702,0.754702,0.814057,0.568988,0.568988,0.595057
110,0.602200,1.327426,0.753988,0.753988,0.813503,0.568988,0.568988,0.595057


                                                                        

In [None]:
train_batch_size = 4
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L12-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

Eval steps: 750


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.317779,0.740036,0.740036,0.802646,0.568988,0.568988,0.595057
750,0.237300,1.212988,0.78375,0.78375,0.835814,0.568988,0.568988,0.595057
1500,0.088200,1.292978,0.788357,0.788357,0.839327,0.568988,0.568988,0.595057
2250,0.082200,1.354321,0.788595,0.788595,0.839466,0.568988,0.568988,0.595057
3000,0.071900,1.403692,0.78844,0.78844,0.839341,0.568988,0.568988,0.595057
3750,0.078000,1.422006,0.790524,0.790524,0.840919,0.568988,0.568988,0.595057


                                                                       

In [None]:
train_batch_size = 8
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-MiniLM-L6-v2"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

Eval steps: 375


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.235787,0.759298,0.759298,0.817261,0.568988,0.568988,0.595057
375,0.250400,1.146599,0.777667,0.777667,0.831213,0.568988,0.568988,0.595057
750,0.143700,1.216257,0.782012,0.782012,0.834577,0.568988,0.568988,0.595057
1125,0.120500,1.275048,0.78419,0.78419,0.836246,0.568988,0.568988,0.595057
1500,0.097000,1.318414,0.78556,0.78556,0.837306,0.568988,0.568988,0.595057
1875,0.108400,1.317869,0.785512,0.785512,0.837261,0.568988,0.568988,0.595057


                                                                       

In [None]:

train_batch_size = 1
max_length = 512
model_to_finetune = "cross-encoder/ms-marco-electra-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,,0.716131,0.716131,0.784809,0.568988,0.568988,0.595057
3000,0.275200,,0.774929,0.774929,0.829386,0.568988,0.568988,0.595057
6000,0.087500,,0.775321,0.775321,0.829746,0.568988,0.568988,0.595057
9000,0.075400,,0.775964,0.775964,0.830208,0.568988,0.568988,0.595057
12000,0.042900,,0.783214,0.783214,0.835638,0.568988,0.568988,0.595057
15000,0.040800,,0.775857,0.776214,0.83043,0.568988,0.568988,0.595057


                                                                       

In [None]:
train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-reranker-modernbert-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)

exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,,0.838726,0.838726,0.878283,0.568988,0.568988,0.595057
3000,0.397400,,0.806345,0.832417,0.875308,0.568988,0.568988,0.595057
6000,0.338600,,0.839536,0.859417,0.891746,0.568988,0.568988,0.595057
9000,0.343900,,0.837167,0.858,0.889666,0.568988,0.568988,0.595057
12000,0.246700,,0.799464,0.830179,0.875662,0.568988,0.568988,0.595057
15000,0.154600,,0.809155,0.83719,0.880746,0.568988,0.568988,0.595057
18000,0.156500,,0.817738,0.841905,0.881764,0.568988,0.568988,0.595057
21000,0.143800,,0.804155,0.834095,0.877099,0.568988,0.568988,0.595057


                                                                       

In [None]:
num_hard_negatives = 2
num_hard_negatives_eval = 2

hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator = get_hard_negatives(
    embedding_model_hard_negatives, 
    num_hard_negatives=num_hard_negatives, 
    num_hard_negatives_eval=num_hard_negatives_eval, 
    train_dataset=train_dataset, 
    dev_dataset=dev_dataset, 
    df_full=df_full, 
    loss=loss, 
    batch_size=64,
    stricter_hard_negatives=stricter_hard_negatives
    )

train_batch_size = 1
max_length = 512
model_to_finetune = "Alibaba-NLP/gte-multilingual-reranker-base"

exp_runner = ExperimentRunner(
    model_to_finetune,
    train_batch_size,
    num_epochs,
    max_length,
    experiment_name=experiment_name
)
exp_runner.run_experiment(hard_train_dataset, hard_eval_dataset, hard_eval_dataset_evaluator, num_hard_negatives, trust_remote_code=True, loss=loss, adaptive_learning_rate=adaptive_learning_rate)

The `margin` parameter is deprecated. Use the `absolute_margin` and/or `relative_margin` parameter instead. Setting `absolute_margin` to `0.05`.


Found 12842 unique queries out of 12853 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 109/109 [00:41<00:00,  2.65it/s]
Batches: 100%|██████████| 201/201 [00:09<00:00, 21.41it/s]


Metric       Positive       Negative     Difference
Count          12,853         15,746               
Mean           0.8195         0.7637         0.0914
Median         0.8329         0.7830         0.0814
Std            0.0826         0.0427         0.0388
Min            0.3809         0.5047        -0.0924
25%            0.7696         0.7428         0.0570
50%            0.8329         0.7830         0.0814
75%            0.8834         0.7956         0.1156
Max            0.9881         0.7998         0.2959
Skipped 561,379 potential negatives (42.44%) due to the absolute_margin of 0.05.
Skipped 146,681 potential negatives (19.27%) due to the max_score of 0.8.
Could not find enough negatives for 9960 samples (38.75%). Consider adjusting the range_max, range_min, absolute_margin and max_score parameters if you'd like to find more valid negatives.
Setting range_max to 4 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 

Batches: 100%|██████████| 121/121 [00:45<00:00,  2.64it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.23it/s]
When using `include_positives=True`, `output_format` will be set to `"n-tuple"` to ensure that the ranking order is preserved.


Metric       Positive       Negative     Difference
Count           1,400          2,800               
Mean           0.8219         0.8132         0.0087
Median         0.8323         0.8191         0.0069
Std            0.0783         0.0501         0.0625
Min            0.4977         0.5707        -0.1856
25%            0.7717         0.7844        -0.0299
50%            0.8323         0.8192         0.0069
75%            0.8846         0.8478         0.0478
Max            0.9681         0.9451         0.2158
Setting range_max to 4 based on the provided parameters.
Found 1399 unique queries out of 1400 total queries.
Found an average of 1.001 positives per query.


Batches: 100%|██████████| 121/121 [00:45<00:00,  2.64it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.19it/s]


Metric       Positive       Negative     Difference
Count           1,400          2,800               
Mean           0.8219         0.8294        -0.0075
Median         0.8323         0.8332        -0.0000
Std            0.0783         0.0559         0.0501
Min            0.4977         0.5787        -0.1856
25%            0.7717         0.7953        -0.0299
50%            0.8323         0.8332        -0.0000
75%            0.8846         0.8678         0.0004
Max            0.9681         0.9681         0.1977
Eval steps: 3000


Step,Training Loss,Validation Loss,Dev Set Map,Dev Set Mrr@10,Dev Set Ndcg@10,Dev Set Base Map,Dev Set Base Mrr@10,Dev Set Base Ndcg@10
0,No log,1.872254,0.874524,0.874524,0.906698,0.543929,0.543929,0.554684
3000,0.226200,2.880633,0.858929,0.858929,0.895107,0.543929,0.543929,0.554684
6000,0.095300,3.151024,0.857024,0.857024,0.893594,0.543929,0.543929,0.554684
9000,0.051500,3.20767,0.863929,0.863929,0.898713,0.543929,0.543929,0.554684
12000,0.037800,3.293346,0.86631,0.86631,0.900465,0.543929,0.543929,0.554684
15000,0.018200,3.282988,0.867024,0.867024,0.901009,0.543929,0.543929,0.554684


                                                                       

### Reranking with finetuned models

Now, after finetuning the models, we are able to run the inference with the newly trained models. Since we are making use of computationally intensive rerankers, we first need to rank the results with an efficient first-stage ranker. We use our best performing BM25 for this purpose. We feed the top 30 documents according to BM25 into the reranking model, save the predictions and print the evaluation metrics.

In [None]:
df_query_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-reranker-modernbert-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:42<00:00,  3.48it/s]

k = 1
Number of queries in top 1: 980
Number of queries not in top 1: 420
k = 5
Number of queries in top 5: 1113
Number of queries not in top 5: 287
k = 10
Number of queries in top 10: 1123
Number of queries not in top 10: 277
Evaluation results: {1: 0.7, 5: 0.7398095238095238, 10: 0.7407922335600907}





In [25]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/gte-multilingual-reranker-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f'data/reranked_results_alibaba_multilingual_finetuned{experiment_name}.parquet')
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [06:03<00:00,  3.86it/s]

k = 1
Number of queries in top 1: 890
Number of queries not in top 1: 510
k = 5
Number of queries in top 5: 1088
Number of queries not in top 5: 312
k = 10
Number of queries in top 10: 1118
Number of queries not in top 10: 282
Evaluation results: {1: 0.6357142857142857, 5: 0.693952380952381, 10: 0.6968236961451246}





In [26]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L12-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM12_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [02:27<00:00,  9.48it/s]

k = 1
Number of queries in top 1: 874
Number of queries not in top 1: 526
k = 5
Number of queries in top 5: 1036
Number of queries not in top 5: 364
k = 10
Number of queries in top 10: 1088
Number of queries not in top 10: 312
Evaluation results: {1: 0.6242857142857143, 5: 0.6715, 10: 0.6765413832199546}





In [27]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-MiniLM-L6-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_miniLM6_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [01:45<00:00, 13.24it/s]

k = 1
Number of queries in top 1: 861
Number of queries not in top 1: 539
k = 5
Number of queries in top 5: 1037
Number of queries not in top 5: 363
k = 10
Number of queries in top 10: 1090
Number of queries not in top 10: 310
Evaluation results: {1: 0.615, 5: 0.6652380952380952, 10: 0.6704767573696144}





In [28]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-TinyBERT-L2-v2-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=100)
df_query.to_parquet(f"data/reranked_results_tinyBERT_finetuned{experiment_name}.parquet")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [01:09<00:00, 20.11it/s]

k = 1
Number of queries in top 1: 794
Number of queries not in top 1: 606
k = 5
Number of queries in top 5: 995
Number of queries not in top 5: 405
k = 10
Number of queries in top 10: 1059
Number of queries not in top 10: 341
Evaluation results: {1: 0.5671428571428572, 5: 0.6222619047619047, 10: 0.6285807823129252}





In [29]:
df_query_dev_single = experiment_single_bm25(df_collection, df_query_dev, stemmer=stemmer, k=30)
model_name = f"models/ms-marco-electra-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_dev_single, df_collection, tokenizer, model, k=10, batch_size=50, max_length=None )
df_query.to_parquet(f"data/reranked_results_electra_finetuned.parquet{experiment_name}")
results = evaluate_reranked_results(df_query)
print(f"Evaluation results: {results}")

                                                                            

Using GPU


Reranking: 100%|██████████| 1400/1400 [05:38<00:00,  4.14it/s]

k = 1
Number of queries in top 1: 872
Number of queries not in top 1: 528
k = 5
Number of queries in top 5: 1055
Number of queries not in top 5: 345
k = 10
Number of queries in top 10: 1106
Number of queries not in top 10: 294
Evaluation results: {1: 0.6228571428571429, 5: 0.6750357142857143, 10: 0.6799353741496599}





## Predictions on the testset
We use our best performing finetuned model and let it rerank the top 1k BM25 documents. Since the finetuned model is significantly better than BM25, feeding more documents into the reranker should lead to a better performance.

In [None]:
df_query_single = experiment_single_bm25(df_collection, df_query_test, stemmer=stemmer, k=1000)
model_name = f"models/gte-reranker-modernbert-base-finetuned{experiment_name}"
tokenizer, model = initialize_reranker(model_name)
df_query = rerank_with_alibaba(df_query_single, df_collection, tokenizer, model, k=10, batch_size=256)
df_query.to_parquet(f'data/reranked_results_alibaba_finetuned{experiment_name}_testset.parquet')

                                                                            

Using GPU


Reranking: 100%|██████████| 1446/1446 [3:59:53<00:00,  9.95s/it]  


KeyError: 'cord_uid'

In [36]:
df_query['preds'] = df_query['reranked_topk'].apply(lambda x: x[:5])

In [37]:
df_query

Unnamed: 0,post_id,tweet_text,bm25_topk,reranked_topk,reranked_docs,reranked_scores,preds
0,1,A recent research study published yesterday cl...,"[bttme4wn, 8fkzc445, nswj8x43, uifnjio5, j0bu0...","[x4zuv4jo, j0bu0upi, tpic8ddl, qgwu9fsk, 1qvkl...","[x4zuv4jo, j0bu0upi, tpic8ddl, qgwu9fsk, 1qvkl...","[6.66796875, 5.98046875, 5.875, 4.734375, 3.42...","[x4zuv4jo, j0bu0upi, tpic8ddl, qgwu9fsk, 1qvkl..."
1,2,"""We should track the long-term effects of thes...","[evf9nz05, 65n6p550, f6vau5s6, mnsm39a8, 5vp2r...","[evf9nz05, 7keaoi4d, vtagfrds, ot1uz6vc, mnsm3...","[evf9nz05, 7keaoi4d, vtagfrds, ot1uz6vc, mnsm3...","[25.59375, -6.83984375, -8.2265625, -9.546875,...","[evf9nz05, 7keaoi4d, vtagfrds, ot1uz6vc, mnsm3..."
2,3,"the agony of ""long haul"" covid-19 symptoms.","[376x58k1, 00ugdhvf, e6m1tkrs, ofavwj52, t2gxk...","[2kd89h12, m3m2n3fw, 00ugdhvf, ky5env7t, 82y56...","[2kd89h12, m3m2n3fw, 00ugdhvf, ky5env7t, 82y56...","[24.421875, 23.828125, 20.203125, 14.3828125, ...","[2kd89h12, m3m2n3fw, 00ugdhvf, ky5env7t, 82y56..."
3,4,Home and online monitoring and assessment of b...,"[ru2ty1y9, wabd3b9z, enlj85zc, bnkggl84, 609b8...","[ru2ty1y9, wabd3b9z, ir874dkj, mim419b8, oqd5o...","[ru2ty1y9, wabd3b9z, ir874dkj, mim419b8, oqd5o...","[36.71875, -6.00390625, -10.2578125, -14.85156...","[ru2ty1y9, wabd3b9z, ir874dkj, mim419b8, oqd5o..."
4,5,"it may be a long one, folks! to avoid exceedin...","[f5p37j7g, nzat41wu, jnz5by5w, n0uy6hd2, g283l...","[f5p37j7g, nzat41wu, 9mact9br, l70ghm8g, zsra2...","[f5p37j7g, nzat41wu, 9mact9br, l70ghm8g, zsra2...","[22.875, 16.796875, -2.7890625, -5.921875, -10...","[f5p37j7g, nzat41wu, 9mact9br, l70ghm8g, zsra2..."
...,...,...,...,...,...,...,...
1441,1442,"Clinical presentations, predisposing factors, ...","[ohyvuybc, 06fwhyac, x64zft78, 18b6ikq3, 5rpd8...","[ohyvuybc, 06fwhyac, x64zft78, vq9m9m94, 18b6i...","[ohyvuybc, 06fwhyac, x64zft78, vq9m9m94, 18b6i...","[33.53125, 20.203125, 14.453125, 12.9296875, 4...","[ohyvuybc, 06fwhyac, x64zft78, vq9m9m94, 18b6i..."
1442,1443,risk factors for post-covid-19 condition in ho...,"[pwb7rw89, 32ut5vr7, ce1a9k8b, 5dk3pslc, 8ymu4...","[32ut5vr7, pwb7rw89, hqjphv5g, dh3zqjbz, dmqk2...","[32ut5vr7, pwb7rw89, hqjphv5g, dh3zqjbz, dmqk2...","[30.890625, 7.5, 2.689453125, -1.810546875, -7...","[32ut5vr7, pwb7rw89, hqjphv5g, dh3zqjbz, dmqk2..."
1443,1444,do not assume children are less susceptible.,"[st67fvgk, 9rczqcaz, wtxhbzr9, 7ru6tapp, du8wm...","[9rczqcaz, 35xqlryc, uo3ww8j4, t3b2adct, st67f...","[9rczqcaz, 35xqlryc, uo3ww8j4, t3b2adct, st67f...","[19.140625, 14.7578125, 11.8046875, 10.65625, ...","[9rczqcaz, 35xqlryc, uo3ww8j4, t3b2adct, st67f..."
1444,1445,eurosurveillance | estimated number of fatalit...,"[leg5ntvu, bnhwbaqj, qlejjw9s, 6l3coibe, 2cwvg...","[leg5ntvu, o1xsjx2p, 79xnx9gr, bnhwbaqj, l6lhm...","[leg5ntvu, o1xsjx2p, 79xnx9gr, bnhwbaqj, l6lhm...","[36.78125, -12.453125, -14.1796875, -15.617187...","[leg5ntvu, o1xsjx2p, 79xnx9gr, bnhwbaqj, l6lhm..."


Ultimately we save the predictions into the format required by the competition.

In [42]:
df_query[['post_id', 'preds']].to_csv('predictions.tsv', index=None, sep='\t')

In [43]:
experiment_name

'_bge-small-en-v1.5_epochs2_lambdaloss_learningrate'