In [66]:
from pathlib import Path
import pandas as pd

In [67]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
        print(f"{k = }")
        in_topx = data["in_topx"] > 0
        print(f"Number of queries in top {k}: {in_topx.sum()}")
        print(f"Number of queries not in top {k}: {len(data) - in_topx.sum()}")
    return d_performance

In [68]:
def evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10]):
    return get_performance_mrr(df_query, col_gold, col_pred, list_k)

In [69]:
path = Path('data')
files = list(path.glob('*.parquet'))

results = []
for file in files:
    print(f"Processing file: {file.name}")
    df_query = pd.read_parquet(file)
    if 'reranked_topk' in df_query.columns:
        if 'cord_uid' not in df_query.columns:
            print(f"Column 'cord_uid' not found in {file.name}. Skipping evaluation.")
            continue
        performance = evaluate_reranked_results(df_query, col_gold='cord_uid', col_pred='reranked_topk', list_k=[1, 5, 10])
        results.append({
            'name': file.name,
            "MRR@1": performance[1],
            "MRR@5": performance[5],
            "MRR@10": performance[10]
        })
    else:
        print(f"Column 'reranked_topk' not found in {file.name}. Skipping evaluation.")

df_eval = pd.DataFrame(results, columns=['name', "MRR@1", "MRR@5", "MRR@10"])
df_eval


Processing file: reranked_results_alibaba_finetuned_bge-small-en-v1.5_epochs2_crossentropyloss.parquet
k = 1
Number of queries in top 1: 912
Number of queries not in top 1: 488
k = 5
Number of queries in top 5: 1078
Number of queries not in top 5: 322
k = 10
Number of queries in top 10: 1111
Number of queries not in top 10: 289
Processing file: reranked_results_miniLM12_finetuned_bge-small-en-v1.5_epochs2_crossentropyloss.parquet
k = 1
Number of queries in top 1: 912
Number of queries not in top 1: 488
k = 5
Number of queries in top 5: 1072
Number of queries not in top 5: 328
k = 10
Number of queries in top 10: 1109
Number of queries not in top 10: 291
Processing file: reranked_results_tinyBERT_finetuned_bge-small-en-v1.5_epochs2_crossentropyloss.parquet
k = 1
Number of queries in top 1: 788
Number of queries not in top 1: 612
k = 5
Number of queries in top 5: 1010
Number of queries not in top 5: 390
k = 10
Number of queries in top 10: 1071
Number of queries not in top 10: 329
Processi

Unnamed: 0,name,MRR@1,MRR@5,MRR@10
0,reranked_results_alibaba_finetuned_bge-small-e...,0.651429,0.698833,0.702113
1,reranked_results_miniLM12_finetuned_bge-small-...,0.651429,0.697357,0.700896
2,reranked_results_tinyBERT_finetuned_bge-small-...,0.562857,0.625786,0.631567
3,reranked_results_alibaba_multilingual_finetune...,0.646429,0.691655,0.695531
4,reranked_results_alibaba_finetuned_bge-small-e...,0.671429,0.715643,0.718431
5,reranked_results_miniLM12_finetuned_bge-small-...,0.654286,0.698536,0.701527
6,reranked_results_tinyBERT_finetuned_bge-small-...,0.563571,0.624845,0.631527
7,reranked_results_electra_finetuned_bge-small-e...,0.642857,0.692107,0.695505
8,reranked_results_electra_finetuned_bge-small-e...,0.524286,0.616512,0.621618
9,reranked_results_miniLM6_finetuned_bge-small-e...,0.637143,0.688429,0.691478


In [70]:
df = df_eval.copy()

In [71]:
df['is_finetuned'] = df['name'].apply(lambda x: 'finetuned' in x)
df
# remove 'finetuned from name
df['name'] = df['name'].apply(lambda x: x.replace('finetuned', '').replace('.parquet', '').replace('reranked_results_', '').strip('_'))


In [72]:
def extract_loss_type(filename):
    if 'lambdaloss' in filename:
        return 'lambda'
    elif 'crossentropyloss' in filename:
        return 'crossentropy'
    else:
        return 'unknown'
    
df['loss_type'] = df['name'].apply(extract_loss_type)

# remove 'lambdaloss' and 'crossentropyloss' from name
df['name'] = df['name'].apply(lambda x: x.replace('lambdaloss', '').replace('crossentropyloss', '').strip('_'))


In [73]:
def extract_embedding_model(filename):
    if 'static-retrieval-mrl-en-v1' in filename:
        return 'static-retrieval-mrl-en-v1'
    elif 'bge-small-en-v1.5' in filename:
        return 'bge-small-en-v1.5'
    else:
        return 'unknown'
    
df['embedding_model'] = df['name'].apply(extract_embedding_model)


# remove 'retrieval-mrl-en-v1' and 'bge-small-en-v1.5' from name
df['name'] = df['name'].apply(lambda x: x.replace('static-retrieval-mrl-en-v1', '').replace('bge-small-en-v1.5', '').strip('_'))


In [74]:
def extract_num_epochs_model(filename):
    if 'epochs2' in filename:
        return 'epochs2'
    elif 'epochs1' in filename:
        return 'epochs1'
    else:
        return 'unknown'
    
df['epochs'] = df['name'].apply(extract_num_epochs_model)


# remove 'retrieval-mrl-en-v1' and 'bge-small-en-v1.5' from name
df['name'] = df['name'].apply(lambda x: x.replace('epochs2', '').replace('epochs1', '').strip('_'))


In [75]:
df['has_stricter_embedding'] = df['name'].apply(lambda x: 'stricter embedding' in x)
df
# remove 'finetuned from name
df['name'] = df['name'].apply(lambda x: x.replace('stricter embedding', '').strip('_'))


In [76]:
df['learningrate'] = df['name'].apply(lambda x: 'learningrate' in x)
df
# remove 'finetuned from name
df['name'] = df['name'].apply(lambda x: x.replace('learningrate', '').strip('_'))



# learning rate has stricter embedding

In [77]:
def extractk(filename):
    if 'k1000' in filename:
        return '1000'
    elif 'k100' in filename:
        return '100'
    else:
        return '30'
    
df['k'] = df['name'].apply(extractk)


# remove 'retrieval-mrl-en-v1' and 'bge-small-en-v1.5' from name
df['name'] = df['name'].apply(lambda x: x.replace('k1000', '').replace('k100', '').strip('_'))


In [78]:
# get only lines where is_finetuned is false
df_base = df[df['is_finetuned'] == False].reset_index(drop=True).sort_values(by=['name'])
df_base

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.675,0.717571,0.720374,False,unknown,unknown,unknown,False,False,30
1,alibaba_multilingual,0.641429,0.69175,0.694914,False,unknown,unknown,unknown,False,False,30
2,electra,0.512143,0.571131,0.580312,False,unknown,unknown,unknown,False,False,30
3,miniLM12,0.555714,0.61075,0.617365,False,unknown,unknown,unknown,False,False,30
4,miniLM6,0.576429,0.630452,0.636363,False,unknown,unknown,unknown,False,False,30
5,mxbai,0.687857,0.726619,0.728757,False,unknown,unknown,unknown,False,False,30
6,tinyBERT,0.56,0.617786,0.624103,False,unknown,unknown,unknown,False,False,30


In [79]:
# get only lines where has_stricter_embedding is False, embedding_model is unknown or static-retrieval-mrl-en-v1
df_finetuning = df[(~df['has_stricter_embedding']) & (df['embedding_model'].isin(['unknown', 'static-retrieval-mrl-en-v1']))].sort_values(by=['name', 'is_finetuned']).reset_index(drop=True)
df_finetuning


Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.675,0.717571,0.720374,False,unknown,unknown,unknown,False,False,30
1,alibaba,0.673571,0.716905,0.719349,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
2,alibaba_multilingual,0.641429,0.69175,0.694914,False,unknown,unknown,unknown,False,False,30
3,alibaba_multilingual,0.63,0.683357,0.687158,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
4,electra,0.512143,0.571131,0.580312,False,unknown,unknown,unknown,False,False,30
5,electra,0.642857,0.69531,0.69788,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
6,miniLM12,0.555714,0.61075,0.617365,False,unknown,unknown,unknown,False,False,30
7,miniLM12,0.660714,0.70181,0.706007,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
8,miniLM6,0.576429,0.630452,0.636363,False,unknown,unknown,unknown,False,False,30
9,miniLM6,0.652857,0.698857,0.702219,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30


In [80]:
df_embedding = df[(df['is_finetuned'] == True) & (df['loss_type'] == "crossentropy") & (~df["has_stricter_embedding"])].sort_values(by=['name']).reset_index(drop=True)
df_embedding

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.651429,0.698833,0.702113,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
1,alibaba,0.673571,0.716905,0.719349,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
2,alibaba_multilingual,0.646429,0.691655,0.695531,True,crossentropy,bge-small-en-v1.5,epochs1,False,False,30
3,alibaba_multilingual,0.63,0.683357,0.687158,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
4,electra,0.642857,0.692107,0.695505,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
5,electra,0.642857,0.69531,0.69788,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
6,miniLM12,0.651429,0.697357,0.700896,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
7,miniLM12,0.660714,0.70181,0.706007,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30
8,miniLM6,0.637143,0.688429,0.691478,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
9,miniLM6,0.652857,0.698857,0.702219,True,crossentropy,static-retrieval-mrl-en-v1,epochs2,False,False,30


In [81]:
df_loss = df[(df['is_finetuned'] == True) & (~df["has_stricter_embedding"]) & (df['embedding_model'] == 'bge-small-en-v1.5') & (~df["learningrate"])].sort_values(by=['name']).reset_index(drop=True)
df_loss

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.651429,0.698833,0.702113,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
1,alibaba,0.679286,0.720357,0.722709,True,lambda,bge-small-en-v1.5,epochs2,False,False,30
2,alibaba_multilingual,0.646429,0.691655,0.695531,True,crossentropy,bge-small-en-v1.5,epochs1,False,False,30
3,alibaba_multilingual,0.635714,0.693952,0.696824,True,lambda,bge-small-en-v1.5,epochs2,False,False,30
4,electra,0.642857,0.692107,0.695505,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
5,electra,0.622857,0.675036,0.679935,True,lambda,bge-small-en-v1.5,epochs2,False,False,30
6,miniLM12,0.651429,0.697357,0.700896,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
7,miniLM12,0.624286,0.6715,0.676541,True,lambda,bge-small-en-v1.5,epochs2,False,False,30
8,miniLM6,0.637143,0.688429,0.691478,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
9,miniLM6,0.615,0.665238,0.670477,True,lambda,bge-small-en-v1.5,epochs2,False,False,30


In [82]:
df_strict_embeddings = df[(df['is_finetuned'] == True) & (df['loss_type'] == 'crossentropy') & (~df['learningrate']) & (df['embedding_model'] == 'bge-small-en-v1.5') & (df['name'] != 'alibaba_multilingual')].sort_values(by=['name']).reset_index(drop=True)
df_strict_embeddings

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.651429,0.698833,0.702113,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
1,alibaba,0.671429,0.715643,0.718431,True,crossentropy,bge-small-en-v1.5,epochs2,True,False,30
2,electra,0.642857,0.692107,0.695505,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
3,electra,0.524286,0.616512,0.621618,True,crossentropy,bge-small-en-v1.5,epochs2,True,False,30
4,miniLM12,0.651429,0.697357,0.700896,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
5,miniLM12,0.654286,0.698536,0.701527,True,crossentropy,bge-small-en-v1.5,epochs2,True,False,30
6,miniLM6,0.637143,0.688429,0.691478,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
7,miniLM6,0.644286,0.691679,0.694902,True,crossentropy,bge-small-en-v1.5,epochs2,True,False,30
8,tinyBERT,0.562857,0.625786,0.631567,True,crossentropy,bge-small-en-v1.5,epochs2,False,False,30
9,tinyBERT,0.563571,0.624845,0.631527,True,crossentropy,bge-small-en-v1.5,epochs2,True,False,30


In [83]:
df_learning_rate = df[(df['is_finetuned'] == True) & (df['loss_type'] == 'lambda')  & (df['embedding_model'] == 'bge-small-en-v1.5') & (df['name'] == 'alibaba') & (df['k'] == '30')].sort_values(by=['name']).reset_index(drop=True)
df_learning_rate

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.679286,0.720357,0.722709,True,lambda,bge-small-en-v1.5,epochs2,False,False,30
1,alibaba,0.7,0.73981,0.740792,True,lambda,bge-small-en-v1.5,epochs2,False,True,30


In [84]:
df_k = df[(df['is_finetuned'] == True) & (df['loss_type'] == 'lambda')  & (df['embedding_model'] == 'bge-small-en-v1.5') & (df['name'] == 'alibaba') & (df['learningrate'])].sort_values(by=['name']).reset_index(drop=True)
df_k

Unnamed: 0,name,MRR@1,MRR@5,MRR@10,is_finetuned,loss_type,embedding_model,epochs,has_stricter_embedding,learningrate,k
0,alibaba,0.7,0.73981,0.740792,True,lambda,bge-small-en-v1.5,epochs2,False,True,30
1,alibaba,0.712143,0.75656,0.759488,True,lambda,bge-small-en-v1.5,epochs2,False,True,100
2,alibaba,0.724286,0.772524,0.775613,True,lambda,bge-small-en-v1.5,epochs2,False,True,1000
