# Fusion Retrieval with Reciprocal Rank Fusion (RRF)

In [1]:
import sys
sys.path.append('/work/utilis')

from lib_to_use import *
from general_functions import *
from paths import *

  from .autonotebook import tqdm as notebook_tqdm
Torch version:  2.6.0+cu124
Pyterrier version:  0.13.0
  demoji.download_codes()


In [2]:
qrels_train = load_qrels(qrels_train_path)
qrels_val = load_qrels(qrels_val_path)
qrels_test = load_qrels(qrels_test_path)
qrels_train_df = qrels_to_df(qrels_train)
qrels_val_df = qrels_to_df(qrels_val)
qrels_test_df = qrels_to_df(qrels_test)

In [3]:
with open(test_processed_path, "rb") as f:
    df_test = pickle.load(f)

with open(val_processed_path, "rb") as f:
    df_val = pickle.load(f)

with open(train_processed_path, "rb") as f:
    df_train = pickle.load(f)

with open("/work/PIR_data_unzip/PIR_data/answer_retrieval/df_all_processed.pkl", "rb") as f:
    df_all = pickle.load(f)
# Create a PyTerrier index
index_path = "./pyterrier_index"
# indexer = pt.IterDictIndexer(index_path, overwrite=True,meta={'docno': 50, 'text': 10000})

# # Index only the document collection (answers)
# corpus = df_all[['docno', 'text']].to_dict('records')
# # rimuovi i duplicati docno
# index_ref = indexer.index(corpus)

# Load the index previously built (using the IndexFactory)
index_ref = pt.IndexFactory.of(index_path)

Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [4]:
# Carica i risultati dei modelli in DataFrame
# Si assume che ogni DataFrame abbia le colonne: 'qid', 'docno' e 'score'
with open('/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/bm25_optimized_results.pkl', 'rb') as f:
    results_bm25 =  pickle.load(f)

with open('/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/bm25_minilm_results.pkl', 'rb') as f:
    results_bm25_minilm =  pickle.load(f)

with open('/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/bm25_distilbert_qa_results.pkl', 'rb') as f:
    results_bm25_distilbertqa =  pickle.load(f)

with open('/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/bm25_T5_results.pkl', 'rb') as f:
    results_bm25_t5 =  pickle.load(f)

In [5]:
def assign_rank(df):
    """
    Assigns a rank to each document per query based on the 'score' in descending order.
    """
    df = df.copy()
    df['rank'] = df.groupby('qid')['score'].rank(method='min', ascending=False)
    return df

def compute_rrf(df, k=200):
    """
    Calcola il Reciprocal Rank Fusion (RRF) score per documento.
    RRF = 1 / (k + rank)
    """
    df = assign_rank(df)
    df['rrf'] = 1.0 / (k + df['rank'])
    return df[['qid', 'docno', 'rrf']]

# Calcola i punteggi RRF per ogni sistema
bm25_rrf = compute_rrf(results_bm25, k=60)
minilm_rrf = compute_rrf(results_bm25_minilm, k=60)
distilbert_rrf = compute_rrf(results_bm25_distilbertqa, k=60)
t5_rrf = compute_rrf(results_bm25_t5, k=60)

In [6]:
# Unisci i risultati: creiamo un DataFrame con l'unione dei docno per ogni query
all_results = pd.concat([bm25_rrf, minilm_rrf, distilbert_rrf, t5_rrf])

# Per ogni coppia (qid, docno), somma i punteggi RRF provenienti dai vari sistemi
fusion_scores = all_results.groupby(['qid', 'docno'], as_index=False)['rrf'].sum()

# Ordina i risultati finali per ogni query in base al punteggio di fusione (dal più alto al più basso)
fusion_scores = fusion_scores.sort_values(['qid', 'rrf'], ascending=[True, False])

with open ('/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/fusion_results.pkl', 'wb') as f:
    pickle.dump(fusion_scores, f)

# Visualizza i risultati finali
print("Risultati della fusion retrieval:")
fusion_scores

Risultati della fusion retrieval:


Unnamed: 0,qid,docno,rrf
6,academia_143743,academia_12035,0.060640
961,academia_143743,workplace_9502,0.060499
779,academia_143743,scifi_117801,0.059976
7,academia_143743,academia_122342,0.058911
925,academia_143743,workplace_102543,0.058898
...,...,...,...
94932,writers_51676,martialarts_7575,0.000947
95202,writers_51676,scifi_22958,0.000946
94957,writers_51676,movies_17810,0.000945
94527,writers_51676,anime_49100,0.000944


In [7]:
# # Per valutare i risultati Fusion Scores
# Rename the columns of fusion_scores to match PyTerrier's expected run format
fusion_scores_eval = fusion_scores.rename(columns={'query_id': 'qid', 'doc_id': 'docno', 'rrf': 'score'})

# Now evaluate the fused run using PyTerrier's Evaluate function
eval_fusion = pt.Evaluate(fusion_scores_eval, qrels_val_df, metrics=["P_1", "recall_100", "map_cut_100", "ndcg_cut_3"])
print("Fusion Evaluation:", eval_fusion)

Fusion Evaluation: {'P_1': 0.673469387755102, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.7243080618176241, 'ndcg_cut_3': 0.7171997808309056}


The core idea behind RRF is that each document receives a score inversely proportional to its rank in each system. These scores are then summed across systems to obtain a final ranking. This method is robust because it relies solely on the rank positions rather than the raw scores, which might be on different scales.

## Assigning Ranks to Results

The assign_rank function checks whether the input DataFrame already has a "rank" column. If not, it computes the rank for each query group (grouped by "qid") based on the "score" column, ordering in descending order (so that the highest score gets rank 1).

## Computing the RRF Score

The compute_rrf function takes a DataFrame and calculates the RRF score for each document using the formula:
                         RRF= 1/(𝑘+rank).

Here,𝑘 is a constant (typically set to 60) that helps control the influence of the rank. Lower ranks (i.e., better-ranked documents) receive higher RRF scores.

## Processing Results from Multiple Systems

For each retrieval system , we assume we have a corresponding DataFrame (e.g., bm25_df, minilm_df, etc.) that contains at least:
- qid: Query identifier;
- docno: Document identifier;
- score: The retrieval score provided by that system;
The compute_rrf function is applied to each of these DataFrames, producing a new DataFrame that includes the RRF scores.

## Fusion of the RRF Scores

All the RRF DataFrames are concatenated into one DataFrame. This combined DataFrame now contains rows from all systems.
Then, we group the combined results by the query and document identifiers (qid and docno) and sum the RRF scores for each group. This gives a final fusion score for each document per query.
Finally, the results are sorted by query and in descending order of the final fusion score.​

## Why Use This Approach?

- **Robustness**: RRF does not depend on the absolute values of the scores from different systems but rather on the rank positions. This makes it effective when the underlying systems use different scoring scales.
- **Consensus Boost**: Documents that consistently appear at high ranks across multiple systems will accumulate higher scores, thus improving their final rank.
- **Simplicity**: RRF is simple to implement and does not require additional training data.

#  Weighted sum of individual scores

In [8]:
# Rename the 'score' columns in each DataFrame to avoid conflicts
bm25_df = results_bm25.rename(columns={'score': 'bm25_score'})
minilm_df = results_bm25_minilm.rename(columns={'score': 'minilm_score'})
distilbert_df = results_bm25_distilbertqa.rename(columns={'score': 'distilbert_score'})
t5_df = results_bm25_t5.rename(columns={'score': 't5_score'})

In [9]:
# Merge the DataFrames on 'qid' and 'docno'. Use an outer join to capture all documents.
fusion_df = pd.merge(bm25_df, minilm_df, on=['qid', 'docno'], how='outer')
fusion_df = pd.merge(fusion_df, distilbert_df, on=['qid', 'docno'], how='outer')
fusion_df = pd.merge(fusion_df, t5_df, on=['qid', 'docno'], how='outer')

# Fill missing scores with 0
score_cols = ['bm25_score', 'minilm_score', 'distilbert_score', 't5_score']
fusion_df[score_cols] = fusion_df[score_cols].fillna(0)

  fusion_df = pd.merge(fusion_df, t5_df, on=['qid', 'docno'], how='outer')


In [10]:
from sklearn.preprocessing import MinMaxScaler

score_cols = ['bm25_score', 'minilm_score', 'distilbert_score', 't5_score']
scaler = MinMaxScaler()

# Fit and transform the score columns directly.
fusion_df[score_cols] = scaler.fit_transform(fusion_df[score_cols])
fusion_df[score_cols]

Unnamed: 0,bm25_score,minilm_score,distilbert_score,t5_score
0,0.333146,0.506547,0.873602,0.337811
1,0.326179,0.660269,0.879819,0.330995
2,0.298810,0.048435,0.895631,0.304219
3,0.290072,0.447950,0.000000,0.295671
4,0.258512,0.654606,0.901778,0.264795
...,...,...,...,...
105427,0.049300,0.048435,0.000000,0.000000
105428,0.049295,0.048435,0.000000,0.000000
105429,0.049279,0.048435,0.000000,0.000000
105430,0.049276,0.048435,0.000000,0.000000


In [11]:
# Define weights based on your evaluation:
w_bm25 = 0.2
w_minilm = 0.15
w_distilbert = 0.15
w_t5 = 0.5

# Compute the final fused score as a weighted sum of individual scores.
fusion_df['final_score'] = (
    w_bm25 * fusion_df['bm25_score'] +
    w_minilm * fusion_df['minilm_score'] +
    w_distilbert * fusion_df['distilbert_score'] +
    w_t5 * fusion_df['t5_score']
)

In [12]:
# For each query, sort the documents by final_score in descending order.
fusion_df = fusion_df.sort_values(['qid', 'final_score'], ascending=[True, False])

# For evaluation with PyTerrier, rename 'final_score' to 'score'
fusion_df_eval = fusion_df.rename(columns={'final_score': 'score'})

In [13]:
print("Final fused retrieval results:")
fusion_df_eval[['qid', 'docno', 'score']]


Final fused retrieval results:


Unnamed: 0,qid,docno,score
1,academia_143743,academia_12035,0.461746
0,academia_143743,academia_28111,0.442557
8,academia_143743,workplace_9502,0.433538
12,academia_143743,academia_122342,0.420733
5,academia_143743,workplace_102543,0.418283
...,...,...,...
84427,writers_51676,martialarts_7575,0.019931
84428,writers_51676,scifi_22958,0.019926
84429,writers_51676,movies_17810,0.019899
84430,writers_51676,anime_49100,0.019887


In [14]:
# # Per valutare i risultati Fusion Scores
# Rename the columns of fusion_scores to match PyTerrier's expected run format
# fusion_scores_eval = fusion_scores.rename(columns={'query_id': 'qid', 'doc_id': 'docno', 'rrf': 'score'})

fusion_df_eval = fusion_df.rename(columns={'final_score': 'score'})

# Now evaluate the fused run using PyTerrier's Evaluate function
eval_fusion = pt.Evaluate(fusion_df_eval, qrels_val_df, metrics=["P_1", "recall_100", "map_cut_100", "ndcg_cut_3"])
print("Weighted Fusion Evaluation:", eval_fusion)

Weighted Fusion Evaluation: {'P_1': 0.6938775510204082, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.763168425892842, 'ndcg_cut_3': 0.7735642605685159}


- w_bm25 = 0.5, w_minilm = 0.15, w_distilbert = 0.15, w_t5 = 0.2
Weighted Fusion Evaluation: {'P_1': 0.6938775510204082, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.7617532355156883, 'ndcg_cut_3': 0.77599426559767}
- w_bm25 = 0.4, w_minilm = 0.15, w_distilbert = 0.15, w_t5 = 0.3
Weighted Fusion Evaluation: {'P_1': 0.7040816326530612, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.769784479719235, 'ndcg_cut_3': 0.7773302834912562}
- w_bm25 = 0.2, w_minilm = 0.15, w_distilbert = 0.15, w_t5 = 0.5
Weighted Fusion Evaluation: {'P_1': 0.7142857142857143, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.7750774855809884, 'ndcg_cut_3': 0.7810963064139965}


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=144ec01f-394f-474f-b507-b786ab13b472' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>