In [1]:
#Install dependencies
%pip install pandas numpy pyserini fast-forward-indexes==0.2.0  ir_measures ir_datasets faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyserini
  Downloading pyserini-0.35.0-py3-none-any.whl (168.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.6/168.6 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting fast-forward-indexes==0.2.0
  Downloading fast_forward_indexes-0.2.0-py3-none-any.whl (25 kB)
Collecting ir_measures
  Using cached ir_measures-0.3.3-py3-none-any.whl
Coll

Dense retrieval with TCT-ColBERT

In [2]:
import pandas as pd
import numpy as np
import ir_datasets
import ir_measures
from ir_measures import *
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import json
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from fast_forward import Ranking
from pathlib import Path

In [None]:
# Fetch the FiQA archive from the BEIR mirror and unzip it
# ('fiqa' is the second argument handed to download_and_unzip).
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip"
data_path = util.download_and_unzip(url, 'fiqa')

# Load the corpus, queries and relevance judgments of the test split.
loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

In [None]:
#Convert dataset to Pyserini expected format
# Relative paths: the download cell unzips into the 'fiqa' folder and the
# encode command reads fiqa/fiqa/corpus-converted.jsonl, so a leading "/"
# would point at the filesystem root instead of the working directory.
input_file = 'fiqa/fiqa/corpus.jsonl'
output_file = 'fiqa/fiqa/corpus-converted.jsonl'

# Explicit UTF-8 so the conversion does not depend on the platform's default encoding.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        # The text gets a trailing "\n" because the encode step is run with
        # "\n" as its field delimiter.
        contents_with_newline = entry["text"] + "\n"
        new_entry = {
            "id": entry["_id"],
            "contents": contents_with_newline
        }
        outfile.write(json.dumps(new_entry) + '\n')

In [None]:
#Encode the converted corpus in the terminal using TCT-ColBERT encoder
python3 -m pyserini.encode 
  input   --corpus fiqa/fiqa/corpus-converted.jsonl       
          --fields text      
          --delimiter "\n" 
          --shard-id 0
          --shard-num 1 
  output  --embeddings fiqa/embd
          --to-faiss 
  encoder --encoder castorini/tct_colbert-msmarco 
          --fields text 
          --batch 32 
          --fp16

In [None]:
#Create dense index using encoded corpus
python3 -m pyserini.index.faiss --input fiqa/embd --output fiqa/dense_index

In [3]:
def generate_df(searcher, dataset):
    """Run every query of ``dataset`` through ``searcher`` and tabulate the hits.

    Args:
        searcher: object exposing ``search(text, k=...)``; each returned hit
            must carry ``docid`` and ``score`` attributes.
        dataset: object exposing ``queries_iter()``. Each query is indexed
            positionally: element 0 is stored as the query id, element 1 is
            the text handed to ``searcher.search``.

    Returns:
        pandas.DataFrame with one row per hit and the columns
        ``q_id``, ``id`` and ``score``.
    """
    columns = {'q_id': [], 'id': [], 'score': []}
    for query in dataset.queries_iter():
        # Up to 1000 hits are requested for each query.
        for hit in searcher.search(query[1], k=1000):
            columns['q_id'].append(query[0])
            columns['id'].append(hit.docid)
            columns['score'].append(hit.score)
    return pd.DataFrame(columns)

In [None]:
encoder = TctColBertQueryEncoder('castorini/tct_colbert-msmarco')
#Create searcher for dense retrieval
dsearcher = FaissSearcher('fiqa/dense_index',encoder)
# ir_datasets.load() takes the plain dataset id; the "irds:" prefix belongs to
# PyTerrier's dataset naming and is not part of the ir_datasets id.
devset_name = 'beir/fiqa/dev'
devset = ir_datasets.load(devset_name)
# One row per (query, retrieved document) for the dev queries.
df = generate_df(dsearcher,devset)
r = Ranking(df,name = 'TCT')
#Save dense retrieval Results
r.save(Path('trec-run-tct-fiqa-dev.txt'))

In [None]:
#Generate test set results for dense retrieval
# ir_datasets.load() takes the plain dataset id; the "irds:" prefix belongs to
# PyTerrier's dataset naming and is not part of the ir_datasets id.
testset_name = 'beir/fiqa/test'
testset = ir_datasets.load(testset_name)
# Reuses the dense searcher built for the dev run.
df = generate_df(dsearcher,testset)
r = Ranking(df,name = 'TCT')
r.save(Path('trec-run-tct-fiqa-test.txt'))

Hybrid Retrieval

In [None]:
# Hyperparameter tuning on the raw (un-normalized) scores:
# interpolate the sparse and dense dev runs once per alpha and store every result.
alpha_range = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9]
sr = Ranking.from_file(Path('trec-run-bm25_fiqa-dev.txt'))
dr = Ranking.from_file(Path('trec-run-tct-fiqa-dev.txt'))
for alpha in alpha_range:
    hybrid = sr.interpolate(dr, alpha)
    hybrid.save(Path(f'trec-run-bm25-tct-fiqa-dev-int-{alpha}.txt'))

In [None]:
path = 'trec-run-bm25-tct-fiqa-dev-int-'
# Measure that decides which alpha wins. The very same object is placed in the
# measure list and used as the lookup key below, so the choice no longer
# depends on the position of a column in the results table.
selection_measure = AP @ 10
eval_measures = [RR @ 10, nDCG @ 10, MAP @ 100, R@ 100, selection_measure]
# Materialise the dev judgments once; they are identical for every alpha.
qrels = list(devset.qrels_iter())
results = []
for alpha in alpha_range:
    full_path = path+str(alpha)+'.txt'
    run = ir_measures.read_trec_run(full_path)
    res = ir_measures.calc_aggregate(eval_measures, qrels, run)
    results.append(res)
df = pd.DataFrame(results, index=[f'{alpha}' for alpha in alpha_range])
df.to_csv('hybrid-original-fiqa-scores-alpha-tuning.csv')
# Pick the first alpha with the highest value of the selection measure.
selection_scores = [res[selection_measure] for res in results]
best_position = max(range(len(selection_scores)), key=selection_scores.__getitem__)
best_alpha_idx = df.index[best_position]
best_original = float(best_alpha_idx)

In [21]:
def normalize_column(df, column_name):
    """Min-max scale ``df[column_name]`` into the range [0, 1], in place.

    The column is overwritten on ``df`` itself and nothing is returned.

    Args:
        df: pandas.DataFrame that holds the column; it is modified.
        column_name: label of the numeric column to rescale.

    A column whose minimum equals its maximum is mapped to zeros, because the
    usual ``(x - min) / (max - min)`` would divide by zero and turn every
    value into NaN.
    """
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: subtracting the minimum yields zeros and leaves
        # any missing entries untouched.
        df[column_name] = df[column_name] - min_val
        return
    df[column_name] = (df[column_name] - min_val) / value_range

In [22]:
# Hyperparameter tuning for hybrid retrieval with normalized scores:
# rescale the score column of both dev runs in place with normalize_column,
# then interpolate once per alpha and store every result.
ddf, sdf = dr._df, sr._df
for frame in (ddf, sdf):
    normalize_column(frame, 'score')
for alpha in alpha_range:
    hybrid = sr.interpolate(dr, alpha)
    hybrid.save(Path(f'trec-run-bm25-tct-fiqa-dev-int-{alpha}-normalized.txt'))

In [None]:
path = 'trec-run-bm25-tct-fiqa-dev-int-'
# Measure that decides which alpha wins. The very same object is placed in the
# measure list and used as the lookup key below, so the choice no longer
# depends on the position of a column in the results table.
selection_measure = AP @ 10
eval_measures = [RR @ 10, nDCG @ 10, MAP @ 100, R@ 100, selection_measure]
# Materialise the dev judgments once; they are identical for every alpha.
qrels = list(devset.qrels_iter())
results = []
for alpha in alpha_range:
    full_path = path+str(alpha)+'-normalized.txt'
    run = ir_measures.read_trec_run(full_path)
    res = ir_measures.calc_aggregate(eval_measures, qrels, run)
    results.append(res)
df = pd.DataFrame(results, index=[f'{alpha}' for alpha in alpha_range])
df.to_csv('hybrid-original-fiqa-scores-alpha-tuning-normalized.csv')
# Pick the first alpha with the highest value of the selection measure.
selection_scores = [res[selection_measure] for res in results]
best_position = max(range(len(selection_scores)), key=selection_scores.__getitem__)
best_normalized = float(df.index[best_position])


In [24]:
#Manipulation of dataframes to implement missing score techniques
def deal_with_missing_scores_and_interpolate(dense_path, sparse_path, method, alpha, normalize):
    """Merge a dense and a sparse run, resolve missing scores and interpolate.

    Args:
        dense_path: path of the dense run file, loaded with ``Ranking.from_file``.
        sparse_path: path of the sparse run file, loaded the same way.
        method: how (query, document) pairs found in only one run are handled:
            ``'zero'``    - keep all pairs, fill the missing score with 0;
            ``'average'`` - keep all pairs, fill with the mean of that score
                            column over the whole merged table;
            ``'median'``  - keep all pairs, fill with the median of that
                            score column over the whole merged table;
            ``'drop'``    - keep only pairs present in both runs.
        alpha: weight of the sparse score; the dense score gets ``1 - alpha``.
        normalize: when truthy, both score columns are passed through
            ``normalize_column`` after the gaps have been filled.

    Returns:
        A ``Ranking`` named ``'hybrid'`` built from the merged table.

    Raises:
        ValueError: if ``method`` is not one of the four strategies above.
    """
    # Per strategy: the join type and a function giving the fill value of a column.
    strategies = {
        'zero': ('outer', lambda column: 0),
        'average': ('outer', lambda column: column.mean()),
        'median': ('outer', lambda column: column.median()),
        'drop': ('inner', None),
    }
    if method not in strategies:
        # Fail with a clear message instead of a TypeError on a None table later on.
        raise ValueError(
            f"unknown method {method!r}; expected one of {sorted(strategies)}"
        )
    join_type, fill_value = strategies[method]

    dense_ranking = Ranking.from_file(Path(dense_path))
    sparse_ranking = Ranking.from_file(Path(sparse_path))
    full = dense_ranking._df.merge(
        sparse_ranking._df, on=["q_id", "id"], how=join_type, suffixes=['_dense', "_sparse"]
    )
    if fill_value is not None:
        for column_name in ('score_dense', 'score_sparse'):
            # The fill value is computed from the column before any gap is filled.
            full[column_name] = full[column_name].fillna(fill_value(full[column_name]))

    if normalize:
        normalize_column(full,'score_dense')
        normalize_column(full,'score_sparse')
    full["score"] = alpha * full['score_sparse'] + (1 - alpha) * full['score_dense']
    result = Ranking(
            full,
            name='hybrid',
            dtype=np.float32,
            copy=False,
            is_sorted=False,
        )
    return result

In [None]:
# Build one hybrid test run per missing-score strategy, using the alpha that
# was selected on the un-normalized dev runs and leaving scores un-normalized.
missing_scores = ['average','median','zero','drop']
dense_test = 'trec-run-tct-fiqa-test.txt'
sparse_test = 'trec-run-bm25_fiqa-test-0.1.txt'
for score in missing_scores:
    r = deal_with_missing_scores_and_interpolate(
        dense_test, sparse_test, method=score, alpha=best_original, normalize=False
    )
    r.save(Path(f'trec-run-bm25-tct-fiqa-test-int-{best_original}-{score}.txt'))

In [26]:
path = 'trec-run-bm25-tct-fiqa-test-int-'+str(best_original)+'-'
results = []
for score in missing_scores:
    full_path = path+score+'.txt'
    run = ir_measures.read_trec_run(full_path)
    qrels = testset.qrels_iter()
    # Default relevance threshold, matching the dev-set tuning cells and the
    # normalized test evaluation in this notebook. A rel=2 threshold only
    # counts judgments graded 2 or higher.
    # NOTE(review): the BEIR FiQA judgments appear to be binary (grade 1), in
    # which case rel=2 would score every run as 0 - confirm against the qrels.
    res = ir_measures.calc_aggregate([RR @ 10, nDCG @ 10, MAP @ 10, R @ 100, AP @ 10], qrels, run)
    results.append(res)
df = pd.DataFrame(results, index=[f'Score {score}' for score in missing_scores])
df.to_csv('hybrid-original-fiqa-missing-score '+str(best_original)+'-test-results.csv')

In [None]:
# Same sweep over the missing-score strategies, this time with normalized
# scores and the alpha that was selected on the normalized dev runs.
for score in missing_scores:
    r = deal_with_missing_scores_and_interpolate(
        dense_test, sparse_test, method=score, alpha=best_normalized, normalize=True
    )
    r.save(Path(f'trec-run-bm25-tct-fiqa-test-int-{best_normalized}-norm-{score}.txt'))

In [28]:
# Score every normalized hybrid test run and collect one result row per
# missing-score strategy.
path = f'trec-run-bm25-tct-fiqa-test-int-{best_normalized}-norm-'
results = []
for score in missing_scores:
    full_path = f'{path}{score}.txt'
    run = ir_measures.read_trec_run(full_path)
    qrels = testset.qrels_iter()
    res = ir_measures.calc_aggregate([RR @ 10, nDCG @ 10, R @ 100], qrels, run)
    results.append(res)
row_labels = ['Score ' + score for score in missing_scores]
df = pd.DataFrame(results, index=row_labels)
df.to_csv(f'hybrid-fiqa-norm-missing-score {best_normalized}-test-results.csv')