In [1]:
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
import ir_datasets
import pandas as pd
from fast_forward import Ranking
from pathlib import Path

Dense Retrieval

In [2]:
# Load the query sample, assigning the three column names explicitly.
# NOTE(review): because `names=` is passed, pandas treats the first line of the
# file as data; the retrieval loops in this notebook discard the first row,
# presumably because it is a header line — confirm against queries.csv.
sample = pd.read_csv(
    'queries.csv',
    names=['none', 'q_id', 'text'],
)

In [3]:
# Query encoder built from the 'castorini/tct_colbert-msmarco' model name.
encoder = TctColBertQueryEncoder(
    'castorini/tct_colbert-msmarco',
)

# Dense searcher over the FiQA index directory, encoding queries with `encoder`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
dsearcher = FaissSearcher(
    '/home/lucia/research_project/in4325-information-retrieval/intro-pyterrier/query-latency/dense_index_fiqa',
    encoder,
)

Generate dense retrieval results (top 100 hits per query) and collect them in a DataFrame

In [5]:
%%timeit -r 7 -n 100
# Send every query row except the first through the dense searcher and collect
# one (query id, document id, score) entry per returned hit.
row_iter = sample.iterrows()
next(row_iter)  # discard the first row before searching
q_col, doc_col, score_col = [], [], []
for _idx, row in row_iter:
    for hit in dsearcher.search(row.text, k=100):
        q_col.append(row.q_id)
        doc_col.append(hit.docid)
        score_col.append(hit.score)
df = pd.DataFrame({
    'q_id': q_col,
    'id': doc_col,
    'score': score_col,
})

10.9 s ± 198 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


Hybrid Search

In [6]:
def deal_with_missing_scores_and_interpolate(dense_ranking, sparse_ranking, method, alpha, normalize):
    """Merge a dense and a sparse ranking and interpolate their scores.

    Both rankings are joined on ``("q_id", "id")``; their ``score`` columns
    become ``score_dense`` and ``score_sparse`` in the result. Pairs that
    appear in only one ranking are handled according to ``method``.

    Args:
        dense_ranking: DataFrame with ``q_id``, ``id`` and ``score`` columns.
        sparse_ranking: DataFrame with ``q_id``, ``id`` and ``score`` columns.
        method: How to treat a score missing from one side:
            ``'zero'``    - outer join, missing scores become 0;
            ``'average'`` - outer join, missing scores become the mean of the
                            present values in that merged score column;
            ``'median'``  - outer join, same with the median;
            ``'drop'``    - inner join, unmatched pairs are discarded.
        alpha: Weight of the sparse score; the dense score gets ``1 - alpha``.
        normalize: If truthy, min-max scale both score columns (via
            ``normalize_column``) before interpolating.

    Returns:
        The merged DataFrame with an added ``score`` column holding
        ``alpha * score_sparse + (1 - alpha) * score_dense``. The input
        frames are not modified.

    Raises:
        ValueError: If ``method`` is not one of the four supported values.
    """
    supported = ('zero', 'average', 'drop', 'median')
    if method not in supported:
        # Previously an unknown method left the result as None and failed
        # further down with an unrelated-looking TypeError.
        raise ValueError(
            f"unknown method {method!r}; expected one of {supported}"
        )

    # 'drop' keeps only pairs scored by both rankers; every other strategy
    # keeps the union and fills the gaps afterwards.
    how = 'inner' if method == 'drop' else 'outer'
    full = dense_ranking.merge(
        sparse_ranking, on=["q_id", "id"], how=how, suffixes=['_dense', "_sparse"]
    )

    for column in ('score_dense', 'score_sparse'):
        if method == 'zero':
            full[column] = full[column].fillna(0)
        elif method == 'average':
            full[column] = full[column].fillna(full[column].mean())
        elif method == 'median':
            full[column] = full[column].fillna(full[column].median())

    if normalize:
        # normalize_column lives in another notebook cell and must have been
        # run before this function is called with normalize=True.
        normalize_column(full, 'score_dense')
        normalize_column(full, 'score_sparse')

    full["score"] = alpha * full['score_sparse'] + (1 - alpha) * full['score_dense']

    return full

In [7]:
def generate_df(searcher,sample):
    """Run every query in ``sample`` (except the first row) through ``searcher``.

    Args:
        searcher: Object exposing ``search(text, k=...)`` that returns an
            iterable of hits, each with ``docid`` and ``score`` attributes.
        sample: DataFrame with ``q_id`` and ``text`` columns. The first row is
            skipped (presumably a header line read in as data — confirm
            against how ``sample`` is loaded).

    Returns:
        DataFrame with columns ``q_id``, ``id`` and ``score``: one row per
        returned hit, up to 100 hits requested per query. An empty ``sample``
        yields an empty frame with those columns.
    """
    rows = sample.iterrows()
    # Skip the first row; the default keeps an empty sample from raising
    # StopIteration out of this function.
    next(rows, None)

    query_ids, doc_ids, scores = [], [], []
    for _idx, row in rows:
        for hit in searcher.search(row.text, k=100):
            query_ids.append(row.q_id)
            doc_ids.append(hit.docid)
            scores.append(hit.score)

    return pd.DataFrame({
        'q_id': query_ids,
        'id': doc_ids,
        'score': scores
    })

In [None]:
# Dense ranking: one row per (query, retrieved document) from the dense searcher.
dfd = generate_df(dsearcher, sample)

# Second ranking, loaded from a run file (a BM25 run, judging by the file name).
# NOTE(review): `_df` is a private attribute by naming convention, and the path
# is absolute and machine-specific.
dfs = Ranking.from_file(
    Path('/home/lucia/research_project/in4325-information-retrieval/intro-pyterrier/query-latency/trec-run-bm25-fiqa-latency.txt')
)._df

In [9]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with 0, raw (unnormalized) scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='zero', alpha=0.1, normalize=False
)

15.2 ms ± 660 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit -r 7 -n 100
# Interpolation latency: unmatched pairs dropped (inner join), raw scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='drop', alpha=0.1, normalize=False
)

4.91 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with the column median, raw scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='median', alpha=0.1, normalize=False
)

15.5 ms ± 348 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with the column mean, raw scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='average', alpha=0.1, normalize=False
)

15.1 ms ± 225 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
def normalize_column(df, column_name):
    """Min-max scale ``df[column_name]`` in place.

    Each value ``v`` is replaced by ``(v - min) / (max - min)``, so the column
    minimum maps to 0 and the maximum to 1.

    Args:
        df: DataFrame to modify; the column is overwritten in place.
        column_name: Name of the numeric column to rescale.

    Returns:
        None. The result is written back into ``df``.
    """
    column = df[column_name]
    min_val = column.min()
    span = column.max() - min_val
    if span == 0:
        # Constant column: dividing would give 0/0 = NaN for every row.
        # Map all values to 0.0 instead (missing values stay missing).
        df[column_name] = (column - min_val).astype(float)
    else:
        df[column_name] = (column - min_val) / span

In [14]:
%%timeit -r 7 -n 100
# Interpolation latency: unmatched pairs dropped (inner join), normalized scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='drop', alpha=0.1, normalize=True
)

5.39 ms ± 237 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with 0, normalized scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='zero', alpha=0.1, normalize=True
)

15.9 ms ± 725 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with the column median, normalized scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='median', alpha=0.1, normalize=True
)

16.4 ms ± 278 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%%timeit -r 7 -n 100
# Interpolation latency: missing scores filled with the column mean, normalized scores.
deal_with_missing_scores_and_interpolate(
    dense_ranking=dfd, sparse_ranking=dfs, method='average', alpha=0.1, normalize=True
)

15.5 ms ± 252 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
