In [1]:
import torch
import pyterrier as pt
from pyterrier.measures import nDCG, RR, MAP


# Initialise PyTerrier only when it has not been started yet, so re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Dataset handle; later cells take their topics and qrels from it.
dataset = pt.get_dataset("irds:beir/trec-covid")
# BM25 retriever over the index stored at ../data/trec-covid
# (presumably a Terrier index built beforehand; the indexing step is not shown here).
bm25 = pt.BatchRetrieve("../data/trec-covid", wmodel="BM25")

In [22]:
import pandas as pd
# Read only the CSV columns at positions 1 and 2; the following cells access them as "qid" and "query".
reduced_topics = pd.read_csv("./reduced_queries/reduced_trec_covid.csv", usecols=[1,2])
# Cast qid to str (presumably so it matches the qid type used by the qrels; TODO confirm).
reduced_topics["qid"] = reduced_topics["qid"].astype(str)


In [23]:
def remove_non_alphanumeric(text):
    """Return `text` with every character dropped except alphanumerics and spaces.

    "Alphanumeric" is decided by ``str.isalnum``. The plain space ``" "`` is the
    only whitespace kept; tabs, newlines and punctuation are removed.
    """
    kept_chars = [c for c in text if c == " " or c.isalnum()]
    return "".join(kept_chars)

# Strip everything except alphanumerics and spaces from each query, then lower-case it.
reduced_topics['query'] = (
    reduced_topics['query']
    .apply(remove_non_alphanumeric)
    .str.lower()
)

reduced_topics

Unnamed: 0,qid,query
0,1,origin of covid 19
1,2,how coronaviruses respond to weather changes
2,3,do sars cov 2 infected people develop immunity...
3,4,causes of death from covid 19
4,5,drugs active against sars cov or sars cov 2 in...
5,6,rapid testing for covid 19
6,7,serological tests for coronavirus antibodies
7,8,lack of testing availability and underreportin...
8,9,covid 19 in canada
9,10,social distancing and its impact on slowing do...


In [3]:
# Load the "narrative" variant of the dataset's topics.
# NOTE(review): `topics` is not referenced by the later cells shown in this notebook;
# they call dataset.get_topics(...) directly.
topics = dataset.get_topics(variant="narrative") # Change to 'text' for the original topics and 'query' for keyword queries

In [6]:
from pyterrier.measures import nDCG, RR, MAP


# Baseline: BM25 alone, evaluated on the "narrative" topic variant.
pt.Experiment(
    retr_systems=[bm25],
    topics=dataset.get_topics(variant="narrative"),
    qrels=dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 1000],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@1000
0,BR(BM25),0.766,0.518433,0.142459


In [7]:
from pyterrier.measures import nDCG, RR, MAP

# Baseline: BM25 alone, evaluated on the reduced (cleaned, lower-cased) queries.
pt.Experiment(
    retr_systems=[bm25],
    topics=reduced_topics,
    qrels=dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 1000],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@1000
0,BR(BM25),0.654024,0.414171,0.116387


## Arctic-Embed

In [8]:
from fast_forward.encoder import TransformerEncoder
import torch

class SnowFlakeQueryEncoder(TransformerEncoder):
    """Query encoder that prepends a fixed retrieval prefix before embedding.

    Uses ``self.tokenizer``, ``self.model`` and ``self.device``, which are
    expected to be provided by ``TransformerEncoder`` (defined outside this
    notebook).
    """

    def __call__(self, texts):
        """Encode `texts` as L2-normalised query vectors.

        Args:
            texts: Iterable of query strings.

        Returns:
            A NumPy array with one row per input text.
        """
        prefix = 'Represent this sentence for searching relevant passages: '
        prefixed = [f"{prefix}{text}" for text in texts]
        encoded = self.tokenizer(
            prefixed,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=512,
        )

        encoded.to(self.device)
        self.model.eval()

        with torch.no_grad():
            # First model output, position 0 of every sequence (presumably the
            # [CLS] token of the last hidden state; confirm for this model).
            vectors = self.model(**encoded)[0][:, 0]

        # Scale every row to unit L2 norm.
        unit_vectors = torch.nn.functional.normalize(vectors, p=2, dim=1)
        return unit_vectors.detach().cpu().numpy()
  
# Build the query encoder from the Snowflake/snowflake-arctic-embed-m checkpoint.
q_encoder_artic = SnowFlakeQueryEncoder('Snowflake/snowflake-arctic-embed-m')

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

# Load the pre-built Fast-Forward index from disk and attach the query encoder to it.
# NOTE(review): this path uses "../datam" while the BM25 index is read from "../data";
# confirm the directory name is intended.
ff_index_artic = OnDiskIndex.load(
    Path("../datam/ffindex_trec-covid_snowflake.h5"), query_encoder=q_encoder_artic, mode=Mode.MAXP
)

100%|██████████| 171332/171332 [00:00<00:00, 622878.89it/s]


In [10]:
# Rebind the name to whatever to_memory() returns (presumably an in-memory copy
# of the on-disk index; confirm in the fast_forward docs).
# NOTE(review): because the same name is reused, re-running this cell calls
# to_memory() on the already converted object.
ff_index_artic = ff_index_artic.to_memory()

In [11]:
from fast_forward.util.pyterrier import FFScore

# Wrap the index in FFScore so it can be chained after BM25 with `>>`.
ff_score_artic = FFScore(ff_index_artic)

In [12]:
def normalize_column(df, column_name):
    """Min-max scale ``df[column_name]`` to the range [0, 1], in place.

    The minimum and maximum are taken over the whole column (all rows of
    ``df``), not per query.

    Args:
        df: DataFrame to modify; the column is overwritten.
        column_name: Name of the numeric column to rescale.

    Returns:
        None. ``df`` is mutated.
    """
    min_val = df[column_name].min()
    value_range = df[column_name].max() - min_val
    if value_range == 0:
        # Constant column: dividing by the zero range would turn every value
        # into NaN. Subtracting the minimum maps the values to 0.0 instead and
        # leaves any existing NaN entries untouched.
        df[column_name] = (df[column_name] - min_val).astype(float)
        return
    df[column_name] = (df[column_name] - min_val) / value_range


In [24]:
# Pipeline: cut BM25 off at rank 1000, then score those results with the Arctic-Embed index.
pl_artic = (bm25 % 1000) >> ff_score_artic
d_artic_red = pl_artic.transform(reduced_topics)

# Min-max scale both score columns of the result frame in place.
for score_column in ("score", "score_0"):
    normalize_column(d_artic_red, score_column)

In [19]:
# Run the same pipeline on the "query" topic variant and rescale both score columns in place.
d_artic = pl_artic.transform(dataset.get_topics(variant="query"))
for score_column in ("score", "score_0"):
    normalize_column(d_artic, score_column)

Original Queries

In [20]:
from fast_forward.util.pyterrier import FFInterpolate

# Evaluate the pre-computed, re-scored results for the "query" topic variant.
# The result frame is wrapped with pt.Transformer.from_df() explicitly: composing
# a bare DataFrame with `>>` relies on PyTerrier's implicit DataFrame coercion,
# which is deprecated (presumably what the warning printed under this cell refers to).
# NOTE(review): the BM25 baseline cells report MAP @ 1000 while this one uses
# MAP @ 100, so the AP columns are not directly comparable; confirm the cutoff.
pt.Experiment(
    [pt.Transformer.from_df(d_artic) >> FFInterpolate(alpha=0)], # Change alpha value
    dataset.get_topics(variant="query"),
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
)


  [d_artic >> FFInterpolate(alpha=0)],


Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25 >> FF,0.8925,0.65771,0.101486


Reduced Queries

In [25]:
# Evaluate the pre-computed, re-scored results for the reduced queries.
# The result frame is wrapped with pt.Transformer.from_df() explicitly: composing
# a bare DataFrame with `>>` relies on PyTerrier's implicit DataFrame coercion,
# which is deprecated (presumably what the warning printed under this cell refers to).
# NOTE(review): the BM25 baseline cells report MAP @ 1000 while this one uses
# MAP @ 100, so the AP columns are not directly comparable; confirm the cutoff.
pt.Experiment(
    [pt.Transformer.from_df(d_artic_red) >> FFInterpolate(alpha=0)], # Change alpha value
    reduced_topics,
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
)

  [d_artic_red >> FFInterpolate(alpha=0)],


Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,BM25 >> FF,0.97,0.79011,0.11827
