In [11]:
import torch
import pyterrier as pt
from pyterrier.measures import nDCG, RR, MAP
from fast_forward import OnDiskIndex, Mode
from pathlib import Path
import pandas as pd 
if not pt.started():
    pt.init(tqdm="notebook")

In [14]:
dataset = pt.get_dataset("irds:beir/arguana")
bm25 = pt.BatchRetrieve("../data/beir_arguana", wmodel="BM25")



In [6]:
reduced_topics = pd.read_csv("./reduced_queries/reduced_queries_arguana.csv", usecols=["qid", "query"])
topics = dataset.get_topics()

In [15]:
pt.Experiment(
    [bm25],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 1000],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@1000
0,BR(BM25),0.240754,0.366152,0.251991


In [16]:
pt.Experiment(
    [bm25],
    reduced_topics,
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 1000],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@1000
0,BR(BM25),0.220898,0.333117,0.23314


In [17]:
from fast_forward.encoder import TransformerEncoder
import torch

class SnowFlakeQueryEncoder(TransformerEncoder):
  def __call__(self, texts):
    query_prefix = 'Represent this sentence for searching relevant passages: '
    queries_with_prefix = ["{}{}".format(query_prefix, i) for i in texts]
    query_tokens = self.tokenizer(queries_with_prefix, padding=True, truncation=True, return_tensors='pt', max_length=512)

    query_tokens.to(self.device)
    self.model.eval()

    #document_tokens =  self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        query_embeddings = self.model(**query_tokens)[0][:, 0]
        #doument_embeddings = self.model(**document_tokens)[0][:, 0]

    # normalize embeddings
    query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
    #doument_embeddings = torch.nn.functional.normalize(doument_embeddings, p=2, dim=1)
    return query_embeddings.detach().cpu().numpy()
  
q_encoder_artic = SnowFlakeQueryEncoder('Snowflake/snowflake-arctic-embed-m')

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

ff_index_artic = OnDiskIndex.load(
    Path("../datam/ffindex_arguana_snowflake_arctic_embed_m.h5"), query_encoder=q_encoder_artic, mode=Mode.MAXP
)

100%|██████████| 8674/8674 [00:00<00:00, 1743238.76it/s]


In [25]:
ff_index_artic = ff_index_artic.to_memory()

In [26]:
from fast_forward.util.pyterrier import FFScore

ff_score_artic = FFScore(ff_index_artic)

In [27]:
def normalize_column(df, column_name):
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    df[column_name] = (df[column_name] - min_val) / (max_val-min_val)


In [28]:
pl_artic = ~bm25 % 1000 >> ff_score_artic
d_artic_red = pl_artic.transform(reduced_topics)

normalize_column(d_artic_red, "score")
normalize_column(d_artic_red, "score_0")

In [30]:
d_artic = pl_artic.transform(dataset.get_topics())
normalize_column(d_artic, "score")
normalize_column(d_artic, "score_0")

Original Queries

In [46]:
from fast_forward.util.pyterrier import FFInterpolate
ff_int = FFInterpolate(alpha=0.8)

pt.Experiment(
    [d_artic >> ff_int],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
).to_csv(f"og_arguana_{ff_int.alpha}.csv")


  [d_artic >> ff_int],


Reduced Queries

In [52]:
ff_int = FFInterpolate(alpha=0.0)

pt.Experiment(
    [d_artic_red >> ff_int],
    reduced_topics,
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25 >> FF"],
).to_csv(f"reduced_arguana_{ff_int.alpha}.csv")

  [d_artic_red >> ff_int],
