In [1]:
import torch
import pyterrier as pt
from fast_forward.util.pyterrier import FFInterpolate, FFScore
from pyterrier.measures import RR, nDCG, MAP
from fast_forward.encoder import TransformerEncoder
from fast_forward import OnDiskIndex, Mode
from pathlib import Path
import pandas as pd

if not pt.started():
    pt.init(tqdm="notebook")


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
dataset = pt.get_dataset("irds:beir/hotpotqa/test")
devset = pt.get_dataset("irds:beir/hotpotqa/dev")

In [4]:
bm25 = pt.BatchRetrieve("../data/mount/hotpotqa", wmodel="BM25")

In [3]:
def normalize_column(df, column_name):
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    df[column_name] = (df[column_name] - min_val) / (max_val-min_val)

In [30]:
sparse = bm25.transform(dataset.get_topics())

In [32]:
sparse_dev = bm25.transform(devset.get_topics())

In [4]:
sparse = pd.read_csv("hotpot_bm25.csv")
sparse_dev = pd.read_csv("hotpot_bm25_dev.csv")

In [27]:
from pyterrier.measures import RR, nDCG, MAP

pt.Experiment(
    [~bm25],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 1000],
)

Unnamed: 0,name,RR@10,nDCG@10,AP@1000
0,Cache(BR(BM25)),0.662417,0.512828,0.4344


## Snowflake-artic-embed

In [7]:
class SnowFlakeQueryEncoder(TransformerEncoder):
  def __call__(self, texts):
    query_prefix = 'Represent this sentence for searching relevant passages: '
    queries_with_prefix = ["{}{}".format(query_prefix, i) for i in texts]
    query_tokens = self.tokenizer(queries_with_prefix, padding=True, truncation=True, return_tensors='pt', max_length=512)

    query_tokens.to(self.device)
    self.model.eval()

    #document_tokens =  self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        query_embeddings = self.model(**query_tokens)[0][:, 0]
        #doument_embeddings = self.model(**document_tokens)[0][:, 0]

    # normalize embeddings
    query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
    #doument_embeddings = torch.nn.functional.normalize(doument_embeddings, p=2, dim=1)
    return query_embeddings.detach().cpu().numpy()

In [8]:
q_encoder_artic = SnowFlakeQueryEncoder('Snowflake/snowflake-arctic-embed-m')

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

ff_index_artic = OnDiskIndex.load(
    Path("../datam/ffindex_hotpot_snowflake.h5"), query_encoder=q_encoder_artic, mode=Mode.MAXP
)

100%|██████████| 5233329/5233329 [00:06<00:00, 833610.86it/s] 


In [10]:
ff_index_artic = ff_index_artic.to_memory()

: 

In [9]:
d_artic = pd.read_csv("artic_hotpot_scores_norm.csv")
d_artic_dev = pd.read_csv("artic_hotpot_scores_dev_norm.csv")

In [10]:
pt.Experiment(
      [pt.Transformer.from_df(d_artic) >> FFInterpolate(alpha=0.1)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic"],
  )

Unnamed: 0,name,nDCG@10
0,Artic,0.718133


In [11]:
alphas = [0, 0.025, 0.05, 0.1, 0.3, 0.5]
max_val = 0
optimal_alpha = -1 

for alpha in alphas:
  exp = pt.Experiment(
      [pt.Transformer.from_df(d_artic_dev) >> FFInterpolate(alpha=alpha)],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE"],
  )
  if exp["nDCG@10"].values[0] > max_val:
    max_val = exp["nDCG@10"].values[0]
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.7414483030131003 0.3


Optimal alpha for Artic only

In [12]:
pt.Experiment(
      [pt.Transformer.from_df(d_artic) >> FFInterpolate(alpha=0.3)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic"],
  )

Unnamed: 0,name,nDCG@10
0,Artic,0.725542


## BGE

In [10]:
class BGEQueryEncoder(TransformerEncoder):
  def __call__(self, texts):
    encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
    encoded_input.to(self.device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = self.model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        sentence_embeddings = model_output[0][:, 0]
    # normalize embeddings
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings.detach().cpu().numpy()

In [11]:
q_encoder_bge = BGEQueryEncoder('BAAI/bge-base-en-v1.5')



In [8]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

ff_index_bge = OnDiskIndex.load(
    Path("../bge/ffindex_arguana_bge_base_en_v1_5.h5"), query_encoder=q_encoder_bge, mode=Mode.MAXP
)

100%|██████████| 8674/8674 [00:00<00:00, 1563379.01it/s]


In [9]:
ff_index_bge = ff_index_bge.to_memory()

In [5]:
d_bge = pd.read_csv("bge_hotpot_scores_norm.csv")
d_bge_dev = pd.read_csv("bge_hotpot_scores_dev_norm.csv")

Optimal alpha BGE only

In [16]:
alphas = [0, 0.025, 0.05, 0.1, 0.2, 0.3, 0.5]
max_val = 0
optimal_alpha = -1 

for alpha in alphas:
  exp = pt.Experiment(
      [pt.Transformer.from_df(d_bge_dev) >> FFInterpolate(alpha=alpha)],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )
  if exp["nDCG@10"].values[0] > max_val:
    max_val = exp["nDCG@10"].values[0]
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.7169611923112339 0.5


In [17]:
pt.Experiment(
      [pt.Transformer.from_df(d_bge) >> FFInterpolate(alpha=optimal_alpha)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )

Unnamed: 0,name,nDCG@10
0,BGE,0.695725


Artic + BGE

In [18]:
sc_artic = pt.Transformer.from_df(d_artic)
sc_bge = pt.Transformer.from_df(d_bge)

In [19]:
sc_artic_dev = pt.Transformer.from_df(d_artic_dev)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

In [43]:
sparse_dev.dtypes

qid       object
docid      int64
docno     object
rank       int64
score    float64
query     object
dtype: object

In [69]:
d_bge_dev.dtypes

Unnamed: 0      int64
qid            object
docno           int64
score_0       float64
score         float64
query          object
dtype: object

In [13]:
sc_sparse_dev = pt.Transformer.from_df(sparse_dev)
sc_artic_dev = pt.Transformer.from_df(d_artic_dev)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

In [15]:
combinations = [(0, 0.5, 0.5), (0.05, 0.425, 0.425), (0.2, 0.4, 0.4), 
                (0.1, 0.2, 0.7), (0.1, 0.7, 0.2), (0, 0.3, 0.7),
                (0.1, 0.4, 0.5), (0.1, 0.5, 0.4), (0, 0.7, 0.3),
                (0.05, 0.45, 0.5), (0.05, 0.5, 0.45), (0.025, 0.275, 0.7),
                (0.025, 0.7, 0.275)]

max_comb = [0, 0, 0]
max_score = 0

for combination in combinations:
  exp = pt.Experiment(
      [combination[0] * sc_sparse_dev + combination[1] * sc_artic_dev + combination[2] * sc_bge_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE"],
  )
  if exp["nDCG@10"].values[0] > max_score:
    max_score = exp["nDCG@10"].values[0]
    max_comb = combination

print(f"Best alpha for bm25: {max_comb[0]}, artic: {max_comb[1]} and bge: {max_comb[2]}")


Best alpha for bm25: 0.2, artic: 0.4 and bge: 0.4


In [26]:
pt.Experiment(
  [0.25 * sc_sparse_dev + 0.5 * sc_artic_dev + 0.25 * sc_bge_dev],
  devset.get_topics(),
  devset.get_qrels(),
  eval_metrics=[nDCG @ 10],
  names=["Artic + BGE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.752114


In [19]:
sc_sparse = pt.Transformer.from_df(sparse)
sc_artic = pt.Transformer.from_df(d_artic)
sc_bge = pt.Transformer.from_df(d_bge)

In [27]:
pt.Experiment(
   [0.25 * sc_sparse + 0.5 * sc_artic + 0.25 * sc_bge],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["Artic + BGE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.734033


## GTE

In [14]:
from transformers import AutoModel, AutoTokenizer
from pathlib import Path
from typing import Callable, Sequence, Union
from fast_forward.encoder import Encoder
import numpy as np

## Need to override TransformerEncoder to include trust_remote_code=True in the 
## from_pretrained() call since GTE requires it
class TransformerEncoder(Encoder):
    """Uses a pre-trained transformer model for encoding. Returns the pooler output."""

    def __init__(
        self, model: Union[str, Path], device: str = "cpu", **tokenizer_args
    ) -> None:
        """Create a transformer encoder.

        Args:
            model (Union[str, Path]): Pre-trained transformer model (name or path).
            device (str, optional): PyTorch device. Defaults to "cpu".
            **tokenizer_args: Additional tokenizer arguments.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model, trust_remote_code=True)
        self.model.to(device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.device = device
        self.tokenizer_args = tokenizer_args

    def __call__(self, texts: Sequence[str]) -> np.ndarray:
        inputs = self.tokenizer(texts, return_tensors="pt", **self.tokenizer_args)
        inputs.to(self.device)
        embeddings = self.model(**inputs).pooler_output.detach().cpu().numpy()
        return embeddings


In [15]:
class GTEQueryEncoder(TransformerEncoder):
  def __call__(self, texts):
    batch_dict = self.tokenizer(texts, max_length=8192, padding=True, truncation=True, return_tensors='pt')
    batch_dict.to(self.device)
    
    with torch.no_grad():
      outputs = self.model(**batch_dict)
      embeddings = outputs.last_hidden_state[:, 0]
    return embeddings.detach().cpu().numpy()

In [16]:
q_encoder_gte = GTEQueryEncoder('Alibaba-NLP/gte-base-en-v1.5')



In [6]:
d_gte = pd.read_csv("gte_hotpot_scores_norm.csv")
d_gte_dev = pd.read_csv("gte_hotpot_scores_dev_norm.csv")

Optimal alpha GTE only

In [18]:
alphas = [0, 0.025, 0.05, 0.1, 0.2, 0.3, 0.5]
max_val = 0
optimal_alpha = -1 

for alpha in alphas:
  exp = pt.Experiment(
      [pt.Transformer.from_df(d_gte_dev) >> FFInterpolate(alpha=alpha)],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )
  if exp["nDCG@10"].values[0] > max_val:
    max_val = exp["nDCG@10"].values[0]
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.6990942194700993 0.5


In [19]:
pt.Experiment(
      [pt.Transformer.from_df(d_gte) >> FFInterpolate(alpha=optimal_alpha)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )

Unnamed: 0,name,nDCG@10
0,BGE,0.686382


In [20]:
sc_artic = pt.Transformer.from_df(d_artic)
sc_bge = pt.Transformer.from_df(d_bge)

sc_artic_dev = pt.Transformer.from_df(d_artic_dev)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

In [7]:
sc_bge = pt.Transformer.from_df(d_bge)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

sc_gte = pt.Transformer.from_df(d_gte)
sc_gte_dev = pt.Transformer.from_df(d_gte_dev)

In [8]:
sc_sparse_dev = pt.Transformer.from_df(sparse_dev)
sc_sparse = pt.Transformer.from_df(sparse)

Artic + GTE

In [36]:
combinations = [(0, 0.5, 0.5), (0.05, 0.425, 0.425), (0.2, 0.4, 0.4), 
                (0.1, 0.2, 0.7), (0.1, 0.7, 0.2), (0, 0.3, 0.7),
                (0.1, 0.4, 0.5), (0.1, 0.5, 0.4), (0, 0.7, 0.3),
                (0.05, 0.45, 0.5), (0.05, 0.5, 0.45), (0.025, 0.275, 0.7),
                (0.025, 0.7, 0.275)]

max_comb = [0, 0, 0]
max_score = 0

for combination in combinations:
  exp = pt.Experiment(
      [combination[0] * sc_sparse_dev + combination[1] * sc_artic_dev + combination[2] * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + GTE"],
  )
  if exp["nDCG@10"].values[0] > max_score:
    max_score = exp["nDCG@10"].values[0]
    max_comb = combination

print(f"Best alpha for bm25: {max_comb[0]}, artic: {max_comb[1]} and gte: {max_comb[2]}")


Best alpha for bm25: 0.1, artic: 0.7 and gte: 0.2


In [35]:
pt.Experiment(
  [0.075 * sc_sparse_dev + 0.725 * sc_artic_dev + 0.2 * sc_gte_dev],
  devset.get_topics(),
  devset.get_qrels(),
  eval_metrics=[nDCG @ 10],
  names=["Artic + GTE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + GTE,0.74802


In [34]:
pt.Experiment(
   [0.75 * sc_sparse + 0.725 * sc_artic + 0.2 * sc_gte],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["Artic + GTE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + GTE,0.731014


BGE + GTE

In [37]:
combinations = [(0, 0.5, 0.5), (0.05, 0.425, 0.425), (0.2, 0.4, 0.4), 
                (0.1, 0.2, 0.7), (0.1, 0.7, 0.2), (0, 0.3, 0.7),
                (0.1, 0.4, 0.5), (0.1, 0.5, 0.4), (0, 0.7, 0.3),
                (0.05, 0.45, 0.5), (0.05, 0.5, 0.45), (0.025, 0.275, 0.7),
                (0.025, 0.7, 0.275)]

max_comb = [0, 0, 0]
max_score = 0

for combination in combinations:
  exp = pt.Experiment(
      [combination[0] * sc_sparse_dev + combination[1] * sc_bge_dev + combination[2] * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + GTE"],
  )
  if exp["nDCG@10"].values[0] > max_score:
    max_score = exp["nDCG@10"].values[0]
    max_comb = combination

print(f"Best alpha for bm25: {max_comb[0]}, bge: {max_comb[1]} and gte: {max_comb[2]}")


Best alpha for bm25: 0.2, bge: 0.4 and gte: 0.4


In [14]:
pt.Experiment(
  [0.25 * sc_sparse_dev + 0.5 * sc_bge_dev + 0.25 * sc_gte_dev],
  devset.get_topics(),
  devset.get_qrels(),
  eval_metrics=[nDCG @ 10],
  names=["BGE + GTE"],
)

Unnamed: 0,name,nDCG@10
0,BGE + GTE,0.728803


In [15]:
pt.Experiment(
   [0.25 * sc_sparse + 0.5 * sc_bge + 0.25 * sc_gte],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["BGE + GTE"],
)

Unnamed: 0,name,nDCG@10
0,BGE + GTE,0.712162


Artic + BGE + GTE

In [50]:

combinations = [(0, 0.33, 0.33, 0.34), (0.05, 0.316, 0.316, 0.316), (0.025, 0.325, 0.325, 0.325), 
                (0.1, 0.1, 0.6, 0.1), (0.1, 0.6, 0.1, 0.1), (0.1, 0.1, 0.1, 0.6),
                (0.2, 0.2, 0.4, 0.2), (0.2, 0.4, 0.2, 0.2), (0.2, 0.2, 0.2, 0.4),
                (0.025, 0.175, 0.5, 0.3), (0.025, 0.5, 0.175, 0.3), (0.025, 0.3, 0.175, 0.5),
                (0.025, 0.5, 0.3, 0.175), (0.0025, 0.3, 0.5, 0.1975), (0.0025, 0.5, 0.3, 0.1975), (0.0025, 0.1975, 0.5, 0.3),
                (0.0025, 0.5, 0.1975, 0.3), (0.0025, 0.3, 0.1975, 0.5), (0.0025, 0.1975, 0.3, 0.5)]

max_comb = [0, 0, 0, 0]
max_score = 0

for combination in combinations:
  exp = pt.Experiment(
      [combination[0] * sc_sparse_dev + combination[1] * sc_artic + combination[2] * sc_bge + combination[3] * sc_gte],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE + GTE"],
  )
  if exp["nDCG@10"].values[0] > max_score:
    max_score = exp["nDCG@10"].values[0]
    max_comb = combination

print(f"Best alpha for bm25: {max_comb[0]}, artic: {max_comb[1]} gte: {max_comb[3]} and bge: {max_comb[2]}")


Best alpha for bm25: 0.05, artic: 0.316 gte: 0.316 and bge: 0.316


In [64]:

pt.Experiment(
      [0.05 * sc_sparse_dev + 0.55 * sc_artic_dev + 0.3 * sc_bge_dev + 0.1 * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + BGE + GTE,0.747185


In [62]:

pt.Experiment(
      [0.05 * sc_sparse + 0.55  * sc_artic +  0.3  * sc_bge + 0.1 * sc_gte],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + BGE + GTE,0.730526
