In [1]:
import torch
import pyterrier as pt
from fast_forward.util.pyterrier import FFInterpolate, FFScore
from pyterrier.measures import RR, nDCG, MAP
from fast_forward.encoder import TransformerEncoder
from fast_forward import OnDiskIndex, Mode
from pathlib import Path
import pandas as pd

# Initialise PyTerrier only when it has not been started yet, so re-running this cell
# does not call pt.init() a second time.
if not pt.started():
    pt.init(tqdm="notebook")
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


True

In [2]:
# Evaluation data: the test split of BEIR SciFact.
dataset = pt.get_dataset("irds:beir/scifact/test")
# Tuning data: note that "devset" is bound to the *train* split of BEIR SciFact.
devset= pt.get_dataset("irds:beir/scifact/train")

In [3]:
# First-stage sparse retriever: BM25 over the Terrier index stored at ../data/scifact.
bm25 = pt.BatchRetrieve("../data/scifact", wmodel="BM25")



## Snowflake-arctic-embed

In [4]:
class SnowFlakeQueryEncoder(TransformerEncoder):
  """Query encoder that prepends a retrieval instruction before encoding.

  Every query is prefixed with a fixed instruction string, tokenized (truncated to
  512 tokens), run through the model, and represented by the hidden state at token
  position 0, L2-normalized along the embedding dimension.
  """

  def __call__(self, texts):
    """Encode `texts` and return the normalized embeddings as a NumPy array."""
    instruction = 'Represent this sentence for searching relevant passages: '
    prefixed_queries = [f"{instruction}{text}" for text in texts]
    batch = self.tokenizer(
        prefixed_queries,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )

    batch.to(self.device)
    self.model.eval()

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        # First element of the model output, sliced at token position 0.
        first_token_states = self.model(**batch)[0][:, 0]

    # Scale every embedding to unit L2 norm.
    unit_embeddings = torch.nn.functional.normalize(first_token_states, p=2, dim=1)
    return unit_embeddings.detach().cpu().numpy()

In [5]:
# Query encoder for the Snowflake arctic-embed-m checkpoint. No device argument is passed,
# so the encoder's default device is used (NOTE(review): confirm whether that is the CPU
# even though CUDA availability was checked above).
q_encoder_artic = SnowFlakeQueryEncoder('Snowflake/snowflake-arctic-embed-m')

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

# Load the pre-built fast-forward index for arctic-embed-m and attach the query encoder.
# NOTE(review): the directory is "../datam" whereas the BM25 index lives in "../data" —
# the recorded output shows the load succeeded, but confirm the path is intentional.
ff_index_artic = OnDiskIndex.load(
    Path("../datam/ffindex_scifact_snowflake_arctic_embed_m.h5"), query_encoder=q_encoder_artic, mode=Mode.MAXP
)

100%|██████████| 5183/5183 [00:00<00:00, 1100935.77it/s]


In [7]:
# Rebind the name to the result of to_memory(); re-running this cell calls to_memory()
# on that result rather than on the original on-disk index.
ff_index_artic = ff_index_artic.to_memory()

In [8]:
from fast_forward.util.pyterrier import FFScore

# PyTerrier transformer that re-scores candidates with the arctic fast-forward index.
ff_score_artic = FFScore(ff_index_artic)

## BGE

In [9]:
class BGEQueryEncoder(TransformerEncoder):
  """Query encoder using the hidden state at token position 0, L2-normalized.

  The queries are tokenized as given; no retrieval instruction is prepended to them.
  """

  def __call__(self, texts):
    """Encode `texts` and return the normalized embeddings as a NumPy array."""
    batch = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    batch.to(self.device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        # First element of the model output, sliced at token position 0.
        first_token_states = self.model(**batch)[0][:, 0]

    # Scale every embedding to unit L2 norm.
    unit_embeddings = torch.nn.functional.normalize(first_token_states, p=2, dim=1)
    return unit_embeddings.detach().cpu().numpy()

In [10]:
# Query encoder for the BAAI/bge-base-en-v1.5 checkpoint (default device, none passed).
q_encoder_bge = BGEQueryEncoder('BAAI/bge-base-en-v1.5')

In [11]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

# Load the pre-built fast-forward index for BGE and attach the BGE query encoder.
ff_index_bge = OnDiskIndex.load(
    Path("../bge/ffindex_scifact_bge_base_en_v1_5.h5"), query_encoder=q_encoder_bge, mode=Mode.MAXP
)

100%|██████████| 5183/5183 [00:00<00:00, 1350588.82it/s]


In [12]:
# Rebind the name to the result of to_memory(); re-running this cell calls to_memory()
# on that result rather than on the original on-disk index.
ff_index_bge = ff_index_bge.to_memory()

In [13]:
from fast_forward.util.pyterrier import FFScore

# PyTerrier transformer that re-scores candidates with the BGE fast-forward index.
ff_score_bge = FFScore(ff_index_bge)

In [14]:
def normalize_column(df, column_name):
    """Min-max scale ``df[column_name]`` to the range [0, 1], in place.

    The minimum and maximum are taken over the whole column, i.e. across every row of
    ``df`` rather than per group.

    Args:
        df: DataFrame to modify; the column is overwritten.
        column_name: Name of the numeric column to rescale.

    Returns:
        None. The frame is mutated in place.
    """
    column = df[column_name]
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Constant column: dividing by the zero range would turn every value into NaN.
        # Map the values to 0.0 instead (existing NaN entries stay NaN).
        df[column_name] = (column - min_val).astype(float)
    else:
        df[column_name] = (column - min_val) / value_range


In [15]:
# BM25 candidates cut off at rank 1000, re-scored with the arctic index, for the test topics.
# (`~` is presumably PyTerrier's caching operator — confirm for the installed version.)
# A new FFScore is built here instead of reusing ff_score_artic.
pl_scifact = ~bm25 % 1000 >> FFScore(ff_index_artic)
d_artic = pl_scifact.transform(dataset.get_topics())
# Min-max normalize both score columns in place. normalize_column uses the min/max of the
# whole frame, so the scaling is shared by all queries rather than done per query.
normalize_column(d_artic, "score")
normalize_column(d_artic, "score_0")
# Disabled export of the normalized scores.
#d_artic.to_csv("artic_scifact_scores_norm.csv")

In [16]:
# BM25 candidates cut off at rank 1000, re-scored with the BGE index, for the test topics.
# (`~` is presumably PyTerrier's caching operator — confirm for the installed version.)
pl_bge = ~bm25 % 1000 >> ff_score_bge
d_bge = pl_bge.transform(dataset.get_topics())
# Min-max normalize both score columns in place. normalize_column uses the min/max of the
# whole frame, so the scaling is shared by all queries rather than done per query.
normalize_column(d_bge, "score")
normalize_column(d_bge, "score_0")
# Disabled export of the normalized scores.
# d_bge.to_csv("bge_scifact_scores_norm.csv")

In [15]:
import pandas as pd 
# Reload previously exported, normalized test-split scores. This overwrites any d_bge /
# d_artic already in the kernel. usecols=[1..5] skips the index column that to_csv writes
# first.
#
# The ids are read back explicitly as strings. Without a dtype, read_csv infers integer
# columns for numeric-looking ids, while the frames produced by the retrieval pipelines
# carry string ids — most likely why this reload used to "break stuff".
# NOTE(review): confirm against the CSV header that the id columns are "qid" and "docno".
_ID_DTYPES = {"qid": str, "docno": str}
d_bge = pd.read_csv("bge_scifact_scores_norm.csv", usecols=[1,2,3,4,5], dtype=_ID_DTYPES)
d_artic = pd.read_csv("artic_scifact_scores_norm.csv", usecols=[1,2,3,4,5], dtype=_ID_DTYPES)

In [20]:
# nDCG@10 on the test split for the BGE scores alone, passed through FFInterpolate with
# alpha=0.1. Only d_bge is involved, so the run is labelled "BGE" (it was previously
# mislabelled "Artic + BGE").
pt.Experiment(
    [pt.Transformer.from_df(d_bge) >> FFInterpolate(alpha=0.1)],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["BGE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.763907


In [21]:
from fast_forward.util.pyterrier import FFInterpolate


# Per-model pipelines: replay the stored scores and pass them through FFInterpolate(alpha=0.1).
# NOTE: pl_bge is also used elsewhere in this notebook for the BM25 >> FFScore retrieval
# pipeline, so its meaning depends on which cell ran last.
pl_artic = pt.Transformer.from_df(d_artic) >> FFInterpolate(alpha=0.1)
pl_bge = pt.Transformer.from_df(d_bge) >> FFInterpolate(alpha=0.1)

# Equal-weight (0.5 / 0.5) linear combination of the two pipelines, evaluated on the test split.
exp = pt.Experiment(
      [0.5 * pl_bge + 0.5 * pl_artic],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE"],
    )

# Display the results table.
exp

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.767292


In [17]:
# Same arctic re-scoring pipeline as for the test topics, applied to the devset topics.
pl_scifact = ~bm25 % 1000 >> FFScore(ff_index_artic)
d_artic_dev = pl_scifact.transform(devset.get_topics())
# In-place min-max normalization over the whole frame (not per query).
normalize_column(d_artic_dev, "score")
normalize_column(d_artic_dev, "score_0")

In [11]:
# Export the normalized dev scores; to_csv is called with defaults, so the frame index is
# written as the first column.
d_artic_dev.to_csv("artic_scifact_scores_dev_norm.csv")

In [18]:
# Same BGE re-scoring pipeline as for the test topics, applied to the devset topics.
# NOTE: rebinds pl_bge, which another cell defines as a from_df >> FFInterpolate pipeline.
pl_bge = ~bm25 % 1000 >> ff_score_bge
d_bge_dev = pl_bge.transform(devset.get_topics())
# In-place min-max normalization over the whole frame (not per query).
normalize_column(d_bge_dev, "score")
normalize_column(d_bge_dev, "score_0")

In [18]:
# Export the normalized dev scores; to_csv is called with defaults, so the frame index is
# written as the first column.
d_bge_dev.to_csv("bge_scifact_scores_dev_norm.csv")

Optimal alpha for Arctic only

In [11]:
# Grid search over the FFInterpolate weight for the arctic scores on the devset,
# keeping the alpha with the highest nDCG@10.
alphas = [0, 0.025, 0.05, 0.1, 0.3, 0.5]
# Start below any attainable score so the first candidate is always accepted; starting at 0
# with a strict ">" would leave optimal_alpha at -1 if every run scored exactly 0.
max_val = float("-inf")
optimal_alpha = -1

# Loop-invariant inputs, built once instead of once per alpha.
dev_topics = devset.get_topics()
dev_qrels = devset.get_qrels()
artic_dev_scores = pt.Transformer.from_df(d_artic_dev)

for alpha in alphas:
  exp = pt.Experiment(
      [artic_dev_scores >> FFInterpolate(alpha=alpha)],
      dev_topics,
      dev_qrels,
      eval_metrics=[nDCG @ 10],
      # Only the arctic scores are evaluated here (label was previously "Artic + BGE").
      names=["Artic"],
  )
  ndcg_at_10 = exp["nDCG@10"].values[0]
  if ndcg_at_10 > max_val:
    max_val = ndcg_at_10
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.7580980341877859 0.5


In [18]:
# nDCG@10 on the test split for the arctic scores alone, with alpha fixed to 0.5.
# Only d_artic is involved, so the run is labelled "Artic" (it was previously mislabelled
# "Artic + BGE").
pt.Experiment(
    [pt.Transformer.from_df(d_artic) >> FFInterpolate(alpha=0.5)],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["Artic"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.752191


Optimal alpha for BGE only

In [11]:
# Grid search over the FFInterpolate weight for the BGE scores on the devset,
# keeping the alpha with the highest nDCG@10.
alphas = [0, 0.025, 0.05, 0.1, 0.3, 0.5]
# Start below any attainable score so the first candidate is always accepted; starting at 0
# with a strict ">" would leave optimal_alpha at -1 if every run scored exactly 0.
max_val = float("-inf")
optimal_alpha = -1

# Loop-invariant inputs, built once instead of once per alpha.
dev_topics = devset.get_topics()
dev_qrels = devset.get_qrels()
bge_dev_scores = pt.Transformer.from_df(d_bge_dev)

for alpha in alphas:
  exp = pt.Experiment(
      [bge_dev_scores >> FFInterpolate(alpha=alpha)],
      dev_topics,
      dev_qrels,
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )
  ndcg_at_10 = exp["nDCG@10"].values[0]
  if ndcg_at_10 > max_val:
    max_val = ndcg_at_10
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.7848686813192597 0.3


In [16]:
# nDCG@10 on the test split for the BGE scores alone.
# NOTE: optimal_alpha is a notebook-global overwritten by every alpha sweep, so this cell
# evaluates whichever sweep ran last — run it right after the BGE sweep.
pt.Experiment(
      [pt.Transformer.from_df(d_bge) >> FFInterpolate(alpha=optimal_alpha)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE"],
  )

Unnamed: 0,name,nDCG@10
0,BGE,0.769482


Artic + BGE

In [43]:
# Wrap the stored test-split score frames as PyTerrier transformers so they can be used in
# operator expressions.
sc_artic = pt.Transformer.from_df(d_artic)
sc_bge = pt.Transformer.from_df(d_bge)

In [44]:
# Wrap the stored devset score frames as PyTerrier transformers.
sc_artic_dev = pt.Transformer.from_df(d_artic_dev)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

In [97]:
# Devset nDCG@10 for a weighted sum of BM25, arctic and BGE scores (weights sum to 1.0).
# The weights are hand-set literals. The BM25 term comes straight from the retriever and is
# not passed through normalize_column, unlike the score columns of the d_* frames.
pt.Experiment(
  [0.0025 * ~bm25 + 0.2975 * sc_artic_dev + 0.7 * sc_bge_dev],
    devset.get_topics(),
    devset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["Artic + BGE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.787193


In [98]:
# Test-split nDCG@10 for the same weighted sum (0.0025 BM25 + 0.2975 arctic + 0.7 BGE).
pt.Experiment(
   [0.0025 * ~bm25 + 0.2975  * sc_artic + 0.7 * sc_bge],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[nDCG @ 10],
    names=["Artic + BGE"],
)

Unnamed: 0,name,nDCG@10
0,Artic + BGE,0.768832


## GTE

In [19]:
from transformers import AutoModel, AutoTokenizer
from pathlib import Path
from typing import Callable, Sequence, Union
from fast_forward.encoder import Encoder
import numpy as np

## Need to override TransformerEncoder to include trust_remote_code=True in the 
## from_pretrained() call since GTE requires it
class TransformerEncoder(Encoder):
    """Uses a pre-trained transformer model for encoding. Returns the pooler output.

    Notebook-local definition that loads the model with ``trust_remote_code=True``.
    From this cell onwards it shadows any ``TransformerEncoder`` name bound earlier in
    the kernel; classes defined before this cell keep the base class they were created with.
    """

    def __init__(
        self, model: Union[str, Path], device: str = "cpu", **tokenizer_args
    ) -> None:
        """Create a transformer encoder.

        Args:
            model (Union[str, Path]): Pre-trained transformer model (name or path).
            device (str, optional): PyTorch device. Defaults to "cpu".
            **tokenizer_args: Additional tokenizer arguments.
        """
        super().__init__()
        # SECURITY: trust_remote_code=True allows Python code shipped with the checkpoint to
        # run at load time. Only use this with checkpoints from a trusted source.
        self.model = AutoModel.from_pretrained(model, trust_remote_code=True)
        self.model.to(device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.device = device
        self.tokenizer_args = tokenizer_args

    def __call__(self, texts: Sequence[str]) -> np.ndarray:
        """Encode ``texts`` and return the model's pooler output as a NumPy array.

        Args:
            texts (Sequence[str]): Texts to encode.

        Returns:
            np.ndarray: The pooler output, moved to the CPU.
        """
        inputs = self.tokenizer(texts, return_tensors="pt", **self.tokenizer_args)
        inputs.to(self.device)
        # Inference only: skip autograd bookkeeping, as the query-encoder subclasses do.
        with torch.no_grad():
            embeddings = self.model(**inputs).pooler_output.detach().cpu().numpy()
        return embeddings


In [20]:
class GTEQueryEncoder(TransformerEncoder):
  """Query encoder that uses the last hidden state at token position 0.

  Inputs are truncated to 8192 tokens. The vectors are returned as produced by the
  model; this encoder applies no L2 normalization.
  """

  def __call__(self, texts):
    """Encode `texts` and return the embeddings as a NumPy array."""
    tokenized = self.tokenizer(
        texts,
        max_length=8192,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    tokenized.to(self.device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
      first_token_states = self.model(**tokenized).last_hidden_state[:, 0]
    return first_token_states.detach().cpu().numpy()

In [21]:
# Query encoder for the Alibaba-NLP/gte-base-en-v1.5 checkpoint. No device is passed, so the
# notebook-local TransformerEncoder default ("cpu") applies.
q_encoder_gte = GTEQueryEncoder('Alibaba-NLP/gte-base-en-v1.5')



In [22]:
from fast_forward import OnDiskIndex, Mode
from pathlib import Path

# Load the pre-built fast-forward index for GTE and attach the GTE query encoder.
ff_index_gte = OnDiskIndex.load(
    Path("../gte/ffindex_scifact_gte_base_en_v1_5.h5"), query_encoder=q_encoder_gte, mode=Mode.MAXP
)

100%|██████████| 5183/5183 [00:00<00:00, 1547595.76it/s]


In [23]:
# Rebind the name to the result of to_memory(); re-running this cell calls to_memory()
# on that result rather than on the original on-disk index.
ff_index_gte = ff_index_gte.to_memory()

In [24]:
from fast_forward.util.pyterrier import FFScore

# PyTerrier transformer that re-scores candidates with the GTE fast-forward index.
ff_score_gte = FFScore(ff_index_gte)

In [25]:
# BM25 candidates cut off at rank 1000, re-scored with the GTE index, for the test topics.
pl_gte = ~bm25 % 1000 >> ff_score_gte
d_gte = pl_gte.transform(dataset.get_topics())
# In-place min-max normalization over the whole frame (not per query).
normalize_column(d_gte, "score")
normalize_column(d_gte, "score_0")

In [12]:
# Export the normalized test scores; to_csv is called with defaults, so the frame index is
# written as the first column.
d_gte.to_csv("gte_scifact_scores_norm.csv")

In [26]:
# Re-use pl_gte for the devset topics, then min-max normalize in place over the whole frame.
d_gte_dev = pl_gte.transform(devset.get_topics())
normalize_column(d_gte_dev, "score")
normalize_column(d_gte_dev, "score_0")

In [14]:
# Export the normalized dev scores; to_csv is called with defaults, so the frame index is
# written as the first column.
d_gte_dev.to_csv("gte_scifact_scores_dev_norm.csv")

In [27]:
# Grid search over the FFInterpolate weight for the GTE scores on the devset,
# keeping the alpha with the highest nDCG@10.
alphas = [0, 0.025, 0.05, 0.1, 0.3, 0.5]
# Start below any attainable score so the first candidate is always accepted; starting at 0
# with a strict ">" would leave optimal_alpha at -1 if every run scored exactly 0.
max_val = float("-inf")
optimal_alpha = -1

# Loop-invariant inputs, built once instead of once per alpha.
dev_topics = devset.get_topics()
dev_qrels = devset.get_qrels()
gte_dev_scores = pt.Transformer.from_df(d_gte_dev)

for alpha in alphas:
  exp = pt.Experiment(
      [gte_dev_scores >> FFInterpolate(alpha=alpha)],
      dev_topics,
      dev_qrels,
      eval_metrics=[nDCG @ 10],
      names=["GTE"],
  )
  ndcg_at_10 = exp["nDCG@10"].values[0]
  if ndcg_at_10 > max_val:
    max_val = ndcg_at_10
    optimal_alpha = alpha

print(max_val, optimal_alpha)

0.7638777927277236 0.5


In [28]:
# nDCG@10 on the test split for the GTE scores alone.
# NOTE: optimal_alpha is a notebook-global overwritten by every alpha sweep, so this cell
# evaluates whichever sweep ran last — run it right after the GTE sweep.
pt.Experiment(
      [pt.Transformer.from_df(d_gte) >> FFInterpolate(alpha=optimal_alpha)],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["GTE"],
  )

Unnamed: 0,name,nDCG@10
0,GTE,0.76938


## Artic + BGE + GTE

In [27]:

# Wrap the arctic and BGE score frames (test and dev) as PyTerrier transformers.
# These four names are also bound by two earlier cells; this cell re-creates them.
sc_artic = pt.Transformer.from_df(d_artic)
sc_bge = pt.Transformer.from_df(d_bge)
sc_artic_dev = pt.Transformer.from_df(d_artic_dev)
sc_bge_dev = pt.Transformer.from_df(d_bge_dev)

In [28]:
# Wrap the GTE score frames (test and dev) as PyTerrier transformers.
sc_gte = pt.Transformer.from_df(d_gte)
sc_gte_dev = pt.Transformer.from_df(d_gte_dev)

In [64]:
# Plain BM25 results for the dev and test topics (no rank cutoff operator, no normalization).
sparse_dev = bm25.transform(devset.get_topics())
sparse = bm25.transform(dataset.get_topics())

In [6]:
# Alternative to recomputing: load BM25 results from CSV, overwriting sparse / sparse_dev.
# The code that wrote these files is not part of the visible notebook.
# NOTE(review): read_csv is called with defaults, so numeric-looking ids are presumably
# parsed as integers and a saved index column would come back as an extra column — verify
# before feeding these frames into an experiment.
sparse = pd.read_csv("scifact_bm25.csv")
sparse_dev = pd.read_csv("scifact_bm25_dev.csv")


In [67]:
# Wrap the BM25 result frames as PyTerrier transformers.
# NOTE: the experiment cells visible in this notebook use `~bm25` directly and do not
# reference sc_sparse / sc_sparse_dev.
sc_sparse_dev = pt.Transformer.from_df(sparse_dev)
sc_sparse = pt.Transformer.from_df(sparse)

In [80]:
# Devset nDCG@10 for a weighted sum of BM25, arctic, BGE and GTE scores (weights sum to 1.0).
# The weights are hand-set literals. The BM25 term comes straight from the retriever and is
# not passed through normalize_column, unlike the score columns of the d_* frames.
pt.Experiment(
      [0.0025 * ~bm25 + 0.1975 * sc_artic_dev + 0.5 * sc_bge_dev + 0.3 * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + BGE + GTE,0.794025


In [81]:
# Test-split nDCG@10 for the same weighted sum
# (0.0025 BM25 + 0.1975 arctic + 0.5 BGE + 0.3 GTE).
pt.Experiment(
      [0.0025 * ~bm25 +  0.1975 * sc_artic + 0.5 * sc_bge + 0.3 * sc_gte],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + BGE + GTE,0.776488


## Artic + GTE

In [42]:

# Devset nDCG@10 for a weighted sum of BM25, arctic and GTE scores (weights sum to 1.0).
# The weights are hand-set literals; the BM25 term is not passed through normalize_column.
pt.Experiment(
      [0.0025 * ~bm25 + 0.3975 * sc_artic_dev + 0.6 * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + GTE,0.775759


In [41]:
# Test-split nDCG@10 for the same weighted sum (0.0025 BM25 + 0.3975 arctic + 0.6 GTE).
pt.Experiment(
      [0.0025 * ~bm25 + 0.3975 * sc_artic + 0.6 * sc_gte],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["Artic + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,Artic + GTE,0.775692


## BGE + GTE

In [62]:
# Devset nDCG@10 for a weighted sum of BM25, BGE and GTE scores (weights sum to 1.0).
# The weights are hand-set literals; the BM25 term is not passed through normalize_column.
pt.Experiment(
      [0.0025 * ~bm25 + 0.7 * sc_bge_dev + 0.2975  * sc_gte_dev],
      devset.get_topics(),
      devset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,BGE + GTE,0.791285


In [63]:
# Test-split nDCG@10 for the same weighted sum (0.0025 BM25 + 0.7 BGE + 0.2975 GTE).
pt.Experiment(
      [0.0025 * ~bm25 + 0.7 * sc_bge + 0.2975 * sc_gte],
      dataset.get_topics(),
      dataset.get_qrels(),
      eval_metrics=[nDCG @ 10],
      names=["BGE + GTE"],
  )

Unnamed: 0,name,nDCG@10
0,BGE + GTE,0.779082
