In [2]:
import pandas as pd
import numpy as np

df2 = pd.read_csv('../Data/top_1000_BM_score_1.csv')

# Rank documents for retrieval

In [3]:
df2 = df2.sort_values(['qid', 'bm25_score'], ascending=False)
df2['rank_BM'] = df2.groupby('qid')['bm25_score'].rank(ascending=False).astype(int)

# Slect top k documents to rank

In [4]:
df2 = df2[df2['rank_BM']<=5].reset_index()

In [5]:
df2

Unnamed: 0.1,index,Unnamed: 0,qid,pid,query,passage,Q0,relevance,bm25_score,rank_BM
0,2907770,2907770,1102400,7287406,why do bears hibernate,"Why do Bears hibernate? March 31, 2010, Joan, ...",,0,9.723291,1
1,3047664,3047664,1102400,1171598,why do bears hibernate,There are a number of different reasons why br...,,0,8.420756,2
2,1140055,1140055,1102400,7968404,why do bears hibernate,Site Navigation. JGordon hooked me up with thi...,,0,7.832838,3
3,551264,551264,1102400,8196927,why do bears hibernate,5. Why do polar bears like to keep clean? Pola...,,0,7.641037,4
4,515512,515512,1102400,7738009,why do bears hibernate,2. What is the difference between brown bears ...,,0,7.599777,5
...,...,...,...,...,...,...,...,...,...,...
28729,385486,385486,2,6078653,Androgen receptor define,Define the terms sensation and perception and ...,,0,6.547841,1
28730,3847279,3847279,2,3672136,Androgen receptor define,• Class 11 Physics Demo. • habitat isolation e...,,0,6.201707,2
28731,948159,948159,2,5490907,Androgen receptor define,This behavior can be turned off by setting ver...,,0,6.005980,3
28732,724668,724668,2,2511437,Androgen receptor define,Define Volatile Fatty Acids (VFAs). Define Pol...,,0,5.973955,4


In [6]:
query1 = df2['query'].tolist()
passages1 = df2['passage'].tolist()

# Colbert Model

In [7]:
from transformers import AutoTokenizer
import torch
from transformers import AutoTokenizer, AutoModel, PreTrainedModel, PretrainedConfig
from transformers.configuration_utils import PretrainedConfig
from typing import Dict
from tqdm import tqdm

class ColBERTConfig(PretrainedConfig):
    model_type = "ColBERT"
    bert_model: str
    compression_dim: int = 768
    dropout: float = 0.0
    return_vecs: bool = False
    trainable: bool = True

class ColBERT(PreTrainedModel):
    """
    ColBERT model from: https://arxiv.org/pdf/2004.12832.pdf
    We use a dot-product instead of cosine per term (slightly better)
    """
    config_class = ColBERTConfig
    base_model_prefix = "bert_model"

    def __init__(self,
                 cfg) -> None:
        super().__init__(cfg)
        
        self.bert_model = AutoModel.from_pretrained(cfg.bert_model)

        for p in self.bert_model.parameters():
            p.requires_grad = cfg.trainable

        self.compressor = torch.nn.Linear(self.bert_model.config.hidden_size, cfg.compression_dim)

    def forward(self,
                query: Dict[str, torch.LongTensor],
                document: Dict[str, torch.LongTensor]):

        query_vecs = self.forward_representation(query)
        document_vecs = self.forward_representation(document)

        score = self.forward_aggregation(query_vecs,document_vecs,query["attention_mask"],document["attention_mask"])
        return score

    def forward_representation(self,
                               tokens,
                               sequence_type=None) -> torch.Tensor:
        
        vecs = self.bert_model(**tokens)[0]
        vecs = self.compressor(vecs)


        if sequence_type == "doc_encode" or sequence_type == "query_encode": 
            vecs = vecs * tokens["tokens"]["mask"].unsqueeze(-1)

        return vecs

    def forward_aggregation(self,query_vecs, document_vecs,query_mask,document_mask):

        #assert query_vecs.size(0) == document_vecs.size(0), "Batch sizes must match."


        #assert query_vecs.size(2) == document_vecs.size(1), "Dimension mismatch for batch matrix multiplication."
        #print("Query batch size:", query_vecs.size(0))
        #print("Document batch size:", document_vecs.size(0))


        score = torch.bmm(query_vecs, document_vecs.transpose(2,1))


        exp_mask = document_mask.bool().unsqueeze(1).expand(-1,score.shape[1],-1)
        score[~exp_mask] = - 10000


        score = score.max(-1).values


        score[~(query_mask.bool())] = 0


        score = score.sum(-1)

        return score


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = ColBERT.from_pretrained("sebastian-hofstaetter/colbert-distilbert-margin_mse-T2-msmarco")

# Example query and passages
#query = ["What is the capital of France?", "What are the benefits of exercise?"]
#passages = ["Paris is the capital of France.", "Berlin is the capital of Germany.", "London is the capital of the United Kingdom."]
#passages = ["Paris is the capital of France.", "Regular exercise has numerous benefits for both physical and mental health. It can improve cardiovascular health, boost mood, and increase overall well-being."]

sc=[]
for i in tqdm(range(len(query1))):

    query=query1[i]
    passage=passages1[i]
    query_tokens = tokenizer(query, return_tensors="pt")
    passage_tokens = tokenizer(passage, return_tensors="pt", padding=True, truncation=True)


    query_representation = model.forward_representation(query_tokens)
    passage_representation = model.forward_representation(passage_tokens)

    scores = model.forward_aggregation(query_representation, passage_representation, query_tokens["attention_mask"], passage_tokens["attention_mask"])
    sc.append(scores.tolist())


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 28734/28734 [23:29<00:00, 20.38it/s]


### Score Processing

In [8]:
flattened_sc = [item for sublist in sc for item in sublist]

In [9]:
d = pd.DataFrame({'colbert_score': flattened_sc})
d

Unnamed: 0,colbert_score
0,61.947403
1,54.893806
2,52.178833
3,52.660133
4,51.979610
...,...
28729,39.560165
28730,34.969055
28731,35.290092
28732,36.059559


In [10]:
df2['colbert_score']= flattened_sc

In [11]:
df2

Unnamed: 0.1,index,Unnamed: 0,qid,pid,query,passage,Q0,relevance,bm25_score,rank_BM,colbert_score
0,2907770,2907770,1102400,7287406,why do bears hibernate,"Why do Bears hibernate? March 31, 2010, Joan, ...",,0,9.723291,1,61.947403
1,3047664,3047664,1102400,1171598,why do bears hibernate,There are a number of different reasons why br...,,0,8.420756,2,54.893806
2,1140055,1140055,1102400,7968404,why do bears hibernate,Site Navigation. JGordon hooked me up with thi...,,0,7.832838,3,52.178833
3,551264,551264,1102400,8196927,why do bears hibernate,5. Why do polar bears like to keep clean? Pola...,,0,7.641037,4,52.660133
4,515512,515512,1102400,7738009,why do bears hibernate,2. What is the difference between brown bears ...,,0,7.599777,5,51.979610
...,...,...,...,...,...,...,...,...,...,...,...
28729,385486,385486,2,6078653,Androgen receptor define,Define the terms sensation and perception and ...,,0,6.547841,1,39.560165
28730,3847279,3847279,2,3672136,Androgen receptor define,• Class 11 Physics Demo. • habitat isolation e...,,0,6.201707,2,34.969055
28731,948159,948159,2,5490907,Androgen receptor define,This behavior can be turned off by setting ver...,,0,6.005980,3,35.290092
28732,724668,724668,2,2511437,Androgen receptor define,Define Volatile Fatty Acids (VFAs). Define Pol...,,0,5.973955,4,36.059559


In [12]:
df2 = df2.sort_values(['qid', 'colbert_score'], ascending=False)
df2['rank_colbert'] = df2.groupby('qid')['colbert_score'].rank(ascending=False).astype(int)

# MRR

In [13]:
total_mrr = 0.0
r=[]

for unique_id in df2['qid'].unique():

    current_id_df = df2[df2['qid'] == unique_id]

    current_id_df = current_id_df.sort_values(by='rank_colbert')

    rank_first_relevant = current_id_df.loc[current_id_df['relevance'] == 1, 'rank_colbert'].min()

    mrr = 1 / rank_first_relevant if pd.notnull(rank_first_relevant) else 0
    
    # Set number of documents to consider rank_first_relevant<10, 1000 default
    if rank_first_relevant<100:

        total_mrr += mrr
    r.append(rank_first_relevant)

# Calculate Mean Reciprocal Rank
mean_mrr = total_mrr / len(df2['qid'].unique())

In [14]:
mean_mrr

0.1302974841671026