# Retrieve and Re-Rank through BM25 injection

In [4]:
import json
from tqdm import tqdm
from beir.datasets.data_loader import GenericDataLoader
from crossencoder_bm25 import CustomCrossEncoder
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
from load_dataset import load_queries

# Print a warning (execution continues) when PyTorch reports that no CUDA
# device is available.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

def load_corpus(collection_filepath):
    """Read a tab-separated passage collection into memory.

    Each non-blank line is expected to have the form ``pid<TAB>passage``.
    Only the first tab separates the two fields, so a passage that itself
    contains tab characters is kept intact. Blank lines are skipped.

    Args:
        collection_filepath: Path to the UTF-8 encoded collection file.

    Returns:
        A ``(corpus, passages)`` tuple. ``corpus`` maps each pid (kept as a
        string) to its passage text; ``passages`` holds the passage texts in
        file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    corpus = {}
    passages = []
    with open(collection_filepath, "r", encoding="utf8") as fIn:
        for line in tqdm(fIn, unit_scale=True):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # maxsplit=1: only the first tab is the field separator.
            pid, passage = line.split("\t", 1)
            corpus[pid] = passage
            passages.append(passage)
    return corpus, passages

# Bi-encoder used to embed all passages so they can be searched with semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder is configured to retrieve top_k (32) candidates. A cross-encoder
# (CustomCrossEncoder from crossencoder_bm25, loaded from a local training
# output directory) re-ranks a candidate list to improve result quality.
cross_encoder = CustomCrossEncoder("output/training_ms-marco_cross-encoder-microsoft-MiniLM-L12-H384-uncased-2023-05-20_12-13-19-latest")

# Load the MS MARCO passage collection and the dev-small query set from disk.
corpus, passages = load_corpus("data/msmarco-passage/collection.tsv")
queries = load_queries("data/msmarco-passage/queries.dev.small.tsv")

print("Passages:", len(passages))

# We encode all passages into our vector space. This takes about 60 minutes on a good GPU.
# NOTE(review): convert_to_tensor=True returns the embeddings of every passage as one
# tensor; presumably it lives on the GPU when one is available - TODO confirm. The
# recorded run of this notebook ends in a CUDA out-of-memory error, so check whether
# these embeddings need to stay resident while the cross-encoder is running.
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

8.84Mit [00:11, 803kit/s]
6.98kit [00:00, 1.61Mit/s]


Passages: 8841823


In [5]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Turn *text* into the list of tokens used for BM25 indexing and querying.

    The text is lower-cased and split on whitespace, and each piece has its
    leading and trailing punctuation stripped. Pieces that end up empty, or
    that appear in scikit-learn's English stop-word list, are discarded.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        word
        for word in stripped
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage up front (tqdm reports progress), then build the
# BM25 index over the tokenized collection.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)

100%|██████████| 8841823/8841823 [04:31<00:00, 32588.51it/s]


In [None]:
from pyserini.index.lucene import IndexReader


# Open the pyserini Lucene index stored under data/msmarco-index/. search()
# uses this reader to compute a query-document score for each BM25 candidate.
index_reader = IndexReader("data/msmarco-index/")

In [37]:
from src.bm25 import *

# Retrieve BM25 candidates for a query from the loaded MS MARCO passages and
# re-rank them with the cross-encoder that also receives a BM25 score.
def search(query):
    """Print the best BM25 hits for *query* and their cross-encoder re-ranking.

    The five highest-scoring passages according to the in-memory ``bm25``
    index are used as candidates. Each candidate is passed to
    ``cross_encoder`` as a ``[query, bm25_score_string, passage]`` triple, and
    the candidates are then re-sorted by the predicted cross-encoder score.

    Bi-encoder (semantic search) retrieval is currently disabled, so the
    re-ranker works directly on the BM25 candidates. To re-enable it, encode
    the query with ``bi_encoder`` and call ``util.semantic_search`` against
    ``corpus_embeddings`` with ``top_k``.

    Args:
        query: The question text to search for.

    Returns:
        None. The top-3 BM25 hits and top-3 re-ranked hits are printed.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # argpartition picks the 5 best-scoring rows without sorting all scores.
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = sorted(
        ({'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n),
        key=lambda hit: hit['score'],
        reverse=True,
    )

    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    ##### Re-Ranking #####
    # Score every BM25 candidate with the cross-encoder.
    cross_inp = []
    for hit in bm25_hits:
        corpus_id = hit['corpus_id']
        # NOTE(review): the row index into `passages` is used as the docid of
        # the Lucene index - assumes both number passages identically; confirm.
        index_score = index_reader.compute_query_document_score(str(corpus_id), query)
        # min_max_global comes from src.bm25; presumably it rescales the score
        # using the bounds 0 and 50 - TODO confirm. The cross-encoder receives
        # the truncated integer as a string.
        bm25_feature = str(int(min_max_global(index_score, 0, 50)))
        cross_inp.append([query, bm25_feature, passages[corpus_id]])
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder scores to the BM25 candidates. The previous
    # code wrote to `hits`, which is never assigned while bi-encoder
    # retrieval is disabled, and therefore raised UnboundLocalError.
    for hit, cross_score in zip(bm25_hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-3 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(bm25_hits, key=lambda hit: hit['cross-score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))

In [38]:
# Run the search pipeline on the first three queries, leaving two blank
# lines after each result.
for i, docid in enumerate(queries, start=1):
    search(queries[docid])
    print()
    print()
    if i >= 3:
        break

Input question: what is paula deen's brother
Top-3 lexical search (BM25) hits
	39.902	Racial scandals aren't always bad for business ... just ask Paula Deen's brother who says the embattled chef's restaurants have been slammed withâ¦ Racial scandals aren't always bad for business ... just ask Paula Deen's brother who says the embattled chef's restaurants have been slammed withâ¦
	33.048	Paula Deen's Sprawling Savannah Mansion Is for Sale (PHOTOS) Celebrity chef Paula Deen's estate on Georgia's Wilmington Island is for sale. Located about 10 miles outside downtown Savannah, this home is the ultimate Southern retreat.
	31.520	The publisher released a statement on Friday, saying it would not release Deen's forthcoming cookbook, Paula Deen's New Testament: 250 Recipes: All Lightened Up as well as four other cookbooks the chef had been contracted to write.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.79 GiB total capacity; 6.89 GiB already allocated; 9.06 MiB free; 7.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF