In [None]:
import torch
# Report which GPU this runtime was given (device index 0).
# A Tesla P100-PCIE-16GB works really well; if a different GPU is assigned,
# delete the runtime and connect again.
print('The GPU Assigned is:', torch.cuda.get_device_name(0))

In [None]:
# Install the third-party packages this notebook imports (transformers, python-terrier).
# NOTE(review): versions are not pinned, so a fresh runtime may pull newer releases —
# consider pinning them for reproducibility.
%pip install transformers
%pip install python-terrier

In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from google.colab import drive
from tqdm import tqdm
import random
import os
import math
import csv
import pyterrier as pt
from pyterrier.measures import *
# Initialise PyTerrier only if it has not been started yet, so re-running this cell is harmless.
if not pt.started():
  pt.init()

# Mount Google Drive; every path below lives under this mount point.
drive.mount('/content/drive')

# Locations on the mounted drive.
data_dir = '/content/drive/MyDrive/MSMARCO/'  # not referenced in the cells visible here
full_index_dir = '/content/drive/MyDrive/Full_Index/'  # index used for the BM25 candidate retrieval
# NOTE(review): `dir` shadows Python's built-in dir(); not referenced in the cells visible here.
dir = '/content/drive/MyDrive/'
small_index_dir = '/content/drive/MyDrive/Passage_Index/'  # presumably the passage index used for PRF term statistics — TODO confirm

# Report the GPU at index 0, then keep a handle to the CUDA device for moving batches onto it.
print('We will use the GPU:', torch.cuda.get_device_name(0))
device = torch.device("cuda")

In [None]:
# Initialize the tokenizer and the fine-tuned model.
# NOTE(review): the tokenizer comes from the stock "bert-base-uncased" vocabulary while the
# weights come from the checkpoint directory — presumably the checkpoint was fine-tuned from
# that same base model; TODO confirm.

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/checkpoint3/', num_labels = 2) #Finetuned model
model.cuda() # Move the model to the GPU
model.eval() # Switch to evaluation mode

In [None]:
def tokenize(DS, max_length=256):
  """Encode every (query, body) row of DS as a BERT sentence pair.

  Parameters:
    DS: DataFrame with a 'query' column and a 'body' column; one row per
        query/document pair. Values are converted with str() before encoding.
    max_length: length every encoded pair is padded / truncated to.
        The default of 256 is half of BERT's 512-token limit.

  Returns:
    TensorDataset of (input_ids, attention_masks, token_type_ids), each with
    one row per row of DS, in the same order.
  """
  input_ids = []
  attention_masks = []
  token_type_ids = []

  pairs = zip(DS['query'], DS['body'])
  for query_text, body_text in tqdm(pairs, total=len(DS)):

    encoded_dict = tokenizer.encode_plus(
                        str(query_text), # Query
                        str(body_text), # Document
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length,   # Fixed length of every encoded pair
                        padding='max_length',
                        return_attention_mask = True,   # Construct attention masks.
                        truncation='only_second',  # Only the document is shortened, never the query
                        return_tensors = 'pt',     # Return pytorch tensors.
                        return_token_type_ids=True # To differentiate query sequence from document sequence
                   )
    # Add the encoded pair to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

    token_type_ids.append(encoded_dict['token_type_ids'])

  # torch.cat cannot concatenate an empty list, so build empty tensors explicitly.
  if not input_ids:
    empty = torch.empty((0, max_length), dtype=torch.long)
    return TensorDataset(empty, empty.clone(), empty.clone())

  # Stack the per-row (1, max_length) tensors into (rows, max_length) tensors.
  input_ids = torch.cat(input_ids, dim=0)
  token_type_ids = torch.cat(token_type_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  return TensorDataset(input_ids, attention_masks, token_type_ids)

In [None]:
# MS MARCO document-ranking test topics and relevance judgements.
msmarco_dataset = pt.get_dataset("msmarco_document")
qrels = msmarco_dataset.get_qrels('test')
topics = msmarco_dataset.get_topics('test')

# `index` feeds the BM25 candidate retrieval; `index2` supplies the term statistics read by
# get_tokens / PRF. The second index lives in `small_index_dir` (the previously used name
# `passage_index_dir` is never defined and raised a NameError).
index = pt.IndexFactory.of(full_index_dir)
index2 = pt.IndexFactory.of(small_index_dir)

In [None]:
# First-stage retriever: BM25 over the full index, returning the title, docno and body
# metadata with each hit so the reranker can read the document text.
bm25 = pt.BatchRetrieve(index, wmodel="BM25", metadata=["title","docno", "body"]) %200 # Retrieve only top 200 docs

In [None]:
# Takes a list of candidate results and reranks it
def bert(ranked_results, query_list_length, query=None):
  """Re-score candidate documents with the fine-tuned BERT model and re-rank them.

  Parameters:
    ranked_results: DataFrame of candidates with at least the columns 'qid',
        'query', 'body', 'score' and 'rank'. Its 'score' column is
        overwritten in place with the BERT relevance probability.
    query_list_length: number of rows of `query` to process (ignored when
        `query` is None).
    query: optional DataFrame with a 'qid' column. When given, candidates are
        ranked separately for each of its first `query_list_length` qids and
        returned grouped in that order. When None, all candidates are ranked
        together as one list.

  Returns:
    DataFrame with the same columns, sorted by descending score, with a
    0-based 'rank' and a fresh 0..n-1 index.
  """
  # Nothing to score: tokenisation and torch.cat would fail on an empty list.
  if len(ranked_results) == 0:
    return ranked_results

  ranked_results_tensor = tokenize(ranked_results)

  dataloader = DataLoader(
              ranked_results_tensor,
              sampler = SequentialSampler(ranked_results_tensor), # keep row order so scores line up with rows
              batch_size=64
          )

  batch_logits = []
  for batch in dataloader:

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_token_type_ids = batch[2].to(device)

    with torch.no_grad():
      result = model(b_input_ids,
                      token_type_ids=b_token_type_ids,
                      attention_mask=b_input_mask,
                      return_dict=True)

    batch_logits.append(result.logits)
  logits = torch.cat(batch_logits, 0)

  # The classifier emits two logits per pair (column 0 = not relevant, column 1 = relevant).
  # The relevance probability is the softmax over both; applying a sigmoid to each logit
  # independently would ignore the "not relevant" logit and distort the ranking.
  prob_final = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

  # Assign all scores at once (rows are in the same order as the sequential loader).
  ranked_results['score'] = prob_final.astype('float64')

  if query is not None:
    # Compare qids as text so int and str qids refer to the same query.
    qid_text = ranked_results['qid'].astype(str)
    per_query = []
    for i in range(query_list_length):
      docs = ranked_results[qid_text == str(query.iloc[i].qid)] # Docs of this query only
      docs = docs.sort_values(by=['score'], ascending=False) # Sorting by score
      docs['rank'] = np.arange(len(docs)) # Rank by column name, not by column position
      per_query.append(docs)
    if per_query:
      ranked_results = pd.concat(per_query, ignore_index=True)
    else:
      ranked_results = ranked_results.iloc[0:0].reset_index(drop=True)
  else:
    # No query frame given: rank every candidate in one list.
    ranked_results = ranked_results.sort_values(by=['score'], ascending=False)
    ranked_results = ranked_results.reset_index(drop=True)
    ranked_results['rank'] = np.arange(len(ranked_results))

  return ranked_results

In [None]:
# Returns the [term, raw count] pairs of one document, plus the bare list of its terms
def get_tokens(index2, docno):
    """Read the direct-index postings of the document identified by `docno`.

    Parameters:
        index2: index object exposing getMetaIndex, getDocumentIndex,
            getDirectIndex and getLexicon.
        docno: value of the document's "docno" metadata key.

    Returns:
        (token_list, termslist): token_list holds one [term, raw_count] pair
        per posting; termslist holds the terms alone, in the same order.
    """
    doc_id = index2.getMetaIndex().getDocument("docno", docno)  # docno -> internal docid
    doc_entry = index2.getDocumentIndex().getDocumentEntry(doc_id)  # entry used as pointer into the direct index
    postings = index2.getDirectIndex().getPostings(doc_entry)

    term_counts = []
    for posting in postings:
        frequency = posting.getFrequency()
        # Resolve the posting's term id back to the term string via the lexicon.
        word = index2.getLexicon().getLexiconEntry(posting.getId()).getKey()
        term_counts.append([word, frequency])

    vocabulary = [pair[0] for pair in term_counts]
    return term_counts, vocabulary

In [None]:
# Takes the top K doc list, query text, and k (number of tokens to be added to the query)
# Returns the new expanded query
def PRF(PRF_List, query, k, qid=None, min_docs=3):
    """Expand `query` with the k best Bo1-scored terms of the feedback documents.

    Parameters:
        PRF_List: DataFrame of the top K feedback documents; only its 'docno'
            column is read.
        query: original query text.
        k: maximum number of terms appended to the query; k <= 0 appends none.
        qid: optional query id; when given it is returned with the new query.
        min_docs: a term is only considered if it occurs in at least this many
            feedback documents (default 3, i.e. 3 of the top 5).

    Returns:
        [qid, expanded_query] when qid is not None, otherwise the expanded
        query string. The expansion terms follow the original query, separated
        by single spaces.
    """
    # term -> [summed raw count over the feedback docs, number of feedback docs containing it].
    # A dict keeps first-seen order, so ties in score are broken deterministically.
    term_stats = {}
    for docno in PRF_List['docno']:
        doc_tokens, _ = get_tokens(index2, docno)
        for term, raw_count in doc_tokens:
            stats = term_stats.setdefault(term, [0, 0])
            stats[0] += raw_count
            stats[1] += 1

    lexicon = index2.getLexicon()
    numberOfDocuments = index2.getCollectionStatistics().getNumberOfDocuments()

    tokens_score = []
    for term, (total_raw_count, docs_with_term) in term_stats.items():
        if docs_with_term < min_docs:
            continue
        try:
            CollFreq = lexicon[term].getFrequency()
            f = CollFreq / numberOfDocuments
            # Bo1: tf * log2((1 + f) / f) + log2(1 + f)
            score = -math.log2(1/(1 + f)) - (total_raw_count * math.log2(f/(1 + f)))
        except Exception as err:
            # Best effort: a term that cannot be scored (e.g. missing from the lexicon,
            # or a zero collection frequency) is reported and skipped.
            print(f"Skipping expansion term {term!r}: {err}")
            continue
        tokens_score.append((term, score))

    # Highest-scoring terms first; sorted() is stable, so equal scores keep first-seen order.
    sorted_by_score = sorted(tokens_score, key=lambda tup: tup[1], reverse=True)
    tokens_to_be_added = [term for term, _ in sorted_by_score[:max(k, 0)]]
    top_k_words = " ".join(tokens_to_be_added)

    expanded_query = query+" "+top_k_words
    if qid is not None:
        return [qid, expanded_query]
    return expanded_query

In [None]:
class neural_reranking():
  """BM25 candidate retrieval followed by BERT reranking (no query expansion)."""

  @staticmethod
  def transform(query):
    """Retrieve candidates with BM25 and rerank them with BERT.

    Parameters:
      query: either a single query as text, or a DataFrame of queries with a
          'qid' column (one row per query).

    Returns:
      The reranked results DataFrame produced by bert(). For a DataFrame
      input the results are ranked per query; for a text query they form a
      single ranked list.
    """
    # @staticmethod keeps `neural_reranking.transform(q)` working and also makes
    # `neural_reranking().transform(q)` behave the same way.
    if isinstance(query, pd.DataFrame):
      return bert(bm25(query), len(query.index), query)
    return bert(bm25(query), 1)


In [None]:
class neural_reranking_PRF():
  """BM25 retrieval + BERT reranking, then pseudo-relevance feedback and a second BERT pass."""

  @staticmethod
  def transform(query):
    """Rerank with BERT, expand each query from its top documents, and rerank again.

    Parameters:
      query: either a single query as text, or a DataFrame of queries with
          'qid' and 'query' columns (one row per query).

    Returns:
      The results DataFrame produced by the second bert() pass, which scores
      each candidate against its expanded query.
    """
    K = 5 # Top K docs to choose tokens from
    k = 10 # Top k tokens to be added to the query

    if not isinstance(query, pd.DataFrame):
      # Single text query: one ranked list, one expanded query for every candidate.
      reranked_list_PRF = bert(bm25(query), 1)
      reranked_list_PRF['query'] = PRF(reranked_list_PRF.head(K), query, k)
      return bert(reranked_list_PRF, 1)

    query_list_length = len(query.index)
    # First pass: candidate list by BM25, reranked by BERT (all queries in one frame,
    # sorted by score, so head(K) of one query's rows is that query's top K).
    reranked_list_PRF = bert(bm25(query), query_list_length)

    for i in range(query_list_length):
      row = query.iloc[i]
      of_this_query = reranked_list_PRF['qid'] == str(row['qid']) # Rows belonging to this query
      Top_K_Docs = reranked_list_PRF[of_this_query].head(K) # Top K documents for PRF
      # The qid comes from the `query` argument itself, not from the global topics frame.
      Expanded_Q = PRF(Top_K_Docs, row['query'], k, qid=row['qid'])
      reranked_list_PRF.loc[of_this_query, "query"] = Expanded_Q[1] # Replace the query with the expanded query

    # Second pass: reranking by BERT using the expanded queries
    return bert(reranked_list_PRF, query_list_length, query)


In [None]:
# Both rerankers take a query in text format or a DataFrame of queries; calling transform()
# with no argument raises a TypeError, so the test topics loaded above are passed here.
# Replace `topics` with a query string (or a smaller DataFrame) for a quicker run.
Results_no_PRF = neural_reranking.transform(topics) # BM25 + BERT (No PRF)
Results = neural_reranking_PRF.transform(topics) # BM25 + BERT + query expansion (With PRF)