In [None]:
import torch

# Fail fast with an actionable message when the runtime has no GPU attached,
# instead of surfacing a low-level CUDA error from get_device_name().
if not torch.cuda.is_available():
    raise RuntimeError(
        'No GPU is attached to this runtime. '
        'Select a GPU runtime (Runtime > Change runtime type) and reconnect.'
    )

# A Tesla P100-PCIE-16GB is good. If a different GPU was assigned,
# delete the runtime and connect again.
print('The GPU Assigned is:', torch.cuda.get_device_name(0))

In [None]:
# Install the packages imported below: `transformers` (Longformer tokenizer and
# model) and `python-terrier` (imported as `pyterrier`).
# NOTE(review): versions are not pinned, so a fresh runtime may pull releases
# with a different API than the one this notebook was written against.
%pip install transformers
%pip install python-terrier

In [None]:
import os
import math
import random
import csv

import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from google.colab import drive
import pyterrier as pt
from pyterrier.measures import *

# Initialize PyTerrier. pt.init() is only called when PyTerrier has not been
# started yet, so re-running this cell does not try to start it twice.
if not pt.started():
    pt.init()

# Mount Google Drive; every path below lives under this mount point.
drive.mount('/content/drive')

# Directories
# DATA_DIR is not referenced by any other cell visible in this notebook.
DATA_DIR = '/content/drive/MyDrive/MSMARCO/'
# Index opened as `index` and used by the BM25 retriever.
FULL_INDEX_DIR = '/content/drive/MyDrive/Full_Index/'
# Index opened as `index2` and used for the PRF term statistics.
PASSAGE_INDEX_DIR = '/content/drive/MyDrive/Passage_Index/'

# Print GPU information and fix the device used for the model and the batches.
# DEVICE is always CUDA; there is no CPU fallback.
print('We will use the GPU:', torch.cuda.get_device_name(0))
DEVICE = torch.device("cuda")


In [None]:
# Initialize the tokenizer and the model.
# The tokenizer comes from the public base checkpoint, while the classifier
# weights are loaded from a checkpoint directory on Google Drive.

TOKENIZER = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
MODEL = LongformerForSequenceClassification.from_pretrained('/content/drive/MyDrive/Longformer_checkpoint2/', num_labels = 2) # Fine-tuned model with two output labels
MODEL.to(DEVICE) # Move the model to the GPU selected above
MODEL.eval() # Switch to evaluation mode (this notebook only runs inference)

In [None]:
# Handles tokenization and returns a TensorDataset that can be fed to the model.

def tokenize(dataset, max_length=1024):
    """Encode (query, document) pairs for Longformer.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per (query, document) pair; must have a ``query`` and a
        ``body`` column. Values are converted with ``str()`` before encoding.
    max_length : int, default 1024
        Every pair is padded / truncated to exactly this many tokens. Only the
        document (second segment) is truncated, never the query.

    Returns
    -------
    torch.utils.data.TensorDataset
        Three row-aligned tensors of shape ``(len(dataset), max_length)``:
        input ids, attention masks and global attention masks. The global
        attention mask is 1 on the leading ``<s>`` token and on the query
        tokens, 0 everywhere else.
    """
    # torch.cat() cannot concatenate an empty list, so build empty tensors explicitly.
    if len(dataset) == 0:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return TensorDataset(empty, empty.clone(), empty.clone())

    input_ids, attention_masks, global_attention_masks = [], [], []

    pairs = zip(dataset['query'], dataset['body'])
    for raw_query, raw_body in tqdm(pairs, total=len(dataset)):
        query_text = str(raw_query)
        encoded_dict = TOKENIZER.encode_plus(
            query_text,  # Query
            str(raw_body),  # Document
            add_special_tokens=True,  # Add '<s>' and '</s>'
            max_length=max_length,  # Pad & truncate all sentences
            padding='max_length',
            return_attention_mask=True,  # Construct attn. masks
            truncation='only_second',
            return_tensors='pt'  # Return pytorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        # Global attention covers '<s>' plus every query token. The query
        # length is measured with the same tokenizer that produced the input
        # ids, so the span lines up with the encoded sequence; it is clamped so
        # a very long query can never index past max_length.
        span_with_cls = min(len(TOKENIZER.tokenize(query_text)) + 1, max_length)
        global_attention = torch.zeros((1, max_length), dtype=torch.long)
        global_attention[0, :span_with_cls] = 1
        global_attention_masks.append(global_attention)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    global_attention_masks = torch.cat(global_attention_masks, dim=0)

    return TensorDataset(input_ids, attention_masks, global_attention_masks)

In [None]:
# Load the MSMARCO document dataset: relevance judgements and topics of its
# 'test' variant.
msmarco_dataset = pt.get_dataset("msmarco_document")
qrels = msmarco_dataset.get_qrels('test')
topics = msmarco_dataset.get_topics('test')
# Open the two indexes stored on Google Drive:
# `index` (FULL_INDEX_DIR) feeds the BM25 retriever,
# `index2` (PASSAGE_INDEX_DIR) is read by get_tokens() / PRF() for term statistics.
index = pt.IndexFactory.of(FULL_INDEX_DIR)
index2 = pt.IndexFactory.of(PASSAGE_INDEX_DIR)

In [None]:
# BM25 retrieval over the full index. The title, docno and body metadata are
# returned with every result because the reranker tokenizes the `body` column.
bm25 = pt.BatchRetrieve(index, wmodel="BM25", metadata=["title", "docno", "body"]) % 200 # `% 200` keeps only the top 200 docs per query

In [None]:
def longformer(ranked_results, query_list_length, query=None, model=MODEL, device=DEVICE, batch_size=8):
    """Re-score a candidate list with the Longformer classifier and re-rank it.

    Parameters
    ----------
    ranked_results : pandas.DataFrame
        Candidate documents. Needs the ``query`` and ``body`` columns read by
        ``tokenize`` and, when ``query`` is given, a ``qid`` column. The frame
        passed in is not modified.
    query_list_length : int
        Number of rows of ``query`` to process (ignored when ``query`` is None).
    query : pandas.DataFrame, optional
        Query frame with a ``qid`` column. When given, documents are ranked
        separately per query, in the order the queries appear in this frame.
        When None, all rows are treated as belonging to one query.
    model, device :
        Classifier and the device the batches are moved to.
    batch_size : int, default 8
        Number of (query, document) pairs per forward pass.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``ranked_results`` with ``score`` replaced by the
        model score and ``rank`` set to the 0-based position inside each
        query (best document first). The index is reset.
    """
    # Nothing to score: torch.cat() below would fail on an empty list.
    if len(ranked_results) == 0:
        return ranked_results.copy()

    # Work on a copy so the caller's frame is never written to.
    ranked_results = ranked_results.copy()
    encoded_pairs = tokenize(ranked_results)

    dataloader = DataLoader(
        encoded_pairs,
        sampler=SequentialSampler(encoded_pairs),  # keeps predictions aligned with the rows
        batch_size=batch_size
    )

    logits_per_batch = []
    for batch in dataloader:
        b_input_ids, b_input_mask, b_global = (tensor.to(device) for tensor in batch)

        with torch.no_grad():
            result = model(
                b_input_ids,
                attention_mask=b_input_mask,
                global_attention_mask=b_global,
                return_dict=True
            )

        logits_per_batch.append(result.logits)

    pred = torch.cat(logits_per_batch, 0)
    # NOTE(review): sigmoid is applied to each logit independently. If the
    # checkpoint was trained with a softmax cross-entropy loss, softmax would
    # be the matching normalisation - confirm before changing, because it
    # alters the scores.
    prob = torch.sigmoid(pred)
    # Column 1 is the "relevant" label; column 0 ("not relevant") is dropped.
    ranked_results['score'] = prob[:, 1].cpu().numpy().astype(float)

    if query is not None:
        per_query_frames = []
        for i in range(query_list_length):
            # Extract the documents of one query, best score first.
            docs = ranked_results[ranked_results['qid'] == query.iloc[i].qid]
            docs = docs.sort_values(by=['score'], ascending=False)
            per_query_frames.append(docs.assign(rank=np.arange(len(docs))))
        if per_query_frames:
            ranked_results = pd.concat(per_query_frames, ignore_index=True)
        else:
            ranked_results = ranked_results.iloc[0:0]
    else:
        # A single query was given instead of a dataframe: rank all rows together.
        ranked_results = ranked_results.sort_values(by=['score'], ascending=False)
        ranked_results = ranked_results.assign(rank=np.arange(len(ranked_results)))
        ranked_results = ranked_results.reset_index(drop=True)

    return ranked_results


In [None]:
# Returns the terms of one indexed document, with and without their raw counts
def get_tokens(index2, docno):
    """Read the terms of the document ``docno`` from the direct index of ``index2``.

    Returns a pair ``(token_list, termslist)``: ``token_list`` holds one
    ``[term, raw_count]`` entry per posting of the document and ``termslist``
    holds the same terms, in the same order, without the counts.
    """
    # docno -> internal docid -> document entry -> postings of that document
    internal_id = index2.getMetaIndex().getDocument("docno", docno)
    doc_entry = index2.getDocumentIndex().getDocumentEntry(internal_id)
    postings = index2.getDirectIndex().getPostings(doc_entry)

    terms_with_counts = []
    for posting in postings:
        frequency = posting.getFrequency()
        term_id = posting.getId()  # id of the term in the lexicon
        # Resolve the term id back to the term string itself
        lexicon_entry = index2.getLexicon().getLexiconEntry(term_id)
        terms_with_counts.append([lexicon_entry.getKey(), frequency])

    terms_only = [pair[0] for pair in terms_with_counts]
    return terms_with_counts, terms_only

In [None]:
def PRF(PRF_list, query, k, qid=None, min_docs=3):
    """Expand ``query`` with terms from the feedback documents (Pseudo Relevance Feedback).

    Parameters
    ----------
    PRF_list : pandas.DataFrame
        The top-K feedback documents; only the ``docno`` column is read.
    query : str
        Query text to expand.
    k : int
        Maximum number of terms appended to the query.
    qid : optional
        Query ID. When it is not None the result is ``[qid, expanded_query]``,
        otherwise just the expanded query string.
    min_docs : int, default 3
        A term is only considered when it occurs in at least this many of the
        feedback documents.

    Returns
    -------
    str or list
        The expanded query (original text followed by the selected terms,
        space separated), optionally paired with ``qid``. When no term
        qualifies the query text is returned unchanged.

    Notes
    -----
    Each candidate term is scored as
    ``-log2(1 / (1 + f)) - tf * log2(f / (1 + f))`` where ``tf`` is the summed
    raw count of the term over the feedback documents and ``f`` is its
    collection frequency in ``index2`` divided by the number of documents in
    ``index2``. Terms missing from the lexicon are printed and skipped.
    """
    # term -> [summed raw count, number of feedback documents containing it].
    # One pass over the postings replaces the vocabulary x tokens nested scan,
    # and dict insertion order makes ties between equal scores deterministic.
    term_stats = {}
    for docno in PRF_list['docno']:
        doc_tokens, _ = get_tokens(index2, docno)
        for term, raw_count in doc_tokens:
            stats = term_stats.setdefault(term, [0, 0])
            stats[0] += raw_count
            stats[1] += 1

    # Loop-invariant index lookups.
    lexicon = index2.getLexicon()
    number_of_documents = index2.getCollectionStatistics().getNumberOfDocuments()

    tokens_score = []
    for term, (total_raw_count, docs_with_term) in term_stats.items():
        if docs_with_term < min_docs:
            continue
        try:
            coll_freq = lexicon[term].getFrequency()
        except KeyError:
            # Term is not in the lexicon: report it and leave it out.
            print(term)
            continue
        f = coll_freq / number_of_documents
        score = -math.log2(1 / (1 + f)) - (total_raw_count * math.log2(f / (1 + f)))
        tokens_score.append((term, score))

    sorted_by_score = sorted(tokens_score, key=lambda tup: tup[1], reverse=True)

    # Keep the k best-scoring terms.
    tokens_to_be_added = [term for term, _ in sorted_by_score[:k]]

    # Joining avoids a dangling trailing space when no term qualified.
    expanded_query = " ".join([query] + tokens_to_be_added)

    # `is not None` so that a legitimate qid such as 0 is still paired with the query.
    return [qid, expanded_query] if qid is not None else expanded_query


In [None]:

class LongformerReranking:
    """BM25 candidate retrieval followed by Longformer reranking (no query expansion)."""

    def transform(self, query, model=MODEL, device=DEVICE):
        """Retrieve candidates for ``query`` with BM25 and rerank them with Longformer.

        ``query`` is either a dataframe of queries, in which case documents are
        ranked per query, or anything else that the ``bm25`` retriever accepts,
        which is then handled as one single query.
        """
        if isinstance(query, pd.DataFrame):
            n_queries, query_frame = len(query.index), query
        else:
            n_queries, query_frame = 1, None

        # Candidate list by BM25, reranking by Longformer
        return longformer(bm25(query), n_queries, query_frame, model=model, device=device)

In [None]:
class LongformerRerankingPRF:
    """BM25 retrieval, Longformer reranking, PRF query expansion, then a second reranking."""

    def transform(self, query, bm25=None, model=MODEL, device=DEVICE, k=10, K=5):
        """Rerank with Longformer, expand each query from its top documents, rerank again.

        Parameters
        ----------
        query : pandas.DataFrame or str
            A frame of queries with ``qid`` and ``query`` columns, or a single
            query in text form.
        bm25 : optional
            First-stage retriever. When None, the notebook-level ``bm25`` is used.
        model, device :
            Passed through to ``longformer``.
        k : int, default 10
            Number of expansion terms added to each query.
        K : int, default 5
            Number of top reranked documents used as feedback documents.

        Returns
        -------
        pandas.DataFrame
            The candidates reranked against the expanded queries; the ``query``
            column holds the expanded query text.

        Raises
        ------
        ValueError
            If no retriever was passed and no notebook-level ``bm25`` exists.
        """
        # The `bm25` parameter shadows the notebook-level retriever, so the
        # global has to be looked up explicitly when no retriever was passed.
        retriever = bm25 if bm25 is not None else globals().get('bm25')
        if retriever is None:
            raise ValueError("No BM25 retriever available: pass `bm25=` or define a notebook-level `bm25`.")

        if isinstance(query, pd.DataFrame):
            query_list_length = len(query.index)
            reranked_list_prf = longformer(retriever(query), query_list_length, query, model=model, device=device)

            for i in range(query_list_length):
                # Compare as strings so the row selection works whether qids are stored as str or int.
                rows_of_query = reranked_list_prf['qid'].astype(str) == str(query.iloc[i].qid)
                # longformer() returns each query's documents best-first, so head(K) is the top K.
                top_k_docs = reranked_list_prf[rows_of_query].head(K)
                # Without a qid PRF returns the expanded query as a plain string.
                expanded_query = PRF(top_k_docs, query.iloc[i].query, k)
                reranked_list_prf.loc[rows_of_query, "query"] = expanded_query

            # Reranking by Longformer using the expanded query
            return longformer(reranked_list_prf, query_list_length, query, model=model, device=device)

        # Single query in text form: every candidate row belongs to that query.
        reranked_list_prf = longformer(retriever(query), 1, model=model, device=device)
        expanded_query = PRF(reranked_list_prf.head(K), query, k)
        reranked_list_prf["query"] = expanded_query
        return longformer(reranked_list_prf, 1, model=model, device=device)

In [None]:
# Both transform() methods are instance methods that need a query: either a
# single query in text form or a dataframe of queries. Here they are run on the
# test topics loaded above.
Results = LongformerReranking().transform(topics) # BM25 + Longformer (No PRF)
ResultsPRF = LongformerRerankingPRF().transform(topics, bm25=bm25) # BM25 + Longformer + PRF expansion + Longformer (With PRF)