In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install sentence-transformers groq

In [None]:
# One install command for the remaining dependencies (datasets was previously listed twice).
%pip install datasets transformers faiss-cpu torch einops

### Load the dataset: HotpotQA

In [None]:
from datasets import load_dataset

# Load the "fullwiki" configuration of the HotpotQA dataset; later cells read dataset['train'].
dataset = load_dataset("hotpot_qa", "fullwiki")

# Print the loaded object to inspect its structure before using it.
print(dataset)

#### Taking only 100 data points due to computational constraints

In [None]:
from tqdm import tqdm

# Flatten the context sentences of the first 100 training examples into one
# corpus. Each context entry exposes a 'sentences' field that is a list of
# sentence lists.
context = []
for example_context in tqdm(dataset['train']['context'][:100]):
    for sentence_group in example_context['sentences']:
        context.extend(sentence_group)

# De-duplicate while keeping first-seen order. list(set(...)) orders strings by
# their per-process randomised hashes, so the corpus order (and every FAISS row
# id derived from it) would differ between kernel restarts.
context = list(dict.fromkeys(context))

In [None]:
# First 100 questions and answers of the train split, matching the 100 examples
# whose context sentences make up the corpus.
questions = dataset['train']['question'][:100]
answer = dataset['train']['answer'][:100]

In [None]:
# Peek at the first question to sanity-check the slice.
questions[0]

In [None]:
# Peek at the answer paired with the first question.
answer[0]

In [None]:
# Number of unique sentences in the retrieval corpus.
len(context)

## Retrieval and Reranking

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss

In [None]:
import os

# Read the Hugging Face access token from the environment rather than keeping a
# credential (or a placeholder that would be sent as a token) in the notebook.
# The value is None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Tokenizer and encoder come from the same checkpoint; keep the id in one place.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"

# `token` is the current name of the authentication argument; `use_auth_token`
# is its deprecated predecessor.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, token=HF_TOKEN)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, token=HF_TOKEN)

#### Getting embeddings of corpus and questions

In [None]:
# creating embeddings
def get_embedding(text, tokenizer, model):
    """Embed `text` by mean-pooling the model's last hidden state.

    Args:
        text: Input passed straight to `tokenizer` (truncated to 128 tokens).
        tokenizer: Callable returning a mapping with at least `attention_mask`,
            whose entries are forwarded to `model` as keyword arguments.
        model: Callable whose output exposes `last_hidden_state`.

    Returns:
        A NumPy array holding the pooled embedding, with singleton dimensions
        squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    # Weight every token by its attention mask so padded positions do not dilute
    # the average. With no padding the mask is all ones and this is a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against an all-zero mask
    return (summed / token_counts).squeeze().cpu().numpy()

# Embed every corpus sentence, one at a time, keeping the order of `context`.
corpus_embeddings = []
for doc in tqdm(context):
    corpus_embeddings.append(get_embedding(doc, tokenizer, model))


In [None]:
# Embed every question with the same encoder, keeping the order of `questions`.
query_embedding = []
for ques in tqdm(questions):
    query_embedding.append(get_embedding(ques, tokenizer, model))

#### Creating a FAISS index and searching the top-k documents (k = 3) for the first query (`questions[0]`)

In [None]:
import numpy as np
# Size a flat L2 index to the width of one corpus embedding
embedding_dim = corpus_embeddings[0].shape[0]
index = faiss.IndexFlatL2(embedding_dim)

# Stack the per-sentence vectors into one 2-D array and load it into the index;
# row i of the index corresponds to context[i]
corpus_embeddings_np = np.stack(corpus_embeddings)
index.add(corpus_embeddings_np)

# Query the index with the first question, shaped as a single-row batch
k = 3
first_query_batch = query_embedding[0][np.newaxis, ...]
_, top_k_indices = index.search(first_query_batch, k)

# Map the returned row ids back to their sentences
top_k_documents = [context[row_id] for row_id in top_k_indices[0]]
print("Question:", questions[0])
print("Top-k documents:", top_k_documents)

### Reranking

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder used by get_reranked_docs to score [question, document] pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
def sort_indices_descending(lst):
    """Return the positions of `lst` ordered from largest to smallest value.

    Equal values keep their original relative order.
    """
    positions = list(range(len(lst)))
    positions.sort(key=lambda pos: lst[pos], reverse=True)
    return positions

In [None]:
def retrieve_tok_k_items(query_idx, top_k=10):
    """Return up to `top_k` corpus sentences nearest to one question.

    Args:
        query_idx: Position of the question in `query_embedding`.
        top_k: Number of neighbours requested from the FAISS `index`.

    Returns:
        The matching entries of `context`, in the order FAISS returned them.
    """
    q_embed = query_embedding[query_idx]
    _, neighbour_ids = index.search(np.expand_dims(q_embed, axis=0), top_k)

    # FAISS fills unused result slots with -1 when fewer than top_k vectors are
    # available; drop them, because context[-1] would silently return the last
    # sentence of the corpus.
    return [context[idx] for idx in neighbour_ids[0] if idx >= 0]

In [None]:
def get_context_docs(query_idx):
    """Return all context sentences of training example `query_idx` as one flat list."""
    # Index the row first: selecting the whole 'context' column and then one row
    # can decode the full column on every call.
    sentence_groups = dataset['train'][query_idx]['context']['sentences']
    return [sentence for group in sentence_groups for sentence in group]

In [None]:
def get_reranked_docs(query_idx):
    """Rerank the retrieved candidates of one question with the cross-encoder.

    Returns:
        A tuple `(ranked_docs, scores, is_in_orig_context)`:
        the candidates ordered by descending cross-encoder score, the scores
        sorted in descending order, and, per ranked candidate, whether it also
        appears among the sentences returned by `get_context_docs(query_idx)`.
    """
    candidates = retrieve_tok_k_items(query_idx)
    question = questions[query_idx]
    cross_input_list = [[question, doc] for doc in candidates]

    cross_scores = cross_encoder.predict(cross_input_list)
    order_of_ranking = sort_indices_descending(cross_scores)
    print(order_of_ranking)

    original_sentences = get_context_docs(query_idx)
    ranked_docs = [cross_input_list[pos][1] for pos in order_of_ranking]
    is_in_orig_context = [doc in original_sentences for doc in ranked_docs]

    return ranked_docs, sorted(cross_scores, reverse=True), is_in_orig_context

In [None]:
def assign_relevance_labels(reranking_scores):
    """Bin reranking scores into graded relevance labels by rank.

    The top 20% of documents (by score) get label 2, the next 30% get label 1
    and the rest get label 0. Bin sizes are truncated to whole documents, so
    short inputs may produce no 2s or 1s at all.

    Args:
        reranking_scores: 1-D sequence or array of numeric scores.

    Returns:
        A float NumPy array of labels aligned with the input positions; an
        empty array for empty input.
    """
    scores = np.asarray(reranking_scores, dtype=float)
    n_docs = len(scores)
    relevance_labels = np.zeros(n_docs)
    if n_docs == 0:
        return relevance_labels

    # Step 1: Normalize the reranking scores to range [0, 1]. When every score
    # is identical the range is zero; use all zeros instead of dividing 0 by 0.
    min_score = np.min(scores)
    score_range = np.max(scores) - min_score
    if score_range > 0:
        normalized_scores = (scores - min_score) / score_range
    else:
        normalized_scores = np.zeros(n_docs)

    # Step 2: Rank documents in descending order; a stable sort keeps tied
    # documents in their input order so the labels are deterministic.
    sorted_indices = np.argsort(-normalized_scores, kind="stable")

    top_20_percent = int(n_docs * 0.2)
    next_30_percent = int(n_docs * 0.3)

    # Step 3: Assign relevance labels based on rank; everything past the first
    # two bins keeps the initial label 0.
    relevance_labels[sorted_indices[:top_20_percent]] = 2  # Top 20% -> relevance 2
    relevance_labels[sorted_indices[top_20_percent:top_20_percent + next_30_percent]] = 1  # Next 30% -> relevance 1

    return relevance_labels

# Disabled alternative kept as a plain string (never executed): a binary
# labelling that marks a document relevant when its score is above zero.
"""

def assign_relevance_labels(reranking_scores):
    relevance_labels = []
    for r in reranking_scores:
        if(r>0):
            relevance_labels.append(1)
        else:
            relevance_labels.append(0)
    return relevance_labels
    
"""

In [None]:
from groq import Groq

import os

# Take the Groq API key from the environment rather than keeping a credential
# (or a placeholder that would be sent as the key) in the notebook.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [None]:
import re
import warnings


def _parse_relevance_rating(reply):
    """Return the first standalone 0/1/2 found in `reply` as a float; warn and return 0.0 if none."""
    match = re.search(r"\b[0-2]\b", str(reply))
    if match is None:
        warnings.warn(f"Could not parse a relevance rating from {reply!r}; defaulting to 0.")
        return 0.0
    return float(match.group())


def assign_actual_relevance_scores(df):
    """Ask the LLM judge for a 0/1/2 relevance rating of every row.

    Args:
        df: DataFrame with columns 'ranked_retrieved_docs' (the context sent to
            the judge) and 'query' (the question).

    Returns:
        The same DataFrame object, with a float column
        'relevance_scores_actual' added (or overwritten).
    """
    ratings = []
    for doc, query in zip(df['ranked_retrieved_docs'], df['query']):
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {
                    "role": "system",
                    "content": "Given a context and a question, provide relevance rating to the context based on how relevant the context is in answering the question. The relevance ratings can only be one of 0 (completely irrelevant), 1(somewhat relevant), 2(completely relevant).\n\noutput only the relevance rating"
                },
                {
                    "role": "user",
                    "content": f"Context: {doc}\n\nQuestion: {query}"
                }
            ],
            # Temperature 0 keeps the judge's ratings as repeatable as the API
            # allows; these ratings serve as reference labels for the metrics.
            temperature=0,
            max_tokens=1024,
            top_p=1,
            stream=False,
            stop=None,
        )
        ratings.append(_parse_relevance_rating(completion.choices[0].message.content))

    # Assign the whole column at once. Writing through df[col].iloc[i] is a
    # chained assignment, which pandas may apply to a temporary copy and lose.
    df['relevance_scores_actual'] = np.array(ratings, dtype=float)
    return df

In [None]:
import pandas as pd
# Walk one example question (index 8) through retrieval, reranking, rank-based
# labelling and LLM judging, collecting everything in one DataFrame.
ranked_docs, cross_scores, is_in_orig_context = get_reranked_docs(8)

df = pd.DataFrame(
    {
        'ranked_retrieved_docs': ranked_docs,
        'ranked_retrieved_docs_score': cross_scores,
        'is_in_orig_context': is_in_orig_context,
    }
)
df['query'] = questions[8]

# Labels derived from the cross-encoder scores alone
relevance_labels = assign_relevance_labels(df['ranked_retrieved_docs_score'].values)
print(relevance_labels)
df['relevance_labels_reranked'] = relevance_labels

# Ratings from the LLM judge, added as 'relevance_scores_actual'
df = assign_actual_relevance_scores(df)
df

In [None]:
import numpy as np
import pandas as pd


# Order rows by the rank-based label, best first. A stable sort keeps rows that
# share a label in their existing (cross-encoder) order, so the result does not
# depend on how the sort algorithm breaks ties.
df_sorted = df.sort_values(by='relevance_labels_reranked', ascending=False, kind='stable')

# Calculate DCG
def dcg_at_k(relevance_scores, k):
    """Discounted cumulative gain of the first `k` relevance scores.

    Uses the exponential gain `2**rel - 1` and a `log2(position + 1)` discount
    with 1-based positions.

    Args:
        relevance_scores: Scores in ranked order; anything NumPy can convert to
            a float array.
        k: Cut-off; only the first `k` scores contribute.

    Returns:
        The DCG value, or 0.0 when there are no scores to evaluate.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
    # equivalent conversion.
    gains = np.asarray(relevance_scores, dtype=float)[:k]
    if gains.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum((2 ** gains - 1) / discounts)

# Calculate IDCG (Ideal DCG) - ideal relevance is sorted in descending order
def idcg_at_k(relevance_scores, k):
    """Ideal DCG: the DCG of the same scores arranged from highest to lowest."""
    best_possible_order = list(relevance_scores)
    best_possible_order.sort(reverse=True)
    return dcg_at_k(best_possible_order, k)

# Calculate nDCG
def ndcg_at_k(actual_relevance_scores, predicted_relevance_scores, k):
    """Normalised DCG of the actual relevance scores under the predicted ranking.

    Args:
        actual_relevance_scores: Reference relevance per document.
        predicted_relevance_scores: Predicted relevance per document, aligned
            position by position with `actual_relevance_scores`.
        k: Cut-off passed to the DCG and IDCG computations.

    Returns:
        DCG / IDCG, or 0.0 when the ideal DCG is not positive.
    """
    actual = np.asarray(actual_relevance_scores, dtype=float)
    predicted = np.asarray(predicted_relevance_scores, dtype=float)

    # Order the actual scores by descending predicted relevance. The stable sort
    # leaves inputs that the caller already sorted this way untouched, so
    # pre-sorted callers get the same result as before.
    if predicted.shape == actual.shape:
        ranking = np.argsort(-predicted, kind="stable")
        actual = actual[ranking]

    # DCG of the actual scores in predicted order
    dcg = dcg_at_k(actual, k)

    # IDCG of the same scores in their best possible order
    idcg = idcg_at_k(actual, k)

    return dcg / idcg if idcg > 0 else 0.0

# Take the judge's ratings (converted to floats) and the rank-based labels from
# df_sorted, i.e. in the row order produced by sorting on the rank-based label.
actual_relevance_scores = np.float64(df_sorted['relevance_scores_actual'])
predicted_relevance_scores = df_sorted['relevance_labels_reranked']

# Evaluate over the full result list; replace with a smaller k for a cut-off.
# Note that this rebinds the notebook-level name k.
k = len(actual_relevance_scores)  # Or choose any k value

# nDCG for the single example query
ndcg_value = ndcg_at_k(actual_relevance_scores, predicted_relevance_scores, k)
print(f"nDCG at {k}: {ndcg_value}")


In [None]:
# Compute nDCG@10 for every question and accumulate the total; the following
# cell divides it by the number of questions. Each iteration repeats retrieval,
# reranking and one LLM call per retrieved document.
total_ndcg_at_10 = 0
for q in tqdm(range(len(questions))):
    ranked_docs, cross_scores, is_in_orig_context = get_reranked_docs(q)
    df = pd.DataFrame(ranked_docs, columns = ['ranked_retrieved_docs'])
    df['ranked_retrieved_docs_score'] = cross_scores
    df['is_in_orig_context'] = is_in_orig_context
    df['query'] = questions[q]
    relevance_labels = assign_relevance_labels(df['ranked_retrieved_docs_score'].values)
    df['relevance_labels_reranked'] = relevance_labels
    df = assign_actual_relevance_scores(df)
    k = 10
    # Stable sort: rows sharing a label keep their cross-encoder order.
    df_sorted = df.sort_values(by='relevance_labels_reranked', ascending=False, kind='stable')
    # Convert the judge's ratings to floats before scoring, as the single-query
    # cell does, so they are compared numerically rather than as raw strings.
    actual_relevance_scores = df_sorted['relevance_scores_actual'].astype(float).to_numpy()
    predicted_relevance_scores = df_sorted['relevance_labels_reranked']
    ndcg_score = ndcg_at_k(actual_relevance_scores, predicted_relevance_scores, k)
    print(f"NDCG Score for Query {q} is {ndcg_score}")
    total_ndcg_at_10+=ndcg_score



In [None]:
# Mean nDCG@10: the total accumulated by the loop above divided by the number of questions.
print(f"Average NDCG Score on {len(questions)} queries is: {total_ndcg_at_10/len(questions)}")

In [None]:
# Compute precision@10 and recall@10 for every question and accumulate the
# totals; the following cell averages them. Each iteration repeats retrieval,
# reranking and one LLM call per retrieved document.
#
# NOTE(review): a document counts as a hit only when BOTH the rank-based label
# and the judge's rating are positive, and "recall" is measured against the
# judged-relevant documents within the retrieved rows only. Confirm that these
# are the intended metric definitions.
total_prec = 0
total_recall = 0
for q in tqdm(range(len(questions))):
    ranked_docs, cross_scores, is_in_orig_context = get_reranked_docs(q)
    df = pd.DataFrame(ranked_docs, columns = ['ranked_retrieved_docs'])
    df['ranked_retrieved_docs_score'] = cross_scores
    df['is_in_orig_context'] = is_in_orig_context
    df['query'] = questions[q]
    relevance_labels = assign_relevance_labels(df['ranked_retrieved_docs_score'].values)
    df['relevance_labels_reranked'] = relevance_labels
    df = assign_actual_relevance_scores(df)
    k = 10
    df['relevance_scores_actual'] = df['relevance_scores_actual'].astype(float)

    # Slice instead of indexing positions 0..k-1 one by one, so a query with
    # fewer than k retrieved rows no longer raises IndexError.
    top_rows = df.iloc[:k]
    predicted_relevant = top_rows['relevance_labels_reranked'] > 0
    judged_relevant = top_rows['relevance_scores_actual'] > 0
    relevant_docs = int((predicted_relevant & judged_relevant).sum())
    actual_relevant_docs = int(judged_relevant.sum())

    precision_at_k = relevant_docs/k
    recall_at_k = 0
    if actual_relevant_docs!=0 :
        recall_at_k = relevant_docs/actual_relevant_docs

    print(f"Precision Score for Query {q} is {precision_at_k}")
    print(f"Recall Score for Query {q} is {recall_at_k}")
    total_prec+=precision_at_k
    total_recall+=recall_at_k


In [None]:
# Mean precision and recall: the totals accumulated by the loop above divided by the number of questions.
print(f"Average Precision Score on {len(questions)} queries is: {total_prec/len(questions)}")
print(f"Average Recall Score on {len(questions)} queries is: {total_recall/len(questions)}")