In [73]:
import ast
import os

import openai
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util, CrossEncoder


In [74]:
# ---------------------------
# 1. API key for OpenAI GPT
# ---------------------------
from openai import OpenAI

# SECURITY: never embed the API key in the notebook. It is read from the
# OPENAI_API_KEY environment variable instead. The key that used to be
# hardcoded in this cell has been exposed and must be revoked and rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

client = OpenAI(api_key=api_key)





In [75]:
# ---------------------------
# 2. Load dataset
# ---------------------------
# Document collection: the `content` column supplies the texts that are indexed
# and retrieved by the cells below.
df = pd.read_csv("IR_dataset_entries.csv")
corpus = df["content"].tolist()

# ---------------------------
# Load Evaluation Dataset
# ---------------------------
# Queries paired with their relevant document ids (`query` / `relevant_ids` columns).
eval_df = pd.read_csv("evaluation_set.csv")

In [77]:
# ---------------------------
# 3. BM25 Sparse Retrieval
# ---------------------------
# Tokens come from splitting each document on single spaces only: no lowercasing
# and no punctuation stripping. Queries must be tokenised the same way.
tokenized_corpus = [
    text.split(" ")
    for text in corpus
]
bm25 = BM25Okapi(tokenized_corpus)

In [78]:
# ---------------------------
# 4. Dense Retrieval
# ---------------------------
# Sentence encoder for dense retrieval. The whole corpus is embedded once here so
# that each query only needs a single encode call at search time.
dense_model = SentenceTransformer("all-MiniLM-L6-v2")
corpus_embeddings = dense_model.encode(
    corpus,
    convert_to_tensor=True,
)

In [79]:
# ---------------------------
# 5. Cross-Encoder for Re-ranking
# ---------------------------
# Scores [query, document] pairs; used to re-rank the merged BM25 + dense candidates.
cross_model = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)

In [80]:
import pandas as pd
import numpy as np

# Load evaluation dataset
# NOTE(review): "evaluation_set.csv" is already read into `eval_df` in the
# dataset-loading cell earlier in this notebook; this second read rebinds the
# same name to a fresh copy. Presumably kept so this cell can be re-run on its
# own - confirm, otherwise one of the two loads can be dropped.
eval_df = pd.read_csv("evaluation_set.csv")

# ---------------------------
# Evaluation metric functions
# ---------------------------
def compute_precision(relevant_docs, retrieved_docs):
    """Fraction of the retrieved documents that are relevant.

    Args:
        relevant_docs: Iterable of hashable ids judged relevant for the query.
        retrieved_docs: Sized iterable of hashable ids returned by the system.

    Returns:
        Number of distinct ids present in both collections divided by
        ``len(retrieved_docs)``; ``0.0`` when nothing was retrieved.
    """
    # Guard the denominator: an empty result list has no relevant hits.
    if len(retrieved_docs) == 0:
        return 0.0
    hits = set(relevant_docs) & set(retrieved_docs)
    return len(hits) / len(retrieved_docs)

def compute_recall(relevant_docs, retrieved_docs):
    """Fraction of the relevant documents that were retrieved.

    Args:
        relevant_docs: Sized iterable of hashable ids judged relevant for the query.
        retrieved_docs: Iterable of hashable ids returned by the system.

    Returns:
        Number of distinct ids present in both collections divided by
        ``len(relevant_docs)``; ``0.0`` when there are no relevant documents.
    """
    # Guard the denominator: with no relevant documents there is nothing to recall.
    if len(relevant_docs) == 0:
        return 0.0
    hits = set(relevant_docs) & set(retrieved_docs)
    return len(hits) / len(relevant_docs)

def compute_mrr(relevant_docs, ranked_docs):
    """Reciprocal rank of the first relevant document in ``ranked_docs``.

    Ranks are 1-based. Returns ``0`` when no ranked document is relevant.
    """
    first_hit_rank = next(
        (
            position
            for position, candidate in enumerate(ranked_docs, start=1)
            if candidate in relevant_docs
        ),
        None,
    )
    if first_hit_rank is None:
        return 0
    return 1 / first_hit_rank

def compute_ndcg(relevant_docs, ranked_docs):
    """Normalised discounted cumulative gain with binary relevance.

    Args:
        relevant_docs: Iterable of hashable ids judged relevant for the query.
        ranked_docs: Sized iterable of retrieved ids, best first.

    Returns:
        DCG of ``ranked_docs`` divided by the DCG of the best ranking possible at
        this cut-off; ``0`` when nothing relevant was retrieved (or either input
        is empty).
    """
    def dcg(gains):
        return sum(gain / np.log2(position + 2) for position, gain in enumerate(gains))

    relevant = set(relevant_docs)

    # Each relevant id earns gain once, so a repeated id cannot inflate the score.
    credited = set()
    gains = []
    for doc in ranked_docs:
        if doc in relevant and doc not in credited:
            credited.add(doc)
            gains.append(1)
        else:
            gains.append(0)

    if not credited:
        return 0

    # The ideal ranking puts every relevant document first, limited by the list
    # length. Normalising by the retrieved hits alone would score any hits-first
    # ranking as 1.0 regardless of how many relevant documents were missed.
    ideal_gains = [1] * min(len(relevant), len(gains))
    return dcg(gains) / dcg(ideal_gains)

# ---------------------------
# Run evaluation
# ---------------------------
# NOTE(review): this cell calls chat_with_rag(), which is defined in a later cell
# of this notebook. On a fresh kernel that cell has to be executed first; consider
# moving the definition above this cell so "Restart & Run All" works.
results = []

# Map each document text to its id once instead of scanning the DataFrame for
# every retrieved document. keep="first" mirrors the earlier `.values[0]` lookup,
# which returned the id of the first row with matching content.
content_to_id = (
    df.drop_duplicates(subset="content", keep="first")
    .set_index("content")["id"]
    .to_dict()
)

for _, row in eval_df.iterrows():
    query = row["query"]
    # Parse the literal safely: ast.literal_eval accepts only Python literals,
    # whereas eval() would execute arbitrary code stored in the CSV.
    relevant_ids = ast.literal_eval(row["relevant_ids"])  # "[4]" -> [4]

    # Run the RAG pipeline; only the retrieved documents are scored here.
    answer, retrieved_docs = chat_with_rag(query)

    # Convert retrieved docs -> doc IDs
    retrieved_ids = [content_to_id[doc] for doc in retrieved_docs]

    # Compute metrics
    precision = compute_precision(relevant_ids, retrieved_ids)
    recall = compute_recall(relevant_ids, retrieved_ids)
    mrr = compute_mrr(relevant_ids, retrieved_ids)
    ndcg = compute_ndcg(relevant_ids, retrieved_ids)

    results.append({
        "Query": query,
        "Relevant IDs": relevant_ids,
        "Retrieved IDs": retrieved_ids,
        "Precision@5": precision,
        "Recall@5": recall,
        "MRR": mrr,
        "nDCG": ndcg
    })

# Display results
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Query,Relevant IDs,Retrieved IDs,Precision@5,Recall@5,MRR,nDCG
0,What is BM25?,[4],"[999, 412, 997, 820, 800]",0.0,0.0,0.0,0.0
1,What is semantic search?,[5],"[380, 369, 378, 464, 36]",0.0,0.0,0.0,0.0
2,Explain embedding techniques.,[1],"[1, 17, 18, 451, 303]",0.2,1.0,1.0,1.0
3,What is information retrieval?,[3],"[914, 673, 838, 585, 724]",0.0,0.0,0.0,0.0
4,What is query understanding?,[2],"[972, 837, 732, 995, 174]",0.0,0.0,0.0,0.0


In [81]:
# ---------------------------
# 6. RAG pipeline (retrieve, re-rank, generate)
# ---------------------------
def chat_with_rag(query, top_k=5):
    """Answer ``query`` using hybrid retrieval, cross-encoder re-ranking and an LLM.

    Steps:
      1. Fetch ``top_k * 2`` candidates each from BM25 and from dense search.
      2. Merge them (BM25 first, duplicates dropped) and keep at most ``top_k * 3``.
      3. Score every candidate against the query with the cross-encoder and keep
         the ``top_k`` highest-scoring documents.
      4. Send those documents as context, together with the question, to the
         chat completion endpoint.

    Args:
        query: The user's question as a string.
        top_k: Number of documents to keep after re-ranking.

    Returns:
        A tuple ``(answer, top_docs)``: the generated answer text and the list of
        document texts used as context, best first.
    """
    candidates_per_retriever = top_k * 2

    # Sparse candidates. The query is split on single spaces, matching how the
    # corpus was tokenised for the BM25 index.
    sparse_docs = bm25.get_top_n(query.split(" "), corpus, n=candidates_per_retriever)

    # Dense candidates.
    query_vector = dense_model.encode(query, convert_to_tensor=True)
    search_results = util.semantic_search(
        query_vector, corpus_embeddings, top_k=candidates_per_retriever
    )
    dense_docs = [corpus[hit['corpus_id']] for hit in search_results[0]]

    # Order-preserving de-duplication, then cap the pool handed to the re-ranker.
    unique_docs = list(dict.fromkeys(sparse_docs + dense_docs))
    candidates = unique_docs[:top_k * 3]

    # Cross-encoder re-ranking: highest score first.
    query_doc_pairs = [[query, doc] for doc in candidates]
    relevance = cross_model.predict(query_doc_pairs)
    ranked = sorted(zip(relevance, candidates), reverse=True)
    top_docs = [doc for _, doc in ranked][:top_k]

    # Build the grounded prompt and generate the answer.
    context = "\n".join(top_docs)
    prompt = f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
    )

    return response.choices[0].message.content, top_docs

In [82]:
# ---------------------------
# 7. User Interaction
# ---------------------------
# NOTE: this loop blocks on input(), so it stops a non-interactive
# "Restart & Run All"; run it manually when chatting with the bot.
print("RAG Chatbot Ready! Type 'exit' to quit.")
while True:
    # Strip surrounding whitespace so " exit " still quits and padded queries
    # still match the evaluation set.
    user_query = input("You: ").strip()
    if user_query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break
    if not user_query:
        # Nothing to search for; avoid a pointless retrieval and model call.
        continue

    # Run RAG pipeline
    answer, retrieved_docs = chat_with_rag(user_query)

    # ---------------------------
    # Evaluation Section
    # ---------------------------
    # Convert retrieved docs back to their document IDs
    retrieved_ids = [
        df[df["content"] == doc]["id"].values[0]
        for doc in retrieved_docs
    ]

    # Check if this query exists in evaluation_set.csv
    matching_eval = eval_df[eval_df["query"] == user_query]

    if len(matching_eval) > 0:
        # Extract relevant IDs. ast.literal_eval accepts only Python literals,
        # whereas eval() would execute arbitrary code stored in the CSV.
        relevant_ids = ast.literal_eval(matching_eval["relevant_ids"].values[0])  # "[4]" -> [4]

        # Compute metrics
        precision = compute_precision(relevant_ids, retrieved_ids)
        recall = compute_recall(relevant_ids, retrieved_ids)
        mrr = compute_mrr(relevant_ids, retrieved_ids)
        ndcg = compute_ndcg(relevant_ids, retrieved_ids)

        print("\nEvaluation Metrics:")
        print("Precision:", precision)
        print("Recall:", recall)
        print("MRR:", mrr)
        print("nDCG:", ndcg)
    else:
        print("\n(No evaluation available for this query)")

    # ---------------------------
    # Chatbot Results
    # ---------------------------
    print("\nBot Answer:\n", answer)

    print("\nTop Retrieved Contexts:")
    for idx, doc in enumerate(retrieved_docs, 1):
        print(f"{idx}. {doc}")

    print("\n" + "-"*50 + "\n")


RAG Chatbot Ready! Type 'exit' to quit.

(No evaluation available for this query)

Bot Answer:
 Machine learning is a subset of artificial intelligence that involves the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. Instead of relying on hard-coded rules, machine learning systems learn from data to identify patterns and make decisions. This process typically involves training a model on a dataset, allowing it to adapt and improve over time. Machine learning can be applied to various fields, including information retrieval, where it helps in organizing, ranking, and retrieving information efficiently based on user queries.

Top Retrieved Contexts:
1. Information Retrieval Basics content example 710. This explains key concepts of LLM in IR.
2. Information Retrieval Basics content example 704. This explains key concepts of LLM in IR.
3. Information Retrieval Basics content example 427. This explains key concepts of L