In [1]:
from utils import load_corpus, load_queries_and_answers
from retriever import BM25Retriever
from llm_wrapper import LLMWrapper

In [2]:
def construct_cot_step_prompt(original_question: str,
                              accumulated_context: str,
                              cot_so_far: list[str],
                              max_context_tokens: int = 32000,  # Approximate token limit for context
                              ) -> str:
    """
    Constructs the prompt for the LLM to generate the next CoT step.

    Args:
        original_question (str): The initial multi-hop question.
        accumulated_context (str): Text from all retrieved passages so far.
        cot_so_far (list[str]): A list of CoT sentences generated in previous steps.
        max_context_tokens (int): An approximate limit for the context part of the prompt
                                  to avoid exceeding LLM input limits. Enforced as a character
                                  budget (3 characters per token), not by real tokenization.

    Returns:
        str: The formatted prompt string. When the accumulated context exceeds the
             character budget it is cut to that budget and a "[Context Truncated]"
             marker is appended (a warning is printed as well).
    """

    # Context Management (Simple version: truncate if too long)
    reserved_chars_for_other_parts = (
        len(original_question)
        + len("\n".join(cot_so_far))
        + 500  # For instructions and formatting
    )

    # Max chars for context based on approximate token limit (3 chars/token rough estimate).
    # Clamp at zero: a negative budget would be used as a negative slice bound below,
    # which keeps almost the whole context instead of dropping it.
    max_chars_for_context_in_prompt = max(
        0, (max_context_tokens * 3) - reserved_chars_for_other_parts
    )

    if len(accumulated_context) > max_chars_for_context_in_prompt:
        print(f"Warning: Truncating accumulated context from {len(accumulated_context)} to {max_chars_for_context_in_prompt} chars for CoT prompt.")
        context_to_include = accumulated_context[:max_chars_for_context_in_prompt] + "...\n[Context Truncated]"
    else:
        context_to_include = accumulated_context

    # Constructing the Chain of Thought so far
    if cot_so_far:
        reasoning_history = "Previous reasoning steps:\n" + "\n".join(f"- {step}" for step in cot_so_far)
    else:
        reasoning_history = "This is the first reasoning step."

    # Prompt Template: It needs to be iterative. At each step, we're asking for the *next* thought.
    # NOTE: the template text (including its leading indentation) is sent to the model as-is.

    prompt = f"""You are a helpful assistant performing multi-step reasoning to answer a complex question.
                You will be given a question, some retrieved context, and any previous reasoning steps.
                Your task is to generate the *single next logical reasoning step* based on the available information to help answer the question.
                Focus on what information is still missing or what connection needs to be made.

                If you believe you have enough information from the context and previous reasoning to directly and confidently answer the original question,
                your response should start *ONLY* with the phrase "The answer is: " followed by the answer. Do not add any other prefix.

                Otherwise, provide only the single next reasoning sentence that helps progress towards the answer. Do not try to answer the question prematurely if you are still reasoning.

                Original Question: {original_question}

                Retrieved Context:
                ---
                {context_to_include}
                ---

                {reasoning_history}

                Based on the original question, the retrieved context, and the previous reasoning steps, what is the single next reasoning step or the final answer (if known)?
                Tip: The reasoning is a single sentences that tell you expected to be the missing information to answer the question.
                """

    return prompt


In [3]:
def construct_final_answer_prompt(original_question: str,
                                  full_accumulated_context: str,
                                  full_chain_of_thought: list[str],
                                  max_context_tokens_for_final_answer: int = 35000,  # Approx token limit for context
                                  ) -> str:
    """
    Constructs the prompt for the LLM to generate the final answer
    based on all gathered information (Direct Prompting style).

    Args:
        original_question (str): The initial multi-hop question.
        full_accumulated_context (str): Text from all retrieved passages from all hops.
        full_chain_of_thought (list[str]): All CoT sentences generated during the IR-CoT loop.
        max_context_tokens_for_final_answer (int): Approximate token limit for the context
                                                   and CoT part of the prompt. Enforced as a
                                                   character budget (3 characters per token).

    Returns:
        str: The formatted prompt string. When the combined reasoning + context text
             exceeds the character budget it is cut to that budget and an
             "[Evidence Truncated]" marker is appended (a warning is printed as well).
    """

    # Context and CoT Management (Simple version: truncate if too long)
    reasoning_section = "Reasoning Steps Taken:\n" + "\n".join(f"- {step}" for step in full_chain_of_thought)
    context_section = "\n\nSupporting Retrieved Context:\n---\n" + full_accumulated_context + "\n---"
    combined_evidence_text = reasoning_section + context_section

    # Rough character-based truncation
    reserved_chars_for_other_parts_final = len(original_question) + 300  # For instructions

    # Clamp at zero: a negative budget would be used as a negative slice bound below,
    # which keeps almost all of the evidence instead of dropping it.
    max_chars_for_evidence_in_prompt = max(
        0, (max_context_tokens_for_final_answer * 3) - reserved_chars_for_other_parts_final
    )

    if len(combined_evidence_text) > max_chars_for_evidence_in_prompt:
        print(f"Warning: Truncating combined evidence from {len(combined_evidence_text)} to {max_chars_for_evidence_in_prompt} chars for final answer prompt.")
        evidence_to_include = combined_evidence_text[:max_chars_for_evidence_in_prompt] + "...\n[Evidence Truncated]"
    else:
        evidence_to_include = combined_evidence_text

    # Prompt Template (Direct Answer Style)
    # NOTE: the template text (including its leading indentation) is sent to the model as-is.
    prompt = f"""You are an intelligent assistant. Based on the following reasoning steps and supporting context,
                please provide a concise and direct answer to the original question.

                Original Question: {original_question}

                {evidence_to_include}

                Based on all the information above, what is the final answer to the original question?
                DIRECT ANSWER ONLY NO EXPLANATION('Yes','No', Name, Place, Number, etc.)
                The Final Answer for {original_question} is: """

    return prompt

In [4]:
def _format_passage(doc: dict) -> str:
    """Render one retrieved document as a passage block for the accumulated context."""
    return f"Title: {doc['title']}\nPassage: {doc['text']}\n<endofpassage>\n"


def _format_score(score) -> str:
    """Format a retrieval score for logging, tolerating non-numeric placeholders."""
    try:
        return f"{score:.4f}"
    except (TypeError, ValueError):
        # e.g. the 'N/A' placeholder used when a document carries no 'score' key
        return str(score)


def answer_question_with_ircot(
    original_question: str,
    retriever: BM25Retriever, # Type hint for clarity, replace with your retriever class
    llm_wrapper: LLMWrapper, # Type hint for clarity
    max_hops: int = 3,
    k_retrieve_per_hop: int = 2, # Number of documents to retrieve per hop
    max_cot_step_tokens: int = 75, # Max tokens for LLM generating a CoT step
    max_final_answer_tokens: int = 100, # Max tokens for LLM generating final answer
    verbose: bool = True,
    confident: bool = True,
    ground_truth: str = None  # Add ground truth parameter for comparison
) -> tuple[str, int, list[str], str]:
    """
    Answers a question using the Interleaving Retrieval and Chain-of-Thought (IR-CoT) process.

    The flow is: one initial retrieval with the original question, then up to
    ``max_hops`` rounds of (generate one CoT step -> retrieve with that step),
    and finally a separate direct-answer prompt over everything gathered.
    The returned answer always comes from that final prompt; an answer extracted
    from a "The answer is:" CoT step is only logged and used to stop the loop.

    Args:
        original_question (str): The complex multi-hop question.
        retriever (BM25Retriever): An initialized retriever object. Its ``search(query, k=...)``
            results are read as dicts with 'title' and 'text' keys and an optional 'score'.
        llm_wrapper (LLMWrapper): An initialized LLM wrapper exposing
            ``generate(prompt, max_new_tokens=..., temperature=...)``.
        max_hops (int): Maximum number of reason-retrieve iterations.
        k_retrieve_per_hop (int): Number of documents to retrieve at each hop.
        max_cot_step_tokens (int): Max new tokens for LLM when generating a CoT step.
        max_final_answer_tokens (int): Max new tokens for LLM when generating the final answer.
        verbose (bool): If True, prints detailed logs of the process.
        confident (bool): If True, stops the loop as soon as a CoT step starts with
            "The answer is:". If False, such a step only skips retrieval for that hop.
        ground_truth (str): The correct answer for comparison (optional, logging only).

    Returns:
        tuple: (final_answer_text, num_hops_taken, full_chain_of_thought_list, full_accumulated_context_str)
    """
    answer_prefix = "the answer is:"

    if verbose:
        print(f"{'='*60}")
        print(f"{'STARTING IR-COT PROCESS':^60}")
        print(f"{'='*60}")
        print(f"Original Question: {original_question}")
        if ground_truth:
            print(f"Ground Truth Answer: {ground_truth}")
        print(f"{'='*60}")

    accumulated_context_str = ""
    full_chain_of_thought_list = []
    num_hops_taken = 0

    # Initial Retrieval based on the original question
    if verbose:
        print(f"--- Initial Retrieval (Hop 0) ---")
        print(f"Query: '{original_question}'")

    current_query_for_retrieval = original_question
    initial_retrieved_docs = retriever.search(current_query_for_retrieval, k=k_retrieve_per_hop)

    if initial_retrieved_docs:
        if verbose:
            print(f"Retrieved {len(initial_retrieved_docs)} initial documents:")
            for i, doc in enumerate(initial_retrieved_docs, 1):
                print(f"   {i}. '{doc['title']}' (Score: {_format_score(doc.get('score', 'N/A'))})")

        accumulated_context_str += "".join(_format_passage(doc) for doc in initial_retrieved_docs)
    elif verbose:
        print(f"No initial documents retrieved for query: '{current_query_for_retrieval}'\n")

    # Iterative Reason-Retrieve Loop
    for hop_num in range(max_hops):
        num_hops_taken = hop_num + 1
        if verbose:
            print(f"--- IR-CoT Hop {num_hops_taken}/{max_hops} ---")

        # Construct CoT Step Prompt
        cot_prompt = construct_cot_step_prompt(
            original_question=original_question,
            accumulated_context=accumulated_context_str,
            cot_so_far=full_chain_of_thought_list
        )

        # Generate Next CoT Step
        if verbose:
            print(f"Generating CoT step {num_hops_taken}...")
        next_cot_sentence = llm_wrapper.generate(
            cot_prompt,
            max_new_tokens=max_cot_step_tokens,
            temperature=0.5 # Lower temperature for more factual CoT steps
        )
        if verbose:
            print(f"CoT Step {num_hops_taken} Output: {next_cot_sentence}")

        full_chain_of_thought_list.append(next_cot_sentence)

        # Check for Termination. Strip first so leading whitespace/newlines in the
        # model output do not hide the answer marker.
        stripped_step = next_cot_sentence.strip()
        if stripped_step.lower().startswith(answer_prefix):
            if verbose:
                extracted_answer = stripped_step[len(answer_prefix):].strip()
                print(f"LLM indicated final answer within CoT step {num_hops_taken}.")
                print(f"Extracted Answer: {extracted_answer}")
            if confident:
                break
            # Not stopping early: don't retrieve based on an answer, just move on
            # to the next hop (or fall out of the loop if this was the last one).
            if verbose:
                print("CoT step was an answer, skipping retrieval for this hop.")
            continue

        # Update Query for Next Retrieval (use the last CoT sentence)
        current_query_for_retrieval = next_cot_sentence

        if verbose:
            print(f"Retrieving based on CoT step: '{current_query_for_retrieval[:100]}{'...' if len(current_query_for_retrieval) > 100 else ''}'")

        # Retrieve New Documents
        newly_retrieved_docs = retriever.search(current_query_for_retrieval, k=k_retrieve_per_hop)

        # Accumulate Context
        if newly_retrieved_docs:
            new_docs_info = []
            for doc in newly_retrieved_docs:
                # Avoid re-adding identical content if title and passage match exactly.
                doc_full_text = _format_passage(doc)
                if doc_full_text not in accumulated_context_str:  # Basic check to avoid exact duplicates
                    accumulated_context_str += doc_full_text
                    new_docs_info.append((doc['title'], doc.get('score', 'N/A'), doc['text'][:150]))

            if verbose:
                if new_docs_info:
                    print(f"Retrieved and added {len(new_docs_info)} new documents for hop {num_hops_taken}:")
                    for i, (title, score, preview) in enumerate(new_docs_info, 1):
                        print(f"   {i}. '{title}' (Score: {_format_score(score)})")
                        print(f"       Preview: {preview}...")
                else:
                    print(f"Retrieved {len(newly_retrieved_docs)} documents, but they might be duplicates or empty.")
        elif verbose:
            print(f"No new documents retrieved for hop {num_hops_taken}")

        # TODO: Break if no new context and CoT is not progressing.

    # Final Answer Generation
    if verbose:
        print(f"--- Generating Final Answer (after {num_hops_taken} hops) ---")

    final_answer_prompt_str = construct_final_answer_prompt(
        original_question=original_question,
        full_accumulated_context=accumulated_context_str,
        full_chain_of_thought=full_chain_of_thought_list
    )

    final_answer_text = llm_wrapper.generate(
        final_answer_prompt_str,
        max_new_tokens=max_final_answer_tokens,
        temperature=0.3
    )

    if verbose:
        print(f"\n{'='*60}")
        print(f"Final Generated Answer: {final_answer_text}")
        if ground_truth:
            print(f"Ground Truth Answer: {ground_truth}")
            print(f"Match: {'Yes' if final_answer_text.strip().lower() == ground_truth.strip().lower() else 'No'}")
        print(f"Total Hops Used: {num_hops_taken}")
        print(f"CoT Steps Generated: {len(full_chain_of_thought_list)}")
        print(f"{'='*60}")
        print(f"IR-COT PROCESS FINISHED")
        print(f"{'='*60}\n")

    return final_answer_text, num_hops_taken, full_chain_of_thought_list, accumulated_context_str

In [5]:
import os

print("Loading data...")
# Build the paths with os.path.join so the notebook also runs on non-Windows hosts;
# on Windows this yields the same "data\\..." strings as before.
CORPUS_FILEPATH = os.path.join("data", "multihoprag_corpus.txt")
QA_FILEPATH = os.path.join("data", "MultiHopRAG.json")

corpus_documents = load_corpus(CORPUS_FILEPATH)
qa_dataset = load_queries_and_answers(QA_FILEPATH)

print(f"Loaded {len(corpus_documents)} documents and {len(qa_dataset)} questions.")

# Only preview a document when the corpus is non-empty (avoids IndexError on [0]).
if corpus_documents:
    print("\nFirst document from corpus:")
    print(corpus_documents[0])


Loading data...
Loaded 609 documents from corpus.
Loaded 2556 query-answer pairs.
Loaded 609 documents and 2556 questions.

First document from corpus:
{'id': 0, 'title': "200+ of the best deals from Amazon's Cyber Monday sale", 'passage': 'Table of Contents Table of Contents Echo, Fire TV, and Kindle deals Apple deals TV deals Laptop deals Headphone and earbud deals Tablet deals Gaming deals Speaker deals Vacuum deals Kitchen deals Smart home deals Fitness deals Beauty tech deals Drone deals Camera deals Lego deals Gift card deals\n\nUPDATE: Nov. 27, 2023, 5:00 a.m. EST This post has been updated with all of the latest Cyber Monday deals available at Amazon.\n\nAmazon is dragging out the year\'s biggest shopping holiday(s) into 11 days of deals.\n\nThe retail giant began its Black Friday sale in the early morning of Friday, Nov. 17 (a week ahead of schedule) and was on top of making the switch to Cyber Monday language in the wee hours of Saturday, Nov. 25. Official Cyber Monday mode, 

In [6]:
# Build the BM25 retriever over the corpus loaded in the previous cell.
print("\nInitializing retriever...")
bm25_retriever = BM25Retriever(corpus_documents)


Initializing retriever...


In [7]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
# (The previous bare os.getenv("OPENROUTER_API_KEY") call discarded its result
# and had no effect, so it was dropped.)
load_dotenv()

# Local Ollama-served model used for the IR-CoT demo below.
llm = LLMWrapper(
    model_identifier="gemma3:4b-it-qat",
    llm_type="ollama"
)

Connected to Ollama. Available models: ['phi4:14b', 'llama3.2:3b', 'gemma3:4b-it-qat', 'qwen3:8b']
LLMWrapper initialized for model: gemma3:4b-it-qat (type: ollama)


In [16]:
import random

# Sample uniformly over the whole dataset instead of a hard-coded 0..1000 range,
# which raised IndexError on smaller datasets and never reached later entries.
index = random.randrange(len(qa_dataset))

sample = qa_dataset[index]
question = sample['query']
answer = sample['answer']

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Who is the individual associated with both the failed crypto exchange FTX and Alameda Research, alleged to have used deceitful practices for personal gain and influence, and is facing charges of fraud and conspiracy according to articles from Fortune and TechCrunch?
Answer: Sam Bankman-Fried


In [17]:
# Run IR-CoT
predicted_answer, hops, cot_chain, retrieved_ctx = answer_question_with_ircot(
    original_question=question,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    max_hops=10,
    max_cot_step_tokens=1000,
    max_final_answer_tokens=1000,
    k_retrieve_per_hop=2,
    verbose=True,
    confident=True
)

                  STARTING IR-COT PROCESS                   
Original Question: Who is the individual associated with both the failed crypto exchange FTX and Alameda Research, alleged to have used deceitful practices for personal gain and influence, and is facing charges of fraud and conspiracy according to articles from Fortune and TechCrunch?
--- Initial Retrieval (Hop 0) ---
Query: 'Who is the individual associated with both the failed crypto exchange FTX and Alameda Research, alleged to have used deceitful practices for personal gain and influence, and is facing charges of fraud and conspiracy according to articles from Fortune and TechCrunch?'
Retrieved 2 initial documents:
   1. 'The FTX trial is bigger than Sam Bankman-Fried' (Score: 117.0476)
   2. 'SBF’s trial starts soon, but how did he — and FTX — get here?' (Score: 116.0725)
--- IR-CoT Hop 1/10 ---
Generating CoT step 1...
CoT Step 1 Output: The answer is: Based on the provided context, Sam Bankman-Fried’s alleged deceitful

### Evaluation

In [1]:
from metric import calculate_f1_score
from run_pipelines import run_qa_system
from utils import load_corpus, load_queries_and_answers
from retriever import BM25Retriever
from llm_wrapper import LLMWrapper
from run_evaluation import run_comprehensive_evaluation

In [None]:
import os

# Build the paths with os.path.join so the notebook also runs on non-Windows hosts;
# on Windows this yields the same "data\\..." strings as before.
CORPUS_FILEPATH = os.path.join("data", "multihoprag_corpus.txt")
QA_FILEPATH = os.path.join("data", "MultiHopRAG.json")

# Sanity check: show the first and last corpus entries when anything was loaded.
corpus_docs = load_corpus(CORPUS_FILEPATH)
if corpus_docs:
    print(f"\nFirst document from corpus: {corpus_docs[0]}")
    print(f"Last document from corpus: {corpus_docs[-1]}")

In [None]:
# Load the corpus and the question/answer pairs used by the evaluation runs below.
# NOTE(review): the corpus was already loaded as `corpus_docs` in the preceding cell,
# so this reads the same file a second time.
print("Loading data...")
corpus_documents = load_corpus(CORPUS_FILEPATH)
qa_dataset = load_queries_and_answers(QA_FILEPATH)

In [None]:
# Build the BM25 retriever shared by all evaluation runs below.
print("\nInitializing retriever...")
bm25_retriever = BM25Retriever(corpus_documents)

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
# (The previous bare os.getenv("OPENROUTER_API_KEY") call discarded its result
# and had no effect, so it was dropped.)
load_dotenv()

# First model under evaluation, served through Ollama.
llm = LLMWrapper(
    model_identifier="gemma3:4b-it-qat",
    llm_type="ollama"
)

In [None]:
# Evaluate the current `llm` with run_mode="multi-hop-ircot" and print the results.
# `summary` is returned but not used in this cell.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="multi-hop-ircot"
)

print("MULTI HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="single-hop-rag" and print the results.
# Overwrites `results` / `summary` from the previous cell; `summary` is not used here.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="single-hop-rag"
)

print("SINGLE HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="llm-only" and print the results.
# The retriever and retrieval arguments are still passed; whether this mode uses
# them is decided inside run_comprehensive_evaluation (not visible here).
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="llm-only"
)

print("LLM ONLY: ")
print(results)


In [None]:
# Rebind `llm` to the second model under evaluation; the evaluation cells that
# follow pick it up through the shared `llm` name when run in order.
llm = LLMWrapper(
    model_identifier="qwen3:8b",
    llm_type="ollama"
)

In [None]:
# Evaluate the current `llm` with run_mode="multi-hop-ircot" and print the results.
# `summary` is returned but not used in this cell.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="multi-hop-ircot"
)

print("MULTI HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="single-hop-rag" and print the results.
# Overwrites `results` / `summary` from the previous cell; `summary` is not used here.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="single-hop-rag"
)

print("SINGLE HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="llm-only" and print the results.
# The retriever and retrieval arguments are still passed; whether this mode uses
# them is decided inside run_comprehensive_evaluation (not visible here).
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="llm-only"
)

print("LLM ONLY: ")
print(results)


In [None]:
# Rebind `llm` to the third model under evaluation; the evaluation cells that
# follow pick it up through the shared `llm` name when run in order.
llm = LLMWrapper(
    model_identifier="llama3.2:3b",
    llm_type="ollama"
)

In [None]:
# Evaluate the current `llm` with run_mode="multi-hop-ircot" and print the results.
# `summary` is returned but not used in this cell.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="multi-hop-ircot"
)

print("MULTI HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="single-hop-rag" and print the results.
# Overwrites `results` / `summary` from the previous cell; `summary` is not used here.
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="single-hop-rag"
)

print("SINGLE HOP: ")
print(results)

In [None]:
# Evaluate the current `llm` with run_mode="llm-only" and print the results.
# The retriever and retrieval arguments are still passed; whether this mode uses
# them is decided inside run_comprehensive_evaluation (not visible here).
results, summary = run_comprehensive_evaluation(
    qa_dataset=qa_dataset,
    retriever=bm25_retriever,
    llm_wrapper=llm,
    num_samples=200,
    max_ircot_hops=10,
    k_retrieve_multi_hop=2,
    k_retrieve_single_hop=2,
    verbose=True,
    verbose_level=0,
    run_mode="llm-only"
)

print("LLM ONLY: ")
print(results)
