In [None]:
from qdrant_client import QdrantClient
from qdrant_client import models
from openai import OpenAI
import json
import uuid
import random 

In [None]:
# Load the pre-summarized wiki sections that serve as the knowledge base.
# Presumably a list of dicts carrying "page_title", "section_title" and "text"
# (those are the keys the evaluation helpers read) — confirm against the JSON file.
texts = []

with open('../data/summarized_texts.json', mode='r', encoding="utf-8") as f2:
    # Read the whole file as text, then decode it as JSON.
    texts = json.loads(f2.read())

In [None]:
# Prompt template used to turn one wiki section into a single evaluation question.
# Fill it with str.format(page_title=..., section_title=..., text=...).
# The leading newline and the trailing "\n" escape plus newline are removed by .strip(),
# so the template starts at "You emulate" and ends right after the {text} placeholder.
question_generation_prompt = """
You emulate a player of the Stardew Valley game.
Here is the text from a wiki page of this game, along with the page and the section it was extracted from.
Formulate a question that can be answered using these text materials.
Only return the question. The questions should be complete and concise.
Page title: {page_title}
Section title: {section_title}
Text: {text}\n
""".strip()


In [None]:
# Single OpenAI client shared by this notebook; built with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm before running.
OpenAIclient = OpenAI()

def llm(prompt, model='gpt-5-nano'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the `content` of the one user message.
        model: Model name forwarded to the chat completions call.

    Returns:
        The `message.content` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = OpenAIclient.chat.completions.create(
        model=model,
        messages=[user_message],
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def question_generation(knowledge_base, sampleNum=10):
    """Build an evaluation set by asking the LLM for one question per sampled document.

    Args:
        knowledge_base: Sequence of dicts, each with "page_title",
            "section_title" and "text" keys.
        sampleNum: Number of documents to sample (without replacement). If it
            exceeds the size of `knowledge_base`, every document is used.

    Returns:
        A list of new dicts: a shallow copy of each sampled document plus a
        "question" key holding the generated question. The documents in
        `knowledge_base` are left untouched.

    Raises:
        ValueError: If `sampleNum` is negative (raised by `random.sample`).
    """
    # Clamp so a small knowledge base does not make random.sample raise.
    sample_size = min(sampleNum, len(knowledge_base))
    sampled_docs = random.sample(knowledge_base, sample_size)

    evaluation_questions = []
    for doc in sampled_docs:
        prompt = question_generation_prompt.format(
            page_title=doc["page_title"],
            section_title=doc["section_title"],
            text=doc["text"],
        ).strip()

        # Copy before annotating: writing "question" onto `doc` itself would
        # leak generated questions into the caller's knowledge base.
        datapoint = dict(doc)
        datapoint["question"] = llm(prompt)
        evaluation_questions.append(datapoint)

    return evaluation_questions



In [None]:
def compute_mrr_and_hitrate(results, k=5):
    """
    Compute Mean Reciprocal Rank and hit rate over the top-k retrieved documents.

    Args:
        results: sized collection of (ranked_docs, correct_doc_id) tuples,
                 e.g. [ (["(page1,sec1)", "(page2,sec2)", ...], "(page2,sec2)") , ... ]
        k: only the first k entries of each ranked_docs list are considered.

    Returns:
        (mrr, hit_rate). A query whose correct document is not in its top k
        contributes 0 to both. Returns (0.0, 0.0) when `results` is empty.
    """
    # Nothing to average: avoid dividing by zero below.
    if len(results) == 0:
        return 0.0, 0.0

    reciprocal_ranks = []
    hits = 0

    for ranked_docs, correct_doc in results:
        # 1-indexed position of the first match within the top k, or None.
        rank = next(
            (
                position
                for position, doc in enumerate(ranked_docs[:k], start=1)
                if doc == correct_doc
            ),
            None,
        )

        if rank is None:
            reciprocal_ranks.append(0.0)
        else:
            reciprocal_ranks.append(1.0 / rank)
            hits += 1

    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    hit_rate = hits / len(results)

    return mrr, hit_rate

In [None]:
def evaluate_search_function(search_function, knowledge_base, k=5, sampleNum=5):
    """Score one search function on a freshly generated question set.

    Args:
        search_function: Callable invoked as `search_function(query=...)`; it
            must return an iterable of objects whose `.payload` mapping has
            "page_title" and "section_title".
        knowledge_base: Documents handed to `question_generation`.
        k: Cutoff passed to `compute_mrr_and_hitrate`.
        sampleNum: Number of documents to sample for question generation.

    Returns:
        {"MRR": mrr, "HitRate": hit_rate}; both values are also printed.
    """
    def _doc_key(hit):
        # A document is identified by its (page title, section title) pair.
        return (hit.payload["page_title"], hit.payload["section_title"])

    results = []
    for datapoint in question_generation(knowledge_base, sampleNum):
        hits = search_function(query=datapoint["question"])
        expected = (datapoint["page_title"], datapoint["section_title"])
        ranked = [_doc_key(hit) for hit in hits]
        results.append((ranked, expected))

    mrr, hit_rate = compute_mrr_and_hitrate(results, k)
    for line in (f"MRR@{k}: {mrr:.3f}", f"HitRate@{k}: {hit_rate:.3f}"):
        print(line)

    return {"MRR": mrr, "HitRate": hit_rate}

In [None]:
def _search_function_name(search_function):
    """Return a display name for a callable, even when it has no `__name__`."""
    name = getattr(search_function, "__name__", None)
    if name is None:
        # functools.partial exposes the wrapped callable as `.func`; fall back
        # to the type name for other callable objects.
        wrapped = getattr(search_function, "func", None)
        name = getattr(wrapped, "__name__", None) or type(search_function).__name__
    return name


def evaluate_search_functions(search_functions, knowledge_base, k=5, sampleNum=5):
    """
    Evaluate multiple search functions on the same evaluation dataset.

    The question set is generated once and reused for every search function so
    their scores are directly comparable.

    Args:
        search_functions (list): list of functions or (name, function) tuples.
            Callables without a `__name__` (e.g. functools.partial objects)
            are named after the callable they wrap, or after their type.
        knowledge_base (list): list of documents with page_title, section_title, text.
        k (int): top-k results to consider.
        sampleNum (int): number of documents to sample for evaluation.

    Returns:
        dict mapping each name to {"MRR": mrr, "HitRate": hit_rate}. If a name
        occurs more than once, later entries get a " #2", " #3", ... suffix so
        no result is overwritten.
    """
    evaluation_dataset = question_generation(knowledge_base, sampleNum)
    all_results = {}

    for item in search_functions:
        # Handle both function and (name, function) tuple
        if isinstance(item, tuple):
            name, search_function = item
        else:
            search_function = item
            name = _search_function_name(item)

        # Two lambdas or two partials of the same function share a name;
        # disambiguate instead of silently replacing the earlier result.
        if name in all_results:
            suffix = 2
            while f"{name} #{suffix}" in all_results:
                suffix += 1
            name = f"{name} #{suffix}"

        print(f"\nEvaluating: {name}")
        results = []

        for dp in evaluation_dataset:
            query = dp["question"]
            correct_doc = (dp["page_title"], dp["section_title"])

            search_results = search_function(query=query)

            # A document is identified by its (page title, section title) pair.
            retrieved_ids = [
                (doc.payload["page_title"], doc.payload["section_title"])
                for doc in search_results
            ]

            results.append((retrieved_ids, correct_doc))

        mrr, hit_rate = compute_mrr_and_hitrate(results, k)
        print(f"{name} → MRR@{k}: {mrr:.3f}, HitRate@{k}: {hit_rate:.3f}")

        all_results[name] = {"MRR": mrr, "HitRate": hit_rate}

    return all_results


In [3]:
import sys
sys.path.append('../scripts')
from RAG_pipeline import multi_stage_search, rrf_search

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Judge retrieval quality with an LLM.
# '../scripts' is appended to sys.path here so that llm_judge can be imported from this cell.
import sys
sys.path.append('../scripts')
from llm_judge import LLMJudge

# Build the judge on the shared OpenAI client, passing model='gpt-4o-mini'
# (presumably the model that does the judging — confirm in llm_judge).
judge = LLMJudge(OpenAIclient, model='gpt-4o-mini')

# Run the judge against multi_stage_search with `texts` as the knowledge base.
# NOTE(review): sample_num=5 presumably limits how many documents are sampled — confirm in LLMJudge.
results = judge.evaluate_retrieval_with_judge(
    search_function=multi_stage_search,
    knowledge_base=texts,
    sample_num=5
)

# The returned mapping is read by its 'average_scores' key.
print(f"Average Scores: {results['average_scores']}")

# LLM Evaluation

In [8]:
from llm_eval import llm_eval

# Test different models.
# The list is named `candidate_models` rather than `models` so it does not
# overwrite the `models` module imported from qdrant_client at the top of the
# notebook; rebinding that name would break any cell that is run afterwards
# and expects the qdrant module.
candidate_models = ["gpt-4o", "gpt-4o-mini", "gpt-5-mini"]
query = "How do I get iridium ore in Stardew Valley?"

# Ask every candidate model the same question; the result is read by its
# "evaluation" key.
result = llm_eval(candidate_models, query)
print(result["evaluation"])  # Judge's evaluation

1) Best answer: gpt-5-mini.  
Why: It’s concise, accurate, and adds a useful gameplay detail (how many ore to smelt an Iridium Bar). It covers the main sources and the Skull Cavern depth tip without unnecessary repetition.

2) Brief scores (1-10):
- gpt-4o: 8 — Accurate and well-structured; slightly more verbose but complete.
- gpt-4o-mini: 7 — Similar content to gpt-4o but less polished/concise.
- gpt-5-mini: 9 — Concise, accurate, and includes the practical smelting note.

3) Notable differences:
- gpt-5-mini is the most concise and provides the extra smelting recipe (5 Iridium Ore + 1 Coal → Iridium Bar).
- gpt-4o and gpt-4o-mini are very similar to each other; gpt-4o is more polished in formatting.
- All three list the same primary sources (Iridium Nodes in Skull Cavern/Volcano Dungeon, geodes, monster drops, fishing chests, panning, fish pond, Statue of Perfection).
