## Imports

In [None]:
import sys
sys.path.append('./app')
from pydantic import BaseModel, Field 
from llm import async_response_openai, async_embed_text, GenText
from ranker import retrieve_top_k
from typing import List
import pandas as pd
import random


## Definitions for Eval Question Generation

In [None]:
# Schema for one multiple-choice question. It is reached through
# QuestionMultipleChoiceList, which is passed as the structured `response_model`
# when the questions are extracted with the LLM.
# NOTE: documented with `#` comments on purpose; docstrings are left off so the
# model definitions themselves stay untouched.
class QuestionMultipleChoice(BaseModel):
    # Question text.
    question: str = Field(description='The question')
    # Answer options; the extraction loop joins them with ";" into one string.
    answers: List[str] = Field(description='The multiple choice answers')
    # The option marked as correct.
    correct_answer: str = Field(description='The correct answer')

# Wrapper so that a single LLM response can carry every question found in a text.
class QuestionMultipleChoiceList(BaseModel):
    question_list: List[QuestionMultipleChoice]



In [None]:
import os
import re

root_dir = "book_eval"   # top-level book directory


def _natural_sort_key(name: str) -> tuple:
    """Return a sort key that orders digit runs numerically ("ch2" before "ch10")."""
    # re.split with a capturing group alternates text / digits, so the parts at
    # odd positions are always digit runs and parts at equal positions always
    # share a type (safe to compare). The raw name is a deterministic tiebreak.
    parts = re.split(r'(\d+)', name)
    return [int(p) if i % 2 else p for i, p in enumerate(parts)], name


# os.listdir returns entries in arbitrary order. The extraction loop derives the
# chapter number from each text's position in text_list, so the names are sorted
# to keep that numbering stable across runs and machines.
files = sorted(os.listdir(root_dir), key=_natural_sort_key)

# One entry per file: the file's text split into sections on the '=====' marker.
text_list = []
for file in files:
    filepath = os.path.join(root_dir, file)
    if not os.path.isfile(filepath):
        # Skip sub-directories instead of crashing on open().
        continue
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()
    text_list.append(text.split('====='))

## Extract multiple choice questions and answers

In [None]:
class Prompts:
    """Prompt builders for the question-extraction step."""

    @classmethod
    def multiple_choice(cls, text: str):
        """Build the prompt that asks the model to structure raw question text.

        Args:
            text: Raw questions-and-answers text, inserted verbatim into the prompt.

        Returns:
            The complete prompt string.
        """
        return f""" 
            You help me organize the multiple questions from a text into a json
            with fields: question, answers, correct_answer.
            Always keep the letter of the answer along with the answer text.
            Always keep the original, provided text unchanged.
            This are the questions and answers:
            {text}
            """

In [None]:
# Sanity check: number of entries in text_list (one is appended per loaded file).
len(text_list)

In [None]:
# Process results
results = []
for i, text in enumerate(text_list, start=1):
    # print(i)
    # print(text[0])
    result = await async_response_openai(
        user_prompt=Prompts.multiple_choice(text=text[0]),
        model='gpt-4.1',
        response_model=QuestionMultipleChoiceList
        )
    
    for model in result.question_list:
        result_dict: dict = model.__dict__
        result_dict.update(chapter = i)
        result_dict.update(answers = ";".join(result_dict['answers']))
        results.append(result_dict)
    

In [None]:
# Persist the extracted questions (one row per question) so the evaluation
# cells can reload them from disk without repeating the extraction calls.
df = pd.DataFrame(results)
df.to_parquet("book_eval_mc.parquet", index=False)

## Evaluation RAG (Multiple Choice)

In [None]:
# Run configuration; these values are also written into every result row and
# into the name of the results file.
RETRIEVAL_TOP_K = 1  # passed as top_k to retrieve_top_k
LLM = 'gpt-5.1'  # model used for the judge call
RAG_PARTITION = "book_partition_full"  # parquet file stem under app/db/
RANDOM_SEED = 42  # seed for the randomly drawn student answers

# Load rag partition
df_rag = pd.read_parquet(f'app/db/{RAG_PARTITION}.parquet')

In [None]:
# Generate random answers for evaluation
df = pd.read_parquet("book_eval_mc.parquet")
# Add the (still empty) result columns plus the run configuration to every row.
df_eval = df.assign(
    student_answer = None,
    is_correct_llm = None,
    citations = None,
    top_k = RETRIEVAL_TOP_K,
    llm = LLM,
    random_seed = RANDOM_SEED,
    rag_partition=RAG_PARTITION
    )
eval_dicts = df_eval.to_dict(orient='records')
random.seed(RANDOM_SEED)  # fixed seed -> reproducible result

random_answers_indexes = []
eval_dicts_with_dummy_answers = []
for eval_dict in eval_dicts:
    answers = eval_dict['answers'].split(";")
    # Draw from the options this question actually has instead of assuming
    # exactly four: avoids an IndexError for shorter lists and lets every
    # option of longer lists be picked. For four options this is the same
    # randint(0, 3) call as before, so existing seeded runs are unchanged.
    index = random.randint(0, len(answers) - 1)
    random_answers_indexes.append(index)

    d = eval_dict.copy()
    d.update(student_answer = answers[index])
    eval_dicts_with_dummy_answers.append(d)


In [None]:
from typing import Tuple

class Prompts:
    """Prompt builders for the RAG judge.

    Defining this class rebinds the name ``Prompts``; the earlier class with
    ``multiple_choice`` is no longer reachable under that name afterwards.
    """

    @classmethod
    async def feedback(
        cls,
        question: str,
        answers: str,
        student_answer: str,
        retrieved_text: str,
    ) -> Tuple[str, str]:
        """Build the system and user prompts for judging one student answer.

        The coroutine performs no awaits itself; it only formats strings.

        Args:
            question: Question text shown to the judge.
            answers: The multiple-choice options, inserted verbatim.
            student_answer: The answer to be judged.
            retrieved_text: Retrieved passage(s) the judge must rely on.

        Returns:
            A ``(system_prompt, user_prompt)`` tuple.
        """
        system_message = """
        You are an expert anatomy tutor providing feedback to a medical student.
        Your feedback must be grounded ONLY in the retrieved anatomy text provided.
        
        RULES:
        - Do NOT add information not present in the retrieved text.
        - If the retrieved text does not contain the answer, say so.
        - Provide feedback that is accurate, concise, and educational.
        - Highlight what is correct, what is incorrect, and provide the correct info (only if found in the text).
        - Use a supportive and encouraging tone.
        - Do NOT mention the rules to the student.
        """

        user_message = f"""
        Student Question:
        {question}

        Multiple Choice Answers:
        {answers}

        Student Answer:
        {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        Using ONLY the retrieved text, judge whether the student answer is correct

        **Result**
        Return "yes" if it is correct, "no" if it is not correct 

        If the retrieved text does not include enough information to evaluate the student's answer, respond with:
        "missing info"
        """

        return system_message, user_message

In [None]:
eval_dicts_judge = []
for eval_dict in eval_dicts_with_dummy_answers:
    d: dict = eval_dict.copy()
    question = d['question'].split('. ')[-1]
    
    # Embed question
    question_text_embedding = await async_embed_text(text=question)

    # Retrieve text and citations
    df_rag_ranked = retrieve_top_k(
        df_rag=df_rag,
        query_embedding=question_text_embedding,
        top_k=RETRIEVAL_TOP_K
    )
    retrieved_text = " \n".join(df_rag_ranked['subchapter_text'].values)

    cols = ['chapter_title', 'subchapter_number', 'subchapter_title', 'subchapter_page']
    citations = ";".join(
        df_rag_ranked[cols]
        .astype(str)
        .agg(' | '.join, axis=1)
        .tolist()
        )
    d.update(citations=citations)
    
    # Judge student answer
    system_prompt, user_prompt = await Prompts.feedback(
        question=question,
        student_answer=d['student_answer'],
        answers=d['answers'],
        retrieved_text=retrieved_text
    )
    
    result: GenText = await async_response_openai(
        user_prompt=user_prompt,
        system_prompt=system_prompt,
        model=LLM,
        response_model=GenText
        )
    d.update(is_correct_llm = result.text)

    # Append results
    eval_dicts_judge.append(d)


In [None]:
import os

df_eval_final = pd.DataFrame(eval_dicts_judge)
# Make sure the output folder exists so the results of the slow judge loop are
# not lost to a missing-directory error when writing.
os.makedirs("eval_results", exist_ok=True)
# The file name encodes the run configuration (partition, top_k, model).
df_eval_final.to_parquet(
    f"eval_results/multiple_choice-partition_{RAG_PARTITION}__top_k_{RETRIEVAL_TOP_K}__model_{LLM}.parquet",
    index=False
    )

## Evaluate Results

In [56]:
# Reload one saved evaluation run from disk. The path is hard-coded to the
# full-partition / top_k=1 / gpt-5.1 run; it is not built from the config
# constants, so it must be edited by hand to analyse another run.
df_eval_final = pd.read_parquet(
    "eval_results/multiple_choice-partition_book_partition_full__top_k_1__model_gpt-5.1.parquet"
)

In [57]:
# Inspect which columns the saved evaluation results contain.
df_eval_final.columns

Index(['question', 'answers', 'correct_answer', 'chapter', 'student_answer',
       'is_correct_llm', 'citations', 'top_k', 'llm', 'random_seed',
       'rag_partition'],
      dtype='object')

In [58]:
# Ground truth per question: 'yes' when the student answer string equals the
# stored correct answer exactly, otherwise 'no'.
answer_matches = df_eval_final['correct_answer'].eq(df_eval_final['student_answer'])
truth_labels = answer_matches.map({True: 'yes', False: 'no'})

# Keep only the columns needed for the comparison; `match` stays an empty
# placeholder column.
judged_columns = ['correct_answer', 'student_answer', 'is_correct_llm']
df_acc = df_eval_final[judged_columns].assign(is_correct=truth_labels, match=None)

In [59]:
# Check whether llm judge and true answers agree
def _normalize_verdict(value) -> str:
    """Lower-case a judge verdict and strip surrounding whitespace, quotes and periods."""
    return str(value).strip().strip('"\'.').strip().lower()


aggreement = []        # judge verdict equals the ground truth
mismatch = []          # judge said yes/no but the ground truth says the opposite
mismatch_no_info = []  # judge answered "missing info"
unrecognized = []      # verdict or ground truth outside the expected labels
for i, row in df_acc.iterrows():
    # The verdict is free-form LLM text; normalising it keeps variants such as
    # "Yes" or "yes." from silently dropping out of every bucket.
    verdict = _normalize_verdict(row.is_correct_llm)
    truth = row.is_correct
    if truth not in ('yes', 'no') or verdict not in ('yes', 'no', 'missing info'):
        # Previously such rows landed in no list at all; collect them so they
        # can be inspected.
        unrecognized.append(i)
    elif verdict == 'missing info':
        mismatch_no_info.append(i)
    elif verdict == truth:
        aggreement.append(i)
    else:
        mismatch.append(i)

In [None]:
# Print percentages: share of all judged rows that falls into each bucket.
n_rows = len(df_acc)
accuracy_pct = round((len(aggreement) / n_rows) * 100, 2)
errors_pct = round(len(mismatch) / n_rows * 100, 2)
errors_retrieval_ptc = round(len(mismatch_no_info) / n_rows * 100, 2)

report = (
    ("Accurate", accuracy_pct),
    ("Errors", errors_pct),
    ("Errors retrieval (inconclusive)", errors_retrieval_ptc),
)
for label, pct in report:
    print(f"{label}: {pct}%")

Accurate: 80.36%
Errors: 1.43%
Errors retrieval (inconclusive): 18.21%
