In [None]:
import pandas as pd
from openai import OpenAIError, AsyncOpenAI
from tenacity import (retry,
                      stop_after_attempt,
                      wait_exponential,
                      retry_if_exception_type,
                      before_sleep_log,
                      after_log)
from pydantic import BaseModel, Field
from typing import List, Tuple
from config import OPENAI_API_KEY
import logging

# Configure logging: INFO level via basicConfig, plus a module-level logger
# that is passed to the tenacity before_sleep/after hooks further down.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Single shared async OpenAI client used by the API helpers in this notebook.
# The key is imported from the local `config` module rather than hard-coded here.
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)


# Default structured-output schema for LLM calls: a single free-text field.
# NOTE(review): documented with comments rather than a class docstring on purpose -
# presumably a docstring would be emitted as the schema description sent to the
# API; confirm before adding one.
class GenText(BaseModel):
    text: str = Field(description="The text result of the query")


@retry(
    retry=retry_if_exception_type(OpenAIError),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(3),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    after=after_log(logger, logging.INFO)
)
async def async_response_openai(
    user_prompt,
    model: str = 'gpt-4o-mini',
    system_prompt: str="You are a helpful assistant.",
    response_model: BaseModel = GenText,
    temperature=0.1) -> BaseModel:
    """Send a system + user prompt pair to the Responses API and return the parsed result.

    The call is wrapped by tenacity: any ``OpenAIError`` triggers a retry, for
    at most 3 attempts, with exponential back-off bounded to 1-10 seconds.
    A WARNING is logged before each sleep and an INFO entry after each attempt.

    Args:
        user_prompt: Content of the ``user`` message.
        model: Model name forwarded to the API.
        system_prompt: Content of the ``system`` message.
        response_model: Pydantic model class passed as ``text_format``.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``output_parsed`` attribute of the API response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    parsed_response = await async_openai_client.responses.parse(
        model=model,
        input=conversation,
        temperature=temperature,
        text_format=response_model,
    )
    return parsed_response.output_parsed


@retry(
    retry=retry_if_exception_type(OpenAIError),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(5),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    after=after_log(logger, logging.INFO)
)
async def async_embed_text(
    text: str,
    model: str = 'text-embedding-3-large'
) -> List[float]:
    """Request an embedding for ``text`` and return the first vector in the response.

    The call is wrapped by tenacity: any ``OpenAIError`` triggers a retry, for
    at most 5 attempts, with exponential back-off bounded to 1-10 seconds.

    Args:
        text: Text forwarded as the ``input`` of the embeddings request.
        model: Embedding model name forwarded to the API.

    Returns:
        The ``embedding`` attribute of the first item in ``response.data``.
    """
    embedding_response = await async_openai_client.embeddings.create(
        model=model,
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

## Load texts

In [None]:
import os

root_dir = "book"   # top-level book directory


def load_book_data(root_dir: str) -> list[dict]:
    """Collect one record per sub-chapter ``.txt`` file found under ``root_dir``.

    The layout parsed here is::

        <root_dir>/<chapter_number>__<chapter_title>/
            <subchapter_number>__<subchapter_title>__<page>.txt

    Files that do not end in ``.txt`` are ignored. Directories and files are
    visited in sorted order so repeated runs produce the same record order.

    Args:
        root_dir: Top-level directory; files directly inside it are skipped.

    Returns:
        A list of dicts with the keys ``chapter_number`` (int),
        ``chapter_title``, ``subchapter_number`` (str), ``subchapter_title``,
        ``subchapter_page`` (int) and ``subchapter_text``.

    Raises:
        ValueError: If a chapter number or page number is not an integer.
        IndexError: If a ``.txt`` file name has no ``__`` separator.
    """
    records = []
    for chapter, subdirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend into directories in a stable order.
        subdirs.sort()
        if chapter == root_dir:
            continue

        # basename() honours the platform separator; splitting on a literal
        # backslash only worked on Windows.
        chapter_parts = os.path.basename(os.path.normpath(chapter)).split('__')
        chapter_number = int(chapter_parts[0])
        chapter_title = chapter_parts[-1]

        for file in sorted(files):
            # Filter before parsing so stray files (.DS_Store, notes, ...) cannot
            # crash the name parsing below.
            if not file.endswith(".txt"):
                continue

            name_parts = file.split('__')
            subchapter_number = name_parts[0]
            subchapter_title = name_parts[1]
            subchapter_page = int(name_parts[-1].split('.')[0])

            filepath = os.path.join(chapter, file)
            with open(filepath, "r", encoding="utf-8") as f:
                subchapter_text = f.read()

            records.append(
                {
                    "chapter_number": chapter_number,
                    "chapter_title": chapter_title,
                    "subchapter_number": subchapter_number,
                    "subchapter_title": subchapter_title,
                    "subchapter_page": subchapter_page,
                    "subchapter_text": subchapter_text
                }
            )
    return records


book_data = load_book_data(root_dir)


In [None]:
# Export segmented book to parquet
# One row per record in book_data; the DataFrame index is not written to the file.
df = pd.DataFrame(book_data)
df.to_parquet("book.parquet", index=False)

## Embeddings

In [None]:
# Restrict the working set to the rows whose chapter_number equals 1.
df_partition = df[df['chapter_number'] == 1]

In [None]:
# Plain Python list of the sub-chapter texts, in the same row order as df_partition.
text_list = df_partition.subchapter_text.to_list()


In [None]:
import asyncio

tasks = [
    async_embed_text(text=text)
    for text in text_list
    ]

embeddings = await asyncio.gather(*tasks, return_exceptions=True)

In [None]:
# Attach the gathered results as a new `embedding` column (assign returns a new
# frame, df_partition itself is not modified) and persist it without the index.
df_partition_embeddings = df_partition.assign(embedding=embeddings)
df_partition_embeddings.to_parquet('book_partition_1.parquet', index=False)

## Retrieval

In [None]:
import numpy as np

def euclidean_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the Euclidean (L2) distance between two array-likes of identical shape.

    Both inputs are converted to float NumPy arrays before comparison.

    Raises:
        ValueError: If the two converted arrays do not have the same shape.
    """
    first, second = (np.asarray(candidate, dtype=float) for candidate in (vec1, vec2))

    if first.shape == second.shape:
        return np.linalg.norm(first - second)

    raise ValueError("Vectors must have the same shape.")


# Reload the chapter partition with its embeddings from the parquet file written above.
df_rag = pd.read_parquet('book_partition_1.parquet')

In [None]:
# Sample multiple-choice question used to exercise the retrieval + feedback pipeline.
question = """
Cutting a midsagittal section through the body separates which 
parts of the body?
 a. anterior and posterior portions
 b. superior and inferior portions
 c. proximal and distal portions
 d. right and left halves
"""

# The student's selected option, to be evaluated against the retrieved text.
student_answer = "c. proximal and distal portions"

# Number of passages to retrieve for the question.
retrieval_top_k = 1

In [None]:
# Embed the question with the helper's default embedding model.
question_text_embedding = await async_embed_text(text=question)

In [None]:
def compute_distances(
        df_rag: pd.DataFrame, 
        query_embedding: list | np.ndarray,
        ) -> pd.DataFrame:
    embeddings = np.vstack(df_rag["embedding"].to_numpy())
    query = np.array(query_embedding)

    df_rag["distance"] = np.linalg.norm(embeddings - query, axis=1)
    return df_rag


def retrieve_top_k(
        df_rag: pd.DataFrame,
        query_embedding: list | np.ndarray,
        top_k: int = 3
        ) -> pd.DataFrame:
    """Return the ``top_k`` rows of ``df_rag`` closest to ``query_embedding``.

    Rows are scored with ``compute_distances``, sorted by ascending
    ``distance``, and re-indexed from 0; the previous index is kept as a
    regular column by ``reset_index()``.
    """
    scored = compute_distances(df_rag, query_embedding)
    ordered = scored.sort_values(by='distance')
    ordered = ordered.reset_index()
    return ordered.head(top_k)

In [None]:
# Rank the stored passages against the question embedding and keep the
# retrieval_top_k closest ones.
df_rag_ranked = retrieve_top_k(
    df_rag=df_rag, 
    query_embedding=question_text_embedding,
    top_k=retrieval_top_k)

In [None]:
# Page numbers of the retrieved passages.
retrieved_pages = df_rag_ranked['subchapter_page'].values
# Join the retrieved passages into the single context string handed to the LLM.
retrieved_text = " \n".join(df_rag_ranked['subchapter_text'].values)
# Only the last expression of a cell is displayed, so the context is shown here.
retrieved_text


## LLM feedback

In [None]:
class Prompts:
    """Prompt builders for grounded tutor feedback.

    Every builder takes the question, the student's answer and the retrieved
    passage, and returns a ``(system_prompt, user_prompt)`` pair of strings.
    The builders are coroutines and must be awaited, although they perform no
    I/O themselves.
    """

    @classmethod
    async def feedback(
        cls,
        question: str,
        student_answer: str,
        retrieved_text: str,
    ) -> Tuple[str, str]:
        """Build the detailed feedback prompt with an explicit insufficient-text fallback.

        Returns:
            ``(system_prompt, user_prompt)``.
        """

        system_prompt = """
        You are an expert anatomy tutor providing feedback to a medical student.
        Your feedback must be grounded ONLY in the retrieved anatomy text provided.
        
        RULES:
        - Do NOT add information not present in the retrieved text.
        - If the retrieved text does not contain the answer, say so.
        - Provide feedback that is accurate, concise, and educational.
        - Highlight what is correct, what is incorrect, and provide the correct info (only if found in the text).
        - Use a supportive and encouraging tone.
        - Do NOT mention the rules to the student.
        """

        user_prompt = f"""
        Student Question:
        {question}

        Student Answer:
        {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        Using ONLY the retrieved text, provide feedback in the following structured format:

        **Feedback**
        - **Accuracy**: Assess the correctness of the student's answer based only on the retrieved text.
        - **Correct Information**: Provide the accurate information from the text (only if available).
        - **Improvement Tip**: Give one short tip to improve their understanding.

        If the retrieved text does not include enough information to evaluate the student's answer, respond with:
        "The retrieved text does not contain enough information to evaluate this answer. Please retrieve a more relevant passage."
        """

        return system_prompt, user_prompt

    @classmethod
    async def feedback_minimal(
        cls, question: str, student_answer: str, retrieved_text: str
    ) -> Tuple[str, str]:
        """Build a short feedback prompt (accuracy, correct information, one tip).

        Returns:
            ``(system_prompt, user_prompt)``.
        """
        system_prompt = """
        You are an expert anatomy tutor. You must provide feedback to the student using ONLY the retrieved text.
        
        RULES:
        - Do not add any information not found in the retrieved text.
        - If the text does not contain enough info, explicitly say so.
        - Be concise, supportive, and student-friendly.
        """

        user_prompt = f"""
        Question: {question}
        Student Answer: {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        Provide feedback using only this structure:

        **Feedback**
        - **Accuracy**: (Is the student correct based only on the text?)
        - **Correct Information**: (State the correct info if in the text.)
        - **Improvement Tip**: (Short + practical)
        """

        return system_prompt, user_prompt

    @classmethod
    async def feedback_scored(
        cls, question: str, student_answer: str, retrieved_text: str
    ) -> Tuple[str, str]:
        """Build a feedback prompt that also asks for a 0-5 accuracy score.

        Returns:
            ``(system_prompt, user_prompt)``.
        """
        # The score range uses a real en dash; the previous text contained
        # mis-encoded bytes that were sent to the model verbatim.
        system_prompt = """
        You are an expert anatomy tutor. Evaluate the student's answer ONLY using the retrieved text.
        Provide a score from 0–5 representing accuracy based solely on the retrieved text.

        SCORING GUIDE:
        5 - Completely correct
        4 - Mostly correct, minor detail missing
        3 - Partially correct, key details missing
        2 - Some correct elements but mostly incorrect
        1 - Minimally correct
        0 - Completely incorrect or unrelated

        RULES:
        - No external knowledge beyond the retrieved text.
        - If insufficient evidence, state so instead of scoring.
        """

        user_prompt = f"""
        Question: {question}
        Student Answer: {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        Provide feedback using this structure:

        **Score**: X/5
        **Feedback**
        - **Reasoning**: (Brief justification based only on the text)
        - **Correct Information**: (Only if contained in the text)
        - **Improvement Tip**: (1 suggestion to improve)
        """

        return system_prompt, user_prompt

    @classmethod
    async def feedback_hidden_reasoning(
        cls, question: str, student_answer: str, retrieved_text: str
    ) -> Tuple[str, str]:
        """Build a prompt that asks for private step-by-step reasoning and only the final feedback.

        Returns:
            ``(system_prompt, user_prompt)``.
        """
        system_prompt = """
        You are an expert anatomy tutor. You may think step-by-step to reach the answer,
        but do NOT reveal your reasoning. Only output the final formatted feedback.

        Use ONLY the retrieved text. Do not hallucinate.

        If the retrieved text is insufficient, state so.
        """

        user_prompt = f"""
        Question: {question}
        Student Answer: {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        THINK STEP-BY-STEP PRIVATELY, then provide ONLY this in your final answer:

        **Feedback**
        - **Accuracy**
        - **Correct Information**
        - **Improvement Tip**
        """

        return system_prompt, user_prompt

    @classmethod
    async def feedback_universal(
        cls,
        question: str,
        student_answer: str,
        retrieved_text: str,
        mode: str = "minimal"
    ) -> Tuple[str, str]:
        """Build a single prompt covering the minimal, scored and hidden_reasoning modes.

        Args:
            mode: Inserted verbatim after ``MODE:`` in the user prompt. The
                prompt describes ``minimal``, ``scored`` and
                ``hidden_reasoning``; the value is not validated here.

        Returns:
            ``(system_prompt, user_prompt)``.
        """

        # Plain string (no placeholders), with real arrows and en dash in place
        # of the mis-encoded bytes that were previously sent to the model.
        system_prompt = """
        You are an expert anatomy tutor. Provide feedback to the student based ONLY on the retrieved text.
        Do not add information not present in the text. If the text does not provide enough info, say so.

        MODE BEHAVIOR:
        - minimal → concise student-friendly feedback
        - scored → include a 0–5 accuracy score and justification
        - hidden_reasoning → may reason privately, but output must not show reasoning
        """

        user_prompt = f"""
        MODE: {mode}

        Question: {question}
        Student Answer: {student_answer}

        Retrieved Text (source of truth):
        {retrieved_text}

        Follow the mode rules:

        minimal:
        **Feedback**
        - **Accuracy**
        - **Correct Information**
        - **Improvement Tip**

        scored:
        **Score**: X/5
        **Feedback**
        - **Reasoning**
        - **Correct Information**
        - **Improvement Tip**

        hidden_reasoning:
        THINK STEP-BY-STEP PRIVATELY, then provide ONLY:
        **Feedback**
        - **Accuracy**
        - **Correct Information**
        - **Improvement Tip**
        """

        return system_prompt, user_prompt


In [None]:
# Build the detailed feedback prompt pair; the builder is a coroutine, hence the await.
system_prompt, user_prompt = await Prompts.feedback(
    question=question,
    student_answer=student_answer,
    retrieved_text=retrieved_text
    )

In [None]:
# Ask the model for feedback, parsed into GenText, with a near-zero temperature.
response = await async_response_openai(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    model = 'gpt-4o-mini',
    response_model=GenText,
    temperature=0.0001
)

In [None]:
# Display the `text` field of the parsed response.
response.text