In [1]:
import pandas as pd

In [2]:
# Hand-written validation set for the RAG evaluation.
# Each record has three keys:
#   "ID"                  - 1-based identifier of the Q/A pair
#   "Question"            - the query sent to the RAG chain
#   "Ground Truth Answer" - the reference text the model answer is compared against
# Questions are kept free of spelling mistakes so that the evaluation measures
# answer quality rather than the retriever's tolerance to typos.
qa_data = [
    {
        "ID": 1,
        "Question": "What are the key issues with the x-ray bone fracture detection dataset?",
        "Ground Truth Answer": "1) Most of the images are healthy without fracture, 2) In validation set this is opposite, most of the images are with fracture, 3) Enriched by offline augmentation so the number of original images are much smaller, 4) The box sizes tend to be tiny, and the val/test fractures are even smaller than the training fractures."
    },
    {
        "ID": 2,
        "Question": "What are the issues with the annotations in the bone fracture detection dataset?",
        "Ground Truth Answer": "The box sizes tend to be tiny, and the val/test fractures are even smaller than the training fractures. Fractures are annotated with bounding boxes, often covering only a few percent of the image."
    },
    {
        "ID": 3,
        "Question": "What was the impact of training larger networks on the bone fracture detection dataset?",
        "Ground Truth Answer": "Larger networks did not really help. Their mAP@50 bounced around by a point or two, while the strict mAP@50 to 95 never moved."
    },
    {
        "ID": 4,
        "Question": "What is the bottleneck for Yolo on bone fracture detection dataset?",
        "Ground Truth Answer": "The bottleneck is spatial resolution and center point assignment"
    },
    {
        "ID": 5,
        "Question": "What should I consider when training yolo on an x-ray bone fracture dataset?",
        "Ground Truth Answer": "1) Check your dataset splits first, 2) Size matters. Small objects require higher resolution, 3) Loss weights help, but they won’t save you if the detector can’t physically resolve the object, 4) Medical images ≠ street photos. Expect data hunger, imbalance, and tiny targets."
    },
]


In [3]:
# Destination file for the validation set
csv_path = "validation_qa_dataset.csv"

# Tabulate the Q/A records (one row per record) and write them out
# without the DataFrame index column
qa_df = pd.DataFrame(data=qa_data)
qa_df.to_csv(path_or_buf=csv_path, index=False)

Retrieval Evaluation (ChromaDB/Vector Search Quality)

In [20]:
#from langchain.embeddings import OpenAIEmbeddings
import os
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain.docstore.document import Document
from langchain.vectorstores.base import VectorStoreRetriever
from sklearn.metrics.pairwise import cosine_similarity
from langchain.chains import RetrievalQA
from fuzzywuzzy import fuzz
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv


In [21]:
def rag_evaluation(matching_mode='exact', k=1):
    """Score the RAG chain's answers against the validation Q/A dataset.

    Every question in ``validation_qa_dataset.csv`` is run through a
    ``RetrievalQA`` chain and the generated answer is compared with the
    ground-truth answer. A summary is printed and the per-question results
    are written to ``rag_evaluation_<matching_mode>.csv``.

    NOTE(review): a later cell defines ``rag_evaluation`` again (adding a
    "gpt" mode); when the cells are run in order that definition replaces
    this one.

    Args:
        matching_mode: How the two answers are compared.
            "exact"    - case-insensitive equality after stripping
                         whitespace; produces no numeric score.
            "fuzzy"    - ``fuzz.partial_ratio`` of the lower-cased texts,
                         a match when the ratio is >= 60.
            "semantic" - cosine similarity of the two embeddings, a match
                         when the similarity is >= 0.80.
        k: Stored as ``{"k": k}`` in the retriever's ``search_kwargs`` and
            used to label the ``Precision@k`` / ``Recall@k`` columns.

    Raises:
        KeyError: ``matching_mode`` is not one of the modes listed above.
        ValueError: the ``VECTOR_DB_TYPE`` environment variable is neither
            "chroma" nor "azure".
    """
    # Fail fast on a bad mode, before any client is built or request is sent.
    supported_modes = ("exact", "fuzzy", "semantic")
    if matching_mode not in supported_modes:
        raise KeyError(
            f"Unsupported matching_mode {matching_mode!r}; expected one of {supported_modes}"
        )

    load_dotenv()

    # --- Parameters ---
    fuzzy_threshold = 60
    semantic_threshold = 0.80
    csv_path = "validation_qa_dataset.csv"

    # --- Setup Azure OpenAI model and embeddings ---
    llm = AzureChatOpenAI(
        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_version="2023-05-15",
        temperature=0
    )

    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version="2023-05-15"
    )

    # --- Connect retriever (Chroma or Azure) ---
    vector_db_type = os.getenv("VECTOR_DB_TYPE", "azure").lower()

    if vector_db_type == "chroma":
        # Same package as the notebook's top-level Chroma import; the
        # ``langchain.vectorstores`` import path is deprecated.
        from langchain_chroma import Chroma
        retriever = Chroma(persist_directory="../chroma_db", embedding_function=embeddings).as_retriever()
    elif vector_db_type == "azure":
        from langchain_community.vectorstores.azuresearch import AzureSearch
        retriever = AzureSearch(
            azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
            azure_search_key=os.getenv("AZURE_SEARCH_ADMIN_KEY"),
            index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
            embedding_function=embeddings.embed_query
        ).as_retriever()
    else:
        raise ValueError("Unsupported VECTOR_DB_TYPE")

    retriever.search_kwargs = {"k": k}
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    # --- Matching Functions ---
    # Each returns (is_match, score); score is None when the mode has no
    # numeric similarity.
    def is_exact_match(a, b):
        return a.strip().lower() == b.strip().lower(), None

    def is_fuzzy_match(a, b):
        score = fuzz.partial_ratio(a.lower(), b.lower())
        return score >= fuzzy_threshold, score

    def is_semantic_match(a, b):
        a_emb = np.array(embeddings.embed_query(a)).reshape(1, -1)
        b_emb = np.array(embeddings.embed_query(b)).reshape(1, -1)
        score = cosine_similarity(a_emb, b_emb)[0][0]
        return score >= semantic_threshold, score

    match_func = {
        "exact": is_exact_match,
        "fuzzy": is_fuzzy_match,
        "semantic": is_semantic_match
    }[matching_mode]

    # --- Load Evaluation Dataset ---
    df = pd.read_csv(csv_path)
    results = []

    # --- Run Evaluation ---
    for _, row in df.iterrows():
        question = row["Question"]
        ground_truth = row["Ground Truth Answer"]

        # Only the remote calls are guarded: a failure for one question is
        # recorded as an error row and the run continues with the next one.
        try:
            response = qa_chain.invoke({"query": question})
            model_answer = response["result"]
            is_match, score = match_func(ground_truth, model_answer)
        except Exception as e:
            results.append({
                "Question": question,
                "Ground Truth Answer": ground_truth,
                "Model Answer": f"ERROR: {str(e)}",
                "Match": False,
                f"Precision@{k}": 0.0,
                f"Recall@{k}": 0.0,
                "Score": 0.0
            })
            continue

        # Precision and recall are both a per-question hit indicator here.
        hit = 1.0 if is_match else 0.0
        results.append({
            "Question": question,
            "Ground Truth Answer": ground_truth,
            "Model Answer": model_answer,
            "Match": is_match,
            f"Precision@{k}": hit,
            f"Recall@{k}": hit,
            # "exact" mode yields no score; rounding None would raise and
            # used to turn every exact-mode row into an error row.
            "Score": round(score, 4) if score is not None else None
        })

    # --- Save & Print Summary ---
    result_df = pd.DataFrame(results)
    avg_precision = result_df[f"Precision@{k}"].mean()
    avg_recall = result_df[f"Recall@{k}"].mean()
    # Missing scores become NaN so the mean skips them.
    avg_score = pd.to_numeric(result_df["Score"], errors="coerce").mean()

    print(f"\n📊 Evaluation using '{matching_mode}' matching")
    print(f"🔹 Avg Precision@{k}: {avg_precision:.2f}")
    print(f"🔹 Avg Recall@{k}: {avg_recall:.2f}")
    print(f"🔹 Avg Score: {avg_score:.2f}")

    result_df.to_csv(f"rag_evaluation_{matching_mode}.csv", index=False)


In [None]:
def rag_evaluation(matching_mode='exact', k=1):
    """Score the RAG chain's answers against the validation Q/A dataset.

    Every question in ``validation_qa_dataset.csv`` is run through a
    ``RetrievalQA`` chain and the generated answer is compared with the
    ground-truth answer. A summary is printed and the per-question results
    are written to ``rag_evaluation_<matching_mode>.csv``.

    Args:
        matching_mode: How the two answers are compared.
            "exact"    - case-insensitive equality after stripping
                         whitespace; produces no numeric score.
            "fuzzy"    - ``fuzz.partial_ratio`` of the lower-cased texts,
                         a match when the ratio is >= 60.
            "semantic" - cosine similarity of the two embeddings, a match
                         when the similarity is >= 0.80.
            "gpt"      - an OpenAI ``gpt-4o`` judge (authenticated with
                         ``OPENAI_API_KEY``) rates the answer from 1 to 5;
                         a rating of 4 or 5 counts as a match.
        k: Stored as ``{"k": k}`` in the retriever's ``search_kwargs`` and
            used to label the ``Precision@k`` / ``Recall@k`` columns.

    Raises:
        KeyError: ``matching_mode`` is not one of the modes listed above.
        ValueError: the ``VECTOR_DB_TYPE`` environment variable is neither
            "chroma" nor "azure".
    """
    import re

    # Fail fast on a bad mode, before any client is built or request is sent.
    supported_modes = ("exact", "fuzzy", "semantic", "gpt")
    if matching_mode not in supported_modes:
        raise KeyError(
            f"Unsupported matching_mode {matching_mode!r}; expected one of {supported_modes}"
        )

    load_dotenv()

    fuzzy_threshold = 60
    semantic_threshold = 0.80
    csv_path = "validation_qa_dataset.csv"

    # Azure OpenAI setup
    llm = AzureChatOpenAI(
        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_version="2023-05-15",
        temperature=0
    )
    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version="2023-05-15"
    )

    # Retriever setup
    vector_db_type = os.getenv("VECTOR_DB_TYPE", "azure").lower()
    if vector_db_type == "chroma":
        # Same package as the notebook's top-level Chroma import; the
        # ``langchain.vectorstores`` import path is deprecated.
        from langchain_chroma import Chroma
        retriever = Chroma(persist_directory="../chroma_db", embedding_function=embeddings).as_retriever()
    elif vector_db_type == "azure":
        from langchain_community.vectorstores.azuresearch import AzureSearch
        retriever = AzureSearch(
            azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
            azure_search_key=os.getenv("AZURE_SEARCH_ADMIN_KEY"),
            index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
            embedding_function=embeddings.embed_query
        ).as_retriever()
    else:
        raise ValueError("Unsupported VECTOR_DB_TYPE")

    retriever.search_kwargs = {"k": k}
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    # Match functions: each returns (is_match, score); score is None when the
    # mode has no numeric similarity.
    def is_exact_match(a, b):
        return a.strip().lower() == b.strip().lower(), None

    def is_fuzzy_match(a, b):
        score = fuzz.partial_ratio(a.lower(), b.lower())
        return score >= fuzzy_threshold, score

    def is_semantic_match(a, b):
        a_emb = np.array(embeddings.embed_query(a)).reshape(1, -1)
        b_emb = np.array(embeddings.embed_query(b)).reshape(1, -1)
        score = cosine_similarity(a_emb, b_emb)[0][0]
        return score >= semantic_threshold, score

    def parse_judge_score(reply):
        """Extract the 1-5 rating from the judge's free-text reply.

        The judge may echo the scale from the prompt ("Score (1-5): 4");
        reading the first digit of such a line would yield 1, so scale hints
        are removed before searching. Returns 3 when no rating is found.
        """
        # Drop "1-5" / "1 to 5" scale hints and "/5" denominators.
        cleaned = re.sub(r"\b1\s*(?:[-\u2013]|to)\s*5\b|/\s*5\b", " ", reply)
        # Prefer a rating that closely follows the word "score" or "rating".
        labelled = re.search(
            r"(?:score|rating)\D{0,40}?([1-5])(?!\d)", cleaned, flags=re.IGNORECASE
        )
        if labelled:
            return int(labelled.group(1))
        # Otherwise take the first standalone digit in the 1-5 range.
        standalone = re.search(r"(?<!\d)([1-5])(?!\d)", cleaned)
        return int(standalone.group(1)) if standalone else 3

    def is_gpt_judged_match(question, ground_truth, model_answer):
        """Ask the judge model to rate the answer; returns (is_match, score, explanation)."""
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        prompt = f"""
                    You are an expert evaluator. Rate the model's answer on a scale from 1 to 5:

                    - 5 = Excellent: Fully correct, complete, and informative.
                    - 4 = Good: Mostly correct with minor flaws.
                    - 3 = Fair: Partially correct or incomplete.
                    - 2 = Poor: Mostly incorrect.
                    - 1 = Bad: Completely wrong or irrelevant.

                    Question: {question}
                    Ground Truth Answer: {ground_truth}
                    Model Answer: {model_answer}

                    Score (1-5) and brief explanation:
                    """
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            reply = response.choices[0].message.content
            score = parse_judge_score(reply)
            explanation = reply.strip()
        except Exception as e:
            # A failed judge call scores 0 (never a match) and keeps the error text.
            score = 0
            explanation = f"Error: {str(e)}"

        return score >= 4, score, explanation  # score 4–5 is considered a match

    # "gpt" mode calls the judge directly because it needs the question too.
    if matching_mode != "gpt":
        match_func = {
            "exact": is_exact_match,
            "fuzzy": is_fuzzy_match,
            "semantic": is_semantic_match
        }[matching_mode]

    # Load data
    df = pd.read_csv(csv_path)
    results = []

    for _, row in df.iterrows():
        question = row["Question"]
        ground_truth = row["Ground Truth Answer"]

        # Only the remote calls are guarded: a failure for one question is
        # recorded as an error row and the run continues with the next one.
        try:
            response = qa_chain.invoke({"query": question})
            model_answer = response["result"]

            if matching_mode == "gpt":
                is_match, score, explanation = is_gpt_judged_match(question, ground_truth, model_answer)
            else:
                is_match, score = match_func(ground_truth, model_answer)
                explanation = ""
        except Exception as e:
            results.append({
                "Question": question,
                "Ground Truth Answer": ground_truth,
                "Model Answer": f"ERROR: {str(e)}",
                "Match": False,
                f"Precision@{k}": 0.0,
                f"Recall@{k}": 0.0,
                "Score": 0,
                "Evaluation": f"Exception during evaluation: {e}"
            })
            continue

        # Precision and recall are both a per-question hit indicator here.
        hit = 1.0 if is_match else 0.0
        results.append({
            "Question": question,
            "Ground Truth Answer": ground_truth,
            "Model Answer": model_answer,
            "Match": is_match,
            f"Precision@{k}": hit,
            f"Recall@{k}": hit,
            "Score": round(score, 4) if isinstance(score, float) else score,
            "Evaluation": explanation
        })

    # Summary
    result_df = pd.DataFrame(results)
    avg_precision = result_df[f"Precision@{k}"].mean()
    avg_recall = result_df[f"Recall@{k}"].mean()
    # "exact" mode stores no score; missing values become NaN so the mean skips them.
    avg_score = pd.to_numeric(result_df["Score"], errors="coerce").mean()

    print(f"\n📊 Evaluation using '{matching_mode}' matching")
    print(f"🔹 Avg Precision@{k}: {avg_precision:.2f}")
    print(f"🔹 Avg Recall@{k}: {avg_recall:.2f}")
    print(f"🔹 Avg Score: {avg_score:.2f}")

    result_df.to_csv(f"rag_evaluation_{matching_mode}.csv", index=False)


In [None]:
from dotenv import load_dotenv
# Confirm the key is configured without echoing its value: cell outputs are
# saved inside the notebook, so printing a secret exposes it to anyone who
# can read the file.
print("OPENAI_API_KEY" in os.environ)
print("OPENAI_API_KEY is set" if os.getenv("OPENAI_API_KEY") else "OPENAI_API_KEY is not set")

True
[REDACTED: an OpenAI API key was printed here. Treat it as compromised and revoke/rotate it.]


In [13]:
# Run the evaluation in "gpt" matching mode with k=1; the summary below is printed by rag_evaluation.
rag_evaluation(matching_mode='gpt', k=1)


📊 Evaluation using 'gpt' matching
🔹 Avg Precision@1: 1.00
🔹 Avg Recall@1: 1.00
🔹 Avg Score: 4.60
