# LLM as judge 
This evaluates generated responses against a rubric and provides structured output. 



In [8]:
import pickle
import pandas as pd

with open(r"C:\Users\jonathan.kasprisin\gitlab\DNoK_GraphRAG\output_data_records_graph.pkl", "rb") as input_file:
    graph_data = pickle.load(input_file)

with open(r"C:\Users\jonathan.kasprisin\gitlab\DNoK_GraphRAG\llm_reason_output_data.pkl", "rb") as input_file:
    reason_data = pickle.load(input_file)

print(f"len(graph_data): {len(graph_data)}")
print(f"len(reason_data): {len(reason_data)}")
 
#merge the two lists together
reason_data.extend(graph_data)
print(f"len(reason_data): {len(reason_data)}")
print(reason_data[:2])
# Convert to DataFrame
df_llm_records = pd.DataFrame(reason_data)

len(graph_data): 30
len(reason_data): 18
len(reason_data): 48
[{'index': 0, 'type': 'Basic', 'test_case': 's0r0', 'student': '\n        Background: Recent college graduate with a degree in Business Administration.\n        Strengths: Strong organizational and project management skills.\n        Weaknesses: Limited mathematical background; no prior programming experience.\n        Preferences: Prefers real-world applications, interactive learning, and visualizations.\n        Prior Course History: \n        - Introduction to Business Mathematics\n        - Basic Statistics for Managers\n    ', 'request': 'Help me understand how eigenvalues relate to matrix transformations. Provide content that visually explains this concept and its applications in data analysis.', 'context': '', 'response': '**1. Summary:**\nEigenvalues and eigenvectors are like the unique "fingerprints" of a matrix, describing how it transforms vectors. They\'re crucial in data analysis as they help identify patterns a

In [10]:
display(df_llm_records.tail())

Unnamed: 0,index,type,test_case,student,request,context,response
43,25,GR_kg_no_refine2_0.85,s0r1,\n Background: Recent college graduate ...,Can you explain how vector spaces connect to l...,Keywords: Recent college graduate in Business ...,**1. Summary:**\nVector spaces and linear equa...
44,26,GR_kg_no_refine2_0.85,s1r0,\n Background: Graduate student pursuin...,Help me understand how eigenvalues relate to p...,Keywords: Recent college graduate in Business ...,```python\nimport numpy as np\n\n# Define a po...
45,27,GR_kg_no_refine2_0.85,s1r1,\n Background: Graduate student pursuin...,Explain the significance of matrix decompositi...,Keywords: Recent college graduate in Business ...,```python\n# I will provide a response based o...
46,28,GR_kg_no_refine2_0.85,s2r0,\n Background: Operations Research prof...,Can you provide insights into how singular val...,Keywords: Recent college graduate in Business ...,---\n\n**1. Summary:**\nSingular Value Decompo...
47,29,GR_kg_no_refine2_0.85,s2r1,\n Background: Operations Research prof...,Explain the computational advantages of using ...,Keywords: Recent college graduate in Business ...,---\n\n**1. Summary:**\nSparse matrix techniqu...


In [25]:
from pydantic import BaseModel, Field
from huggingface_hub import InferenceClient
import pandas as pd
import json

# Define the comparison schema
class ComparisonScores(BaseModel):
    winner: str = Field(..., description="Indicates whether Response A or Response B performed better overall")
    reason: str = Field(..., description="Overall reasoning for the ranking decision")
    
    summary_accuracy: str = Field(..., description="Which response had better summary accuracy (A/B/Tie)")
    summary_language: str = Field(..., description="Which response had better STEM-defined term usage (A/B/Tie)")
    summary_usefulness: str = Field(..., description="Which response used more appropriate language for the student level (A/B/Tie)")
    summary_completeness: str = Field(..., description="Which response summarized all material more effectively (A/B/Tie)")
    summary_subjective: str = Field(..., description="Which response used more engaging language (A/B/Tie)")
    summary_rationale: str = Field(..., description="Rationale for summary ranking")
    
    explanation_accuracy_terms: str = Field(..., description="Which response used more appropriate STEM-defined terms (A/B/Tie)")
    explanation_accuracy_vocab: str = Field(..., description="Which response had better content-specific vocabulary (A/B/Tie)")
    explanation_usefulness_objectives: str = Field(..., description="Which response better addressed expected learning objectives (A/B/Tie)")
    explanation_usefulness_definitions: str = Field(..., description="Which response defined key concepts better (A/B/Tie)")
    explanation_usefulness_bloom: str = Field(..., description="Which response better addressed Bloom’s Taxonomy level (A/B/Tie)")
    explanation_usefulness_level: str = Field(..., description="Which response had the correct material for the student level (A/B/Tie)")
    explanation_usefulness_examples: str = Field(..., description="Which response provided clearer examples (A/B/Tie)")
    explanation_usefulness_microlearning: str = Field(..., description="Which response was more suitable for micro-learning (A/B/Tie)")
    explanation_completeness: str = Field(..., description="Which response covered all learning objectives better (A/B/Tie)")
    explanation_references: str = Field(..., description="Which response included better references (A/B/Tie)")
    explanation_examples: str = Field(..., description="Which response had more concrete examples for abstract concepts (A/B/Tie)")
    explanation_subjective_engagement: str = Field(..., description="Which response used more engaging language (A/B/Tie)")
    explanation_subjective_teaching: str = Field(..., description="Which response used better scaffolding techniques (A/B/Tie)")
    explanation_subjective_reflection: str = Field(..., description="Which response provided better reflection opportunities (A/B/Tie)")
    explanation_subjective_motivation: str = Field(..., description="Which response communicated enthusiasm better (A/B/Tie)")
    explanation_rationale: str = Field(..., description="Rationale for explanation ranking")
    
    references_accuracy: str = Field(..., description="Which response directed to real (non-hallucinated) references (A/B/Tie)")
    references_usefulness: str = Field(..., description="Which response offered more appropriate resources for the learner (A/B/Tie)")
    references_rationale: str = Field(..., description="Rationale for reference ranking")
    
    context_accuracy: str = Field(..., description="Which response used more appropriate context (A/B/Tie)")
    context_completeness: str = Field(..., description="Which response better used context to address learning objectives (A/B/Tie)")
    context_rationale: str = Field(..., description="Rationale for context ranking")
    
    comments: str = Field(..., description="Overall comments on the comparison and suggestions for improvement")

SYSTEM_PROMPT = """
You are an evaluator tasked with comparing two student responses based on a structured rubric. 
Assess each response relative to the other and determine which one performs better in each category. 
Provide a rationale for your decisions and indicate the overall stronger response.

Indicate the better response as 'A', 'B', or 'Tie' for each metric.
"""

# Initialize LLM client
class StructuredComparison:
    def __init__(self, endpoint_url: str, output_structure: BaseModel, timeout: int = 300, max_new_tokens: int = 1500, temperature: float = 0.6):
        self.client = InferenceClient(endpoint_url, timeout=timeout)
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.schema = output_structure.model_json_schema()
    
    def invoke(self, prompt: str) -> str:
        response = self.client.text_generation(
            prompt=f"{prompt}\n\nPlease structure the response using the following schema: {json.dumps(self.schema)}",
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature,
            grammar={"type": "json", "value": self.schema},
        )
        return response

# Initialize comparison instance (update endpoint URL)
endpoint_url = ":8085"
comparison_model = StructuredComparison(endpoint_url, ComparisonScores)

# Load essay pairs (assuming df_llm_records contains two essays per comparison)
pairwise_evals = []
for i in range(0, len(df_llm_records), 2):
    if i + 1 >= len(df_llm_records):
        break  # Ensure pairs exist
    
    row_a, row_b = df_llm_records.iloc[i], df_llm_records.iloc[i+1]
    
    comparison_prompt = f"""
    {SYSTEM_PROMPT}
    
    **Student Profile A:** {row_a["student"]}
    **Student Request A:** {row_a["request"]}
    **Generated Response A:** {row_a["response"]}
    
    **Student Profile B:** {row_b["student"]}
    **Student Request B:** {row_b["request"]}
    **Generated Response B:** {row_b["response"]}
    
    Compare Response A and Response B using the rubric and determine which response is better in each category.
    """
    
    # Get comparison evaluation
    response = comparison_model.invoke(comparison_prompt)
    
    # Parse JSON response
    try:
        eval_result = json.loads(response)
        eval_result["index_A"] = i
        eval_result["index_B"] = i+1
        pairwise_evals.append({
            "student_A": row_a["student"],
            "student_B": row_b["student"],
            "request_A": row_a["request"],
            "request_B": row_b["request"],
            "response_A": row_a["response"],
            "response_B": row_b["response"],
            "comparison": eval_result
        })
    except json.JSONDecodeError:
        print(f"Error parsing JSON for comparison {i}-{i+1}")

# Convert evaluations to a DataFrame
comparison_df = pd.DataFrame(pairwise_evals)

# Save the comparison results
comparison_df.to_csv("comparisons_output.csv", index=False)


In [31]:
#Win count ranking
import pandas as pd

# # Load the comparison results
# comparison_df = pd.read_csv("comparisons_output.csv")

# Dictionary to track wins
essay_scores = {}

# Process each row in the comparison dataframe
for _, row in comparison_df.iterrows():
    winner = row["comparison.winner"]  # "A" or "B"
    essay_A = row["response_A"]
    essay_B = row["response_B"]
    
    # Initialize scores if essay not seen before
    if essay_A not in essay_scores:
        essay_scores[essay_A] = 0
    if essay_B not in essay_scores:
        essay_scores[essay_B] = 0
    
    # Assign win count
    if winner == "A":
        essay_scores[essay_A] += 1
    elif winner == "B":
        essay_scores[essay_B] += 1

# Convert to a sorted list (best to worst)
ranked_essays = sorted(essay_scores.items(), key=lambda x: x[1], reverse=True)

# Display the ranking
for rank, (essay, score) in enumerate(ranked_essays, 1):
 print(f"{rank}. Score: {score} - Essay: {essay[:100]}...")  # Show only first 100 chars

eval_df['evaluation'][0] = {'summary_accuracy': 4, 'summary_language': 4, 'summary_usefulness': 4, 'summary_completeness': 4, 'summary_subjective': 4, 'summary_rationale': 'The summary is concise and accurately addresses the learning objectives. It uses appropriate STEM terms and provides a clear overview of eigenvalues and eigenvectors.', 'explanation_accuracy_terms': 4, 'explanation_accuracy_vocab': 4, 'explanation_usefulness_objectives': 4, 'explanation_usefulness_definitions': 4, 'explanation_usefulness_bloom': 4, 'explanation_usefulness_level': 4, 'explanation_usefulness_examples': 4, 'explanation_usefulness_microlearning': 4, 'explanation_completeness': 4, 'explanation_references': 4, 'explanation_examples': 4, 'explanation_subjective_engagement': 4, 'explanation_subjective_teaching': 4, 'explanation_subjective_reflection': 4, 'explanation_subjective_motivation': 4, 'explanation_rationale': 'The detailed explanation is thorough and uses appropriate STEM terms. It effectively addr

In [32]:
import ast

# Convert 'evaluation' column from string to dictionary (if necessary)
eval_df['evaluation'] = eval_df['evaluation'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Extract numerical values from 'evaluation' column, excluding keys ending with '_rationale' or 'comments'
def extract_numeric_values(eval_dict):
    return {key: value for key, value in eval_dict.items() 
            if isinstance(value, (int, float)) and not (key.endswith('_rationale') or key == 'comments')}

# Convert 'evaluation' column to a DataFrame with extracted numerical values
numeric_eval_df = eval_df['evaluation'].apply(lambda x: extract_numeric_values(x)).apply(pd.Series)

# Compute mean and variance for each metric
metrics_stats = numeric_eval_df.agg(['mean', 'var'])

# Display the results
display(metrics_stats)


Unnamed: 0,summary_accuracy,summary_language,summary_usefulness,summary_completeness,summary_subjective,explanation_accuracy_terms,explanation_accuracy_vocab,explanation_usefulness_objectives,explanation_usefulness_definitions,explanation_usefulness_bloom,...,explanation_examples,explanation_subjective_engagement,explanation_subjective_teaching,explanation_subjective_reflection,explanation_subjective_motivation,references_accuracy,references_usefulness,context_accuracy,context_completeness,index
mean,3.9375,3.9375,3.854167,3.916667,3.395833,3.958333,3.958333,3.958333,3.958333,3.958333,...,3.958333,3.479167,3.416667,3.270833,3.354167,3.833333,3.8125,3.9375,3.9375,23.5
var,0.1875,0.1875,0.254876,0.205674,0.371897,0.083333,0.083333,0.083333,0.083333,0.083333,...,0.083333,0.297429,0.29078,0.286791,0.276152,0.269504,0.283245,0.1875,0.1875,196.0
