# LLM as judge 
This notebook uses an LLM as a judge: it scores each generated response against a rubric and returns the scores and rationales as structured (JSON) output.



In [None]:
import pickle
from pathlib import Path

import pandas as pd

def load_pickled_records(path):
    """Return the object stored in the pickle file at ``path``.

    The two files read below are used as lists of record dicts (they are
    passed to ``len()``, ``list.extend`` and ``pd.DataFrame``).

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising.
    Only point this at files produced by a trusted pipeline.
    """
    with open(path, "rb") as input_file:
        return pickle.load(input_file)


# Plain relative names resolve against the working directory on every OS.
# The previous r".\name.pkl" spelling only resolves on Windows.
graph_data = load_pickled_records("output_data_records_graph.pkl")
reason_data = load_pickled_records("llm_reason_output_data.pkl")

print(f"len(graph_data): {len(graph_data)}")
print(f"len(reason_data): {len(reason_data)}")

#merge the two lists together
reason_data.extend(graph_data)
print(f"len(reason_data): {len(reason_data)}")
print(reason_data[:2])
# Convert to DataFrame
# NOTE(review): only graph_data is evaluated below; the merged reason_data list
# is built but not used. Switch to pd.DataFrame(reason_data) to judge every
# record - confirm which one is intended.
df_llm_records = pd.DataFrame(graph_data) #pd.DataFrame(reason_data)


len(graph_data): 2
len(reason_data): 18
len(reason_data): 20
[{'index': 0, 'type': 'Basic', 'test_case': 's0r0', 'student': '\n        Background: Recent college graduate with a degree in Business Administration.\n        Strengths: Strong organizational and project management skills.\n        Weaknesses: Limited mathematical background; no prior programming experience.\n        Preferences: Prefers real-world applications, interactive learning, and visualizations.\n        Prior Course History: \n        - Introduction to Business Mathematics\n        - Basic Statistics for Managers\n    ', 'request': 'Help me understand how eigenvalues relate to matrix transformations. Provide content that visually explains this concept and its applications in data analysis.', 'context': '', 'response': '**1. Summary:**\nEigenvalues and eigenvectors are like the unique "fingerprints" of a matrix, describing how it transforms vectors. They\'re crucial in data analysis as they help identify patterns an

In [19]:
# Show the last (up to five) loaded records as a rendered table.
display(df_llm_records.tail(n=5))

Unnamed: 0,index,type,test_case,student,request,context,response
0,0,GR_kg_no_refine3_0.85,s1r0,\n Background: Graduate student pursuin...,Help me understand how eigenvalues relate to p...,Keywords: Explanation of the relationship betw...,---\n\n**1. Summary:**\nEigenvalues of a posit...
1,1,GR_kg_no_refine3_0.75,s1r0,\n Background: Graduate student pursuin...,Help me understand how eigenvalues relate to p...,Keywords: Recent college graduate in Business ...,```python\nimport numpy as np\nimport matplotl...


In [20]:
# Show the generated response of the record whose index label is 0.
display(df_llm_records.loc[0, 'response'])

'---\n\n**1. Summary:**\nEigenvalues of a positive definite matrix are all positive, which is a key characteristic that helps define positive definiteness. We\'ll explore this relationship using a Python-based example to illustrate the concept.\n\n**2. Detailed Explanation:**\n\nPositive definite matrices play a crucial role in various fields, including optimization, machine learning, and statistics. To understand how eigenvalues relate to positive definite matrices, let\'s first recall the definition of a positive definite matrix:\n\nA symmetric matrix A is positive definite if, for any non-zero vector v, the inequality v^T * A * v > 0 holds true.\n\nNow, let\'s consider the relationship between eigenvalues and positive definite matrices. The eigenvalues of a positive definite matrix A are all real and positive. This is because the quadratic form v^T * A * v can be expressed as a sum of squares of the eigenvalues, each multiplied by the corresponding eigenvector component. Since the e

In [None]:
from pydantic import BaseModel, Field
from huggingface_hub import InferenceClient
import json
import ast
import tqdm as tqdm

# Define the detailed evaluation schema based on the rubric
def _likert(criterion: str):
    # Required score field; every score description ends with the scale "(1-4)".
    return Field(..., description=f"{criterion} (1-4)")


def _rationale(section: str):
    # Required free-text field explaining the scores of one rubric section.
    return Field(..., description=f"Two sentences explaining the rationale for the {section} scores")


class EvaluationScores(BaseModel):
    # Structured judge output: one integer score per rubric criterion, one
    # rationale string per rubric section, and overall comments.
    # Deliberately documented with comments instead of a docstring: pydantic
    # copies a model docstring into the generated JSON schema, and that schema
    # is sent to the LLM.

    # I. Summary
    summary_accuracy: int = _likert("I.A.1: Correctly summarizes material to address learning objectives")
    summary_language: int = _likert("I.A.2: STEM-defined terms are used correctly")
    summary_usefulness: int = _likert("I.B.1: Uses appropriate language for the student level")
    summary_completeness: int = _likert("I.C.1: Summarizes all material from the Detailed Explanation")
    summary_subjective: int = _likert("I.D.1: Uses engaging language")
    summary_rationale: str = _rationale("summary")

    # II. Detailed Explanation
    explanation_accuracy_terms: int = _likert("II.A.1: Uses appropriate STEM-defined terms")
    explanation_accuracy_vocab: int = _likert("II.A.2: Content-specific vocabulary is used")
    explanation_usefulness_objectives: int = _likert("II.B.1: Addresses expected learning objectives")
    explanation_usefulness_definitions: int = _likert("II.B.2: Defines key concepts")
    explanation_usefulness_bloom: int = _likert("II.B.3: Addresses Bloom’s Taxonomy level")
    explanation_usefulness_level: int = _likert("II.B.4: Correct material for student level")
    explanation_usefulness_examples: int = _likert("II.B.5: Clear, understandable examples")
    explanation_usefulness_microlearning: int = _likert("II.B.6: Can be consumed as micro-learning (~7-15 mins)")
    explanation_completeness: int = _likert("II.C.1: Covers all learning objectives")
    explanation_references: int = _likert("II.C.2: Includes specific references for each learning objective")
    explanation_examples: int = _likert("II.C.5: Includes concrete examples for abstract concepts")
    explanation_subjective_engagement: int = _likert("II.D.1: Uses engaging language for learners")
    explanation_subjective_teaching: int = _likert("II.D.2: Uses scaffolding/differentiation/prior knowledge")
    explanation_subjective_reflection: int = _likert("II.D.3: Provides opportunities for reflection and closure")
    explanation_subjective_motivation: int = _likert("II.D.4: Communicates enthusiasm for learning")
    explanation_rationale: str = _rationale("explanation")

    # III. References
    references_accuracy: int = _likert("III.A.1: Directs to real (non-hallucinated) references")
    references_usefulness: int = _likert("III.B.1: Offers resources appropriate for learner level")
    references_rationale: str = _rationale("references")

    # IV. Context
    context_accuracy: int = _likert("IV.A.1: Uses correct/appropriate context")
    context_completeness: int = _likert("IV.C.1: Uses context to address learning objectives")
    context_rationale: str = _rationale("context")

    comments: str = Field(..., description="Overall comments on evaluation and suggestions for improvement")


# Judge instructions plus the full rubric, written as markdown. This text is
# placed at the start of every evaluation prompt built in the loop below.
# It describes the 1-4 Likert scale, the four response areas (Summary,
# Detailed Explanation, References, Context), the four scoring dimensions,
# and the per-criterion identifiers (I.A.1 ... IV.C.1).
# The text is runtime input to the model: any edit, including whitespace,
# changes what the judge sees.
# NOTE: the identifier is misspelled ("PROPMT"); it is kept as-is because the
# evaluation loop refers to it by this exact name.
SYSTEM_PROPMT_w_RUBRIC = """
You are a strict evaluator tasked with assessing student responses based on their background and request. Use the provided rubric to assign scores and provide constructive feedback. Each response should be scored on a 1-4 Likert Scale according to the defined criteria.

### **Evaluation Rubric**
Each item is assessed using a 1-4 Likert scale:
- **4** - Strongly Agree / Meets Expectations 
- **3** - Somewhat Agree / Mostly Meets Expectations 
- **2** - Somewhat Disagree / Partially Meets Expectations 
- **1** - Strongly Disagree / Does Not Meet Expectations 

The rubric evaluates four key response areas:
1. **Summary** - Overall summary of the content.
2. **Detailed Explanation** - Depth and clarity of the response.
3. **References** - Use of accurate and relevant sources.
4. **Context** - Appropriateness and alignment with the learning objectives.

Each response is evaluated based on the following four dimensions:

- **Accuracy** – Presents correct information and terminology with no hallucinations.
- **Usefulness** – Appropriately addresses learning objectives and applies applicable reference material.
- **Completeness** – Covers all objectives outlined in the prompt.
- **Subjective Measures** – Uses engaging, pedagogically sound language to motivate learners.

---

### **Evaluation Criteria**

#### **Summary**
- **Accuracy**  
  - I.A.1: Correctly summarizes material to address learning objectives.
  - I.A.2: Uses appropriate STEM-defined terms.
- **Usefulness**  
  - I.B.1: Uses language appropriate for the student level (Beginner/Intermediate/Advanced).
- **Completeness**  
  - I.C.1: Summarizes all key material from the Detailed Explanation.
- **Subjective Measures**  
  - I.D.1: Uses engaging and accessible language.

---

#### **Detailed Explanation**
- **Accuracy**  
  - II.A.1: Uses correct STEM-defined terms.  
  - II.A.2: Incorporates content-specific vocabulary.  
- **Usefulness**  
  - II.B.1: Addresses all expected learning objectives.  
  - II.B.2: Defines key concepts clearly.  
  - II.B.3: Aligns with Bloom’s Taxonomy (explicitly or implicitly).  
  - II.B.4: Appropriate for student’s level.  
  - II.B.5: Provides clear, understandable examples.  
  - II.B.6: Can be consumed as micro-learning (~7-15 mins).  
- **Completeness**  
  - II.C.1: Covers all relevant learning objectives.  
  - II.C.2: Includes references for each learning objective.  
  - II.C.5: Includes concrete examples for abstract concepts.  
- **Subjective Measures**  
  - II.D.1: Uses engaging language.  
  - II.D.2: Applies scaffolding and differentiation techniques.  
  - II.D.3: Encourages reflection and closure.  
  - II.D.4: Communicates enthusiasm and motivates learning.  

---

#### **References**
- **Accuracy**  
  - III.A.1: Directs to real (non-hallucinated) sources.  
- **Usefulness**  
  - III.B.1: Resources are appropriate for learner level.  
- **Rationale**  
  - Provide 2 sentences explaining reference choices.  

---

#### **Context**
- **Accuracy**  
  - IV.A.1: Uses appropriate context.  
- **Completeness**  
  - IV.C.1: Context aligns with learning objectives.  
- **Rationale**  
  - Provide 2 sentences explaining the context evaluation.  

---

### **Final Notes**
- Assign a **1-4 score** for each category.
- Provide **brief, constructive feedback** for each section.
- Use **clear and concise language** to explain the rationale for scores.

"""


# Initialize the LLM client
class StructuredTextGeneration:
    """Text-generation client that constrains the model output to a JSON schema.

    The schema is taken from a pydantic model and used twice per call: it is
    appended to the prompt as text and passed as the ``grammar`` argument of
    the inference request.
    """

    def __init__(self, endpoint_url: str, output_structure: BaseModel, timeout: int = 300, max_new_tokens: int = 2000, temperature: float = 0.6):
        """Create the inference client and capture the output schema.

        Args:
            endpoint_url: Address handed to ``InferenceClient``.
            output_structure: Object providing ``model_json_schema()``; this
                notebook passes the pydantic model class itself.
            timeout: Timeout forwarded to ``InferenceClient``.
            max_new_tokens: Generation length limit sent with each request.
            temperature: Sampling temperature sent with each request.
        """
        self.client = InferenceClient(endpoint_url, timeout=timeout)
        self.schema = output_structure.model_json_schema()
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens

    def invoke(self, prompt: str) -> str:
        """Send ``prompt`` plus the schema to the endpoint and return its reply.

        Args:
            prompt: Evaluation prompt; the serialised schema is appended to it.

        Returns:
            Whatever ``client.text_generation`` returns for the request.
        """
        schema_text = json.dumps(self.schema)
        full_prompt = f"{prompt}\n\nPlease structure the response using the following schema: {schema_text}"
        grammar_spec = {"type": "json", "value": self.schema}
        return self.client.text_generation(
            prompt=full_prompt,
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature,
            grammar=grammar_spec,
        )

# Build the judge client used by the evaluation loop.
# NOTE(review): the endpoint value holds only a port (":8090"); the host part
# appears to be missing - set the full URL of the llama-3.3-70b-awq server.
endpoint_url = ":8090" #llama-3.3-70b-awq
textgen = StructuredTextGeneration(
    endpoint_url=endpoint_url,
    output_structure=EvaluationScores,
)


# Generate and evaluate responses
# For every record: build the judge prompt, call the LLM, parse its JSON reply,
# and collect one result dict per successfully parsed evaluation.
evals = []
failed_test_cases = []  # test cases whose judge reply was not valid JSON


for i, row in tqdm.tqdm(df_llm_records.iterrows(), total=len(df_llm_records), desc="Evaluating responses"):
    # The rubric scores a Context section (IV.A.1 / IV.C.1), so the judge must
    # be shown the record's context; it was previously left out of the prompt.
    # Presumably this column holds the retrieved context used for generation -
    # TODO confirm. Missing, NaN, or blank values are labelled explicitly.
    context = row.get("context", "")
    context_text = context if isinstance(context, str) and context.strip() else "(no context provided)"

    eval_prompt = f"""{SYSTEM_PROPMT_w_RUBRIC}

    **Student Profile:** {row["student"]}
    **Student Request:** {row["request"]}
    **Provided Context:** {context_text}
    **Generated Response:** {row["response"]}

    Evaluate the response using the provided rubric.
    """

    # Get evaluation from LLM
    response = textgen.invoke(eval_prompt)

    # Parse JSON response; a failed parse is reported and remembered, and the
    # loop moves on to the next record.
    try:
        eval_result = json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing JSON for test case {row['test_case']}")
        failed_test_cases.append(row["test_case"])
        continue

    eval_result["index"] = i  # Retain DataFrame index for reference
    evals.append({
        "type": row["type"],
        "test_case": row["test_case"],
        "student": row["student"],
        "request": row["request"],
        "response": row["response"],
        "evaluation": eval_result
    })

# Convert evaluations to a DataFrame
eval_df = pd.DataFrame(evals)

# Make dropped records visible instead of leaving them as scattered prints.
if failed_test_cases:
    print(f"{len(failed_test_cases)} of {len(df_llm_records)} evaluations could not be parsed: {failed_test_cases}")




Evaluating responses:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating responses: 100%|██████████| 2/2 [00:36<00:00, 18.17s/it]


In [22]:
# row = df_llm_records.iloc[0]
# eval_prompt = f"""{SYSTEM_PROPMT_w_RUBRIC}

#     **Student Profile:** {row["student"]}
#     **Student Request:** {row["request"]}
#     **Generated Response:** {row["response"]}

#     Evaluate the response using the provided rubric.
#     """

# # Get evaluation from LLM
# response = textgen.invoke(eval_prompt)
# print(response)

In [23]:
def extract_numeric_values(eval_dict):
    """Return the numeric entries of one evaluation dict.

    Keeps every item whose value is an ``int`` or ``float`` (``bool`` values
    pass this check too, since ``bool`` subclasses ``int``) and skips keys
    that end with ``_rationale`` or equal ``comments``. Key order is kept.

    Args:
        eval_dict: Mapping of evaluation field name to value.

    Returns:
        A new dict holding only the numeric, non-rationale, non-comment items.
    """
    return {key: value for key, value in eval_dict.items()
            if isinstance(value, (int, float)) and not (key.endswith('_rationale') or key == 'comments')}


# Flatten the 'evaluation' dicts into one numeric column per score.
# The guard keeps this cell re-runnable: after the first run the 'evaluation'
# column has been replaced by the score columns, and when no evaluation was
# parsed the column never exists. Both cases used to raise KeyError.
if 'evaluation' in eval_df.columns:
    # Convert 'evaluation' values from string to dictionary (if necessary)
    parsed_evaluations = eval_df['evaluation'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # One row of numeric scores per evaluation, aligned on eval_df's index.
    # Any other integer key stored in the evaluation dict (such as the
    # "index" added when it was parsed) becomes a column as well.
    numeric_eval_df = pd.DataFrame(
        [extract_numeric_values(evaluation) for evaluation in parsed_evaluations],
        index=eval_df.index,
    )

    # Replace the nested 'evaluation' column with the flat score columns
    eval_df = pd.concat([eval_df.drop(columns=['evaluation']), numeric_eval_df], axis=1)

# Save the evaluation results
eval_df.to_csv("evaluations_output.csv", index=False)

In [24]:
response_dir = "response_files"

# response_dir was defined but never used, so the files landed in the working
# directory. Create the folder up front and write every file inside it.
output_dir = Path(response_dir)
output_dir.mkdir(parents=True, exist_ok=True)

#write each response to a text file
for i, row in eval_df.iterrows():
    # One UTF-8 text file per evaluated record, named after its DataFrame index.
    with open(output_dir / f"response_text_{i}.txt", "w", encoding="utf-8") as text_file:
        text_file.write(f"Student Profile: {row['student']}\n")
        text_file.write(f"Student Request: {row['request']}\n")
        text_file.write(f"Generated Response: {row['response']}\n\n")


In [25]:
# #print the entire evaluations field for the first row
# print(f"eval_df['evaluation'][0] = {eval_df['evaluation'][0]}")
# print(len(eval_df['evaluation']))
# print(f"eval_df columns: {eval_df.columns}")
# print(eval_df.head())
# print(eval_df.dtypes)


In [26]:
# # old summary metrics

# # import ast

# # Convert 'evaluation' column from string to dictionary (if necessary)
# eval_df['evaluation'] = eval_df['evaluation'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# # Extract numerical values from 'evaluation' column, excluding keys ending with '_rationale' or 'comments'
# def extract_numeric_values(eval_dict):
#     return {key: value for key, value in eval_dict.items() 
#             if isinstance(value, (int, float)) and not (key.endswith('_rationale') or key == 'comments')}

# # Convert 'evaluation' column to a DataFrame with extracted numerical values
# numeric_eval_df = eval_df['evaluation'].apply(lambda x: extract_numeric_values(x)).apply(pd.Series)

# # Compute mean and variance for each metric
# metrics_stats = numeric_eval_df.agg(['mean', 'var'])

# # Display the results
# display(metrics_stats)
