In [2]:
# install libraries required
!pip install pydantic-evals   # evaluation framework
!pip install pydantic-ai      # the LLM judge interface
!pip install openai           # used as the judge model




In [3]:
# load libraries
import json
import os
# from google import genai
from google.colab import userdata
from pydantic import BaseModel, Field
from pydantic_evals.evaluators import Evaluator, EvaluationResult
from pydantic_ai import Agent


In [4]:
# import pydantic_evals.evaluators
# print(dir(pydantic_evals.evaluators))

In [5]:
# Exploratory check: list the names exposed by the top-level pydantic_evals
# package (e.g. to confirm which submodules and classes are available).
import pydantic_evals
print(dir(pydantic_evals))

['Case', 'Dataset', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_utils', 'dataset', 'evaluators', 'increment_eval_metric', 'otel', 'reporting', 'set_eval_attribute']


In [6]:
# load input files

# Load the rubric file. JSON is UTF-8 by specification, so the encoding is
# passed explicitly; relying on the platform default can corrupt non-ASCII
# characters (curly quotes, accents) on systems whose locale is not UTF-8.
with open("rub_it_0002.json", "r", encoding="utf-8") as f:
    rubric_data = json.load(f)

# Load the LLM-generated feedback that will be judged.
with open("FG_Feedback_Results_with_Scores.json", "r", encoding="utf-8") as f:
    feedback_data = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'rub_it_0002.json'

In [None]:
# extract rubric, submission, and feedback
# Pull out the three pieces used downstream in one step:
#   - the assignment rubric criteria,
#   - the final submission of the first entry in "submissions",
#   - the feedback text of the first entry in the feedback file.
rubric, student_submission, llm_feedback = (
    rubric_data["rubric"]["criteria"],
    rubric_data["submissions"][0]["final_submission"],
    feedback_data[0]["feedback"],
)


In [None]:
# Show each extracted input under its label, followed by a blank line.
for label, content in (
    ("Rubric: ", rubric),
    ("Submission: ", student_submission),
    ("Feedback: ", llm_feedback),
):
    print(label, content, "\n")

In [None]:
# define the meta-rubric for the llm judge

META_RUBRIC = """
ou must evaluate the **quality** of the feedback produced by an LLM.

Evaluate the feedback only using the following meta-rubric:

### META-RUBRIC

1. **Accuracy**
   Does the feedback correctly reflect the studentâ€™s submission?

2. **Specificity**
   Is the feedback concrete, detailed, and supported with examples rather than vague advice?

3. **Constructiveness**
   Does the feedback guide the student on how to improve?

4. **Alignment with assignment rubric**
   Does the feedback clearly reference (and stay aligned with) the criteria used to judge the work?

5. **Tone and clarity**
   Is the feedback easy to understand, supportive, and professionally written?

### SCORING INSTRUCTIONS
For each dimension, give a score from **0 to 10** and provide a justification.
Then provide an overall score and a summary.
"""


In [None]:
# Define a sub-model for a single rubric criterion
class CriterionResult(BaseModel):
    """Score and justification for a single meta-rubric dimension.

    The 0-10 range is enforced by validation (``ge``/``le``) rather than only
    described in text, so an out-of-range score returned by the judge is
    rejected instead of silently skewing the results.
    """

    score: int = Field(..., ge=0, le=10, description="The score (0-10) for this dimension.")
    explanation: str = Field(..., description="The justification for the score.")

In [None]:
# Define the main output model with explicit fields for each criterion
class JudgeOutput(BaseModel):
    """Structured verdict the judge model must return.

    One ``CriterionResult`` per meta-rubric dimension, plus an overall score
    and a short summary.
    """

    # One entry per dimension of the meta-rubric.
    accuracy: CriterionResult
    specificity: CriterionResult
    constructiveness: CriterionResult
    alignment: CriterionResult
    tone_clarity: CriterionResult

    # An average of 0-10 scores must itself lie in [0, 10]; the bounds reject
    # an overall score reported on a different scale.
    overall_score: float = Field(
        ...,
        ge=0,
        le=10,
        description="The average of the five scores, rounded to one decimal place."
    )
    summary: str = Field(
        ...,
        description="A short summary of the feedback's strengths and weaknesses."
    )

In [None]:
# create the llm-as-judge evaluator

# Copy the OpenAI key from the Colab user secrets into the environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
print("API Key set:", os.environ.get('OPENAI_API_KEY') is not None)

# Judge agent. The model is named with an explicit "<provider>:<model>" id;
# a bare "gpt-5-mini" relies on provider inference from the name prefix.
judge = Agent(model="openai:gpt-5-mini")

class MetaRubricEvaluator(Evaluator):
    """LLM-as-judge evaluator that scores feedback against ``META_RUBRIC``."""

    async def evaluate(self, case):
        """Ask the judge agent to grade the feedback and wrap its verdict.

        The prompt is assembled from the module-level ``student_submission``,
        ``llm_feedback`` and ``META_RUBRIC``; ``case`` is accepted but not
        read by this implementation.

        Returns:
            An ``EvaluationResult`` named ``"Meta_Rubric_Score"`` whose value
            is the judge's overall score and whose reason is its summary.
        """
        judge_prompt = f"""
You are an expert academic evaluator.

The student's submission is:
--------------------
{student_submission}
--------------------

The feedback you must evaluate is:
--------------------
{llm_feedback}
--------------------

Use the following meta-rubric:
{META_RUBRIC}

Now score the feedback strictly according to the meta-rubric.
"""
        # Request a structured JudgeOutput instead of free-form text.
        run = await judge.run(judge_prompt, output_type=JudgeOutput)
        verdict = run.output

        # NOTE(review): `source` receives the judge's model name (a string);
        # confirm this matches what EvaluationResult expects in the installed
        # pydantic-evals version.
        return EvaluationResult(
            name="Meta_Rubric_Score",
            value=verdict.overall_score,
            reason=verdict.summary,
            source=judge.model.model_name,
        )

In [None]:
# run the evaluation

evaluator = MetaRubricEvaluator()

case = {"id": "case1"}
result = await evaluator.evaluate(case)
result


In [None]:
# # print full result
# print("\nFull Result Object:")
# print(result)

# Print the score and the "Reason:" header, then show the reason text itself
# as the cell's output value.
print(f"Score: {result.value}\nReason:")
result.reason
