In [1]:
# install libraries required
!pip install deepeval         # evaluation framework
# !pip install pydantic-ai      # the LLM judge interface
!pip install openai           # used as the judge model


Collecting deepeval
  Downloading deepeval-3.7.5-py3-none-any.whl.metadata (18 kB)
Collecting click<8.3.0,>=8.0.0 (from deepeval)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc<2.0.0,>=1.24.0 (from deepeval)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=5.4.0 (from deepeval)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting pyfiglet (from deepeval)
  Downloading pyfiglet-1.0.4-py3-none-any.whl.metadata (7.4 kB)
Collecting pytest-asyncio (from deepeval)
  Downloading pytest_asyncio-1.3.0-py3-none-any.whl.metadata (4.1 kB)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.4-py3-none-any.whl.metadata (4.9 kB)
Collecting pytest-rerunfailures (from deepeval)
  Downloading pytest_rerunfailures-16.1-py3-n

In [9]:
# load libraries
import json
import os
# from google import genai
from google.colab import userdata
# from pydantic import BaseModel, Field
# from pydantic_evals.evaluators import Evaluator, EvaluationResult
# from pydantic_ai import Agent

# DeepEval imports for G-Eval
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

In [None]:
# import pydantic_evals.evaluators
# print(dir(pydantic_evals.evaluators))

In [None]:
# import pydantic_evals
# print(dir(pydantic_evals))

['Case', 'Dataset', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_utils', 'dataset', 'evaluators', 'increment_eval_metric', 'otel', 'reporting', 'set_eval_attribute']


In [3]:
# load input files
# The encoding is given explicitly: JSON text is UTF-8, while open() without
# an encoding falls back to the platform's locale setting.

# rubric, submissions and prompt
with open("rub_it_0002.json", "r", encoding="utf-8") as f:
    rubric_data = json.load(f)

# LLM-generated feedback (with scores)
with open("FG_Feedback_Results_with_Scores.json", "r", encoding="utf-8") as f:
    feedback_data = json.load(f)


In [4]:
# pull out the three pieces the judge works with:
# the rubric criteria, the first student submission and the first feedback entry
# NOTE(review): assumes submissions[0] and feedback_data[0] refer to the same
# student — confirm against the input files
rubric = rubric_data["rubric"]["criteria"]

first_submission = rubric_data["submissions"][0]
student_submission = first_submission["final_submission"]

first_feedback = feedback_data[0]
llm_feedback = first_feedback["feedback"]


In [5]:
# show each judge input, followed by a blank line
for label, payload in (
    ("Rubric: ", rubric),
    ("Submission: ", student_submission),
    ("Feedback: ", llm_feedback),
):
    print(label, payload, "\n")

Rubric:  [{'criterion_id': 'c1', 'name': 'Conceptual Understanding', 'description': 'Demonstrates deep understanding of the topic and related concepts.', 'performance_descriptors': {'excellent': 'Shows precise and in-depth understanding with clear theoretical support.', 'good': 'Understands key concepts well with minor gaps.', 'average': 'Adequate explanation with some inaccuracies.', 'needs_improvement': 'Surface-level or confused understanding.', 'poor': 'Major misunderstandings or misrepresentations.'}, 'weight': 15}, {'criterion_id': 'c2', 'name': 'Application to Real-World Scenarios', 'description': 'Applies concepts meaningfully to examples or scenarios.', 'performance_descriptors': {'excellent': 'Uses insightful and relevant examples clearly linked to concepts.', 'good': 'Examples mostly appropriate with some gaps in explanation.', 'average': 'Examples present but superficial or unclear connections.', 'needs_improvement': 'Vague or generic examples.', 'poor': 'No real-world appl

In [6]:
# define the meta-rubric for the llm judge

META_RUBRIC = """
ou must evaluate the **quality** of the feedback produced by an LLM.

Evaluate the feedback only using the following meta-rubric:

### META-RUBRIC

1. **Accuracy**
   Does the feedback correctly reflect the student’s submission?

2. **Specificity**
   Is the feedback concrete, detailed, and supported with examples rather than vague advice?

3. **Constructiveness**
   Does the feedback guide the student on how to improve?

4. **Alignment with assignment rubric**
   Does the feedback clearly reference (and stay aligned with) the criteria used to judge the work?

5. **Tone and clarity**
   Is the feedback easy to understand, supportive, and professionally written?

### SCORING INSTRUCTIONS
For each dimension, give a score from **0 to 10** and provide a justification.
Then provide an overall score and a summary.
"""


In [None]:
# # Define a sub-model for a single rubric criterion
# class CriterionResult(BaseModel):
#     score: int = Field(..., description="The score (0-10) for this dimension.")
#     explanation: str = Field(..., description="The justification for the score.")

In [None]:
# # Define the main output model with explicit fields for each criterion
# class JudgeOutput(BaseModel):
#     accuracy: CriterionResult
#     specificity: CriterionResult
#     constructiveness: CriterionResult
#     alignment: CriterionResult
#     tone_clarity: CriterionResult

#     overall_score: float = Field(
#         ...,
#         description="The average of the five scores, rounded to one decimal place."
#     )
#     summary: str = Field(
#         ...,
#         description="A short summary of the feedback's strengths and weaknesses."
#     )

In [None]:
# # create the llm-as-judge evaluator

# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# print("API Key set:", os.environ.get('OPENAI_API_KEY') is not None)

# # judge = Agent(model="gpt-5-mini, retries=3")
# judge = Agent(model="gpt-5-mini")

# class MetaRubricEvaluator(Evaluator):

#     async def evaluate(self, case):
#         prompt = f"""
# You are an expert academic evaluator.

# The student's submission is:
# --------------------
# {student_submission}
# --------------------

# The feedback you must evaluate is:
# --------------------
# {llm_feedback}
# --------------------

# Use the following meta-rubric:
# {META_RUBRIC}

# Now score the feedback strictly according to the meta-rubric.
# """
#         result = await judge.run(
#             prompt,
#             output_type=JudgeOutput # Pass the schema for structured output
#         )
#         # return EvaluationResult(model_output=result.output)
#         return EvaluationResult(
#             name="Meta_Rubric_Score", # Required field
#             value=result.output.overall_score, # Extract the scalar score
#             reason=result.output.summary, # Extract the summary as the reason
#             source=judge.model.model_name
#         )

API Key set: True


In [18]:
# hand the OpenAI key stored in the Colab secrets to the environment
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# test-case fields passed to the judge
judged_fields = [
    LLMTestCaseParams.INPUT,          # student submission
    LLMTestCaseParams.ACTUAL_OUTPUT,  # LLM-generated feedback
    LLMTestCaseParams.CONTEXT,        # original assignment rubric
]

# G-Eval metric that scores feedback against the meta-rubric defined above
meta_evaluator = GEval(
    name="Feedback Quality",
    criteria=META_RUBRIC,
    evaluation_params=judged_fields,
    # G-Eval works best with GPT-4 level models
    # (alternative tried: model="gpt-5-mini")
    model="gpt-4o",
)

In [16]:
# # run the evaluation

# evaluator = MetaRubricEvaluator()

# case = {"id": "case1"}
# result = await evaluator.evaluate(case)
# result


In [None]:
# # # print full result
# # print("\nFull Result Object:")
# # print(result)

# # print score and reason
# print(f"Score: {result.value}")
# print("Reason:")
# result.reason


Score: 8.4
Reason:


"Strengths: The feedback is positive, aligned with the rubric, and offers a clear, actionable next step (investigate case studies and bypass techniques). It also scores the student's performance across appropriate categories and maintains a supportive tone. Weaknesses: The feedback occasionally mislabels the submission as 'prompts' instead of an essay, and it is light on concrete examples and specific resources or revision tasks. To improve, the educator should cite brief examples from the submission to justify each rubric score and recommend specific case studies or readings the student could examine."

In [19]:
# build the G-Eval test case
#   input         -> the student's work
#   actual_output -> the feedback being graded
#   context       -> the original rubric (reference material), wrapped in a list
rubric_context = [str(rubric)]
test_case = LLMTestCase(
    input=student_submission,
    actual_output=llm_feedback,
    context=rubric_context,
)

# run G-Eval on the test case
meta_evaluator.measure(test_case)

Output()

G-Eval Score: 0.8234250424415979
Reasoning:
The feedback accurately reflects the student's comprehensive understanding of MFA, aligning well with the assignment rubric's criteria for conceptual understanding and critical evaluation. It provides specific suggestions for improvement, such as exploring case studies, which enhances its constructiveness. However, the feedback could be more specific by referencing particular sections of the student's work to improve specificity. The tone is supportive and clear, maintaining professional clarity throughout.


In [24]:
# print the score (two decimals) and a "Reason" heading,
# then display the judge's justification as the cell result
print(f"G-Eval Score: {meta_evaluator.score:.2f}", "Reason", sep="\n")
meta_evaluator.reason

G-Eval Score: 0.82
Reason


"The feedback accurately reflects the student's comprehensive understanding of MFA, aligning well with the assignment rubric's criteria for conceptual understanding and critical evaluation. It provides specific suggestions for improvement, such as exploring case studies, which enhances its constructiveness. However, the feedback could be more specific by referencing particular sections of the student's work to improve specificity. The tone is supportive and clear, maintaining professional clarity throughout."

In [21]:
# display the evaluation steps held by the metric
# NOTE(review): no steps were passed when the metric was built, so these are
# presumably derived from the criteria text — confirm against the DeepEval docs
meta_evaluator.evaluation_steps

["Evaluate the Accuracy by comparing the feedback to the student's submission to ensure it correctly reflects the content and intent of the student's work.",
 'Assess the Specificity by checking if the feedback includes concrete details and examples, rather than vague or general advice.',
 'Examine the Constructiveness by determining if the feedback provides clear guidance on how the student can improve their work.',
 'Check the Alignment with assignment rubric by ensuring the feedback references and adheres to the criteria outlined in the assignment rubric.',
 'Review the Tone and clarity by evaluating if the feedback is easy to understand, supportive, and professionally written.']