## Import Libraries

In [74]:
from ollama import chat
from pydantic import BaseModel
from typing import Optional, Any, Dict
import json
from pathlib import Path

## Set Environment

In [75]:
# Path of the JSONL file holding the model outputs to be judged (edit to point at your file).
INPUT_FILE = "Milestone-5/math-agent/qwen3-32b-eval.balanced.jsonl"
# Destination JSONL file; one judgment object is written per evaluated entry.
OUTPUT_FILE = "qwen3-32b.judgments.jsonl"
# Name of the Ollama model used as the judge (edit if you use a different model).
MODEL_NAME = "gemma3:4b"

## Set Rubric

In [76]:
# Structured verdict the judge model must return for one evaluated entry.
# The JSON schema of this model is passed to the judge as the required output
# format, and the judge's reply is validated against it.
# NOTE: documentation is kept in `#` comments on purpose; pydantic copies a
# class docstring into the generated JSON schema, which would alter what is
# sent to the judge.
class Rubric(BaseModel):
    # True when the generated answer matches the reference answer.
    correct_answer: bool
    # Rating of how simple/fast the solution approach was; the judge prompt
    # asks for 1 (very poor) to 10 (excellent). The range is not enforced here.
    did_it_solve_in_easy_and_fast_approach: int
    # True when the answer is incomplete or the model said it cannot continue.
    did_it_stop_midway: bool
    # Rating of explanation clarity; the judge prompt asks for 1 to 10
    # (not enforced here).
    easy_to_understand_explanation: int
    # Optional short free-text remarks from the judge.
    notes: Optional[str] = None
    # Optional overall score; the judge prompt asks for a value between 0 and 1
    # (not enforced here).
    score: Optional[float] = None

## Define Judge Prompt

In [77]:
# Prompt template sent to the judge model.
# Placeholders filled in at evaluation time:
#   {Rubric.model_json_schema()} - literal marker replaced with the rubric schema
#   {q}     - the question
#   {gen}   - the answer produced by the model under evaluation
#   {exact} - the reference ("exact") answer the judge must compare against
# This is a plain (non-f) string, so the markers stay verbatim until substituted.
JUDGE_PROMPT = """You are a careful, strict math grader.

Given the QUESTION, the MODEL GENERATED ANSWER, and the EXACT ANSWER, produce a single JSON object matching this schema exactly (no extra keys):

{Rubric.model_json_schema()}

Return only valid JSON.

QUESTION: {q}

MODEL GENERATED ANSWER: {gen}

EXACT ANSWER: {exact}

Notes: decide `correct_answer` by whether the generated answer matches the exact answer (for numeric answers compare numerically when possible). Rate `did_it_solve_in_easy_and_fast_approach` from 1 (very poor) to 10 (excellent). Set `did_it_stop_midway` true if the answer is incomplete or the model says it cannot continue. Rate `easy_to_understand_explanation` from 1 to 10. Provide short `notes` if necessary and optional `score` between 0 and 1.
"""

## Judge Function

In [78]:
def _first_present(entry: Dict[str, Any], keys) -> Any:
    """Return the first value stored under *keys* that is neither None nor ''.

    Unlike an ``a or b or c`` chain this keeps legitimate falsy values such as
    the numeric answer ``0``. Returns ``''`` when nothing is found.
    """
    for key in keys:
        value = entry.get(key)
        if value is not None and value != '':
            return value
    return ''


def _extract_generated_answer(entry: Dict[str, Any]) -> Any:
    """Return the generated answer found in *entry*, or '' when there is none.

    ``response.choices[0]`` is inspected first (``message.content``, then
    ``text``, then ``content``); if that yields nothing the top-level
    ``response_text`` / ``generated_answer`` keys are tried.
    """
    gen = ''
    resp_obj = entry.get('response')
    if isinstance(resp_obj, dict):
        choices = resp_obj.get('choices')
        if isinstance(choices, list) and choices:
            first_choice = choices[0]
            if isinstance(first_choice, dict):
                msg = first_choice.get('message')
                if isinstance(msg, dict) and 'content' in msg:
                    gen = msg['content']
                elif 'text' in first_choice:
                    gen = first_choice.get('text')
                elif 'content' in first_choice:
                    gen = first_choice.get('content')
    if not gen:
        gen = entry.get('response_text') or entry.get('generated_answer') or ''
    return gen


def _build_judge_prompt(template: str, schema_text: str, q: Any, gen: Any, exact: Any) -> str:
    """Fill the known placeholders of *template* in a single pass.

    ``str.format`` is avoided on purpose: the JSON schema text is full of
    braces that ``format`` would try to interpret. A single regex pass also
    guarantees that placeholder-looking text inside a substituted value
    (e.g. LaTeX such as ``\\frac{p}{q}``) is never substituted again.
    """
    import re  # local import keeps this block self-contained

    def as_text(value: Any) -> str:
        return '' if value is None else str(value)

    values = {
        'Rubric.model_json_schema()': schema_text,
        'q': as_text(q),
        'gen': as_text(gen),
        'exact': as_text(exact),
    }
    pattern = r'\{(Rubric\.model_json_schema\(\)|q|gen|exact)\}'
    # A callable replacement is used so backslashes in the values stay literal.
    return re.sub(pattern, lambda match: values[match.group(1)], template)


def _extract_response_content(resp: Any) -> Any:
    """Return the text payload of a chat response (object- or dict-shaped)."""
    if isinstance(resp, dict):
        message = resp.get('message')
    else:
        message = getattr(resp, 'message', None)

    content = None
    if isinstance(message, dict):
        content = message.get('content')
    elif message is not None:
        content = getattr(message, 'content', None)

    if content is None:
        # Unknown response shape: fall back to the most text-like representation.
        if isinstance(resp, str):
            content = resp
        elif isinstance(resp, dict):
            content = resp.get('content') or json.dumps(resp, ensure_ascii=False)
        else:
            content = str(resp)
    return content


def evaluate_entry_with_llm(entry: Dict[str, Any]) -> Rubric:
    """Ask the judge model to grade one JSONL entry and return its Rubric.

    The question is read from ``prompt`` (falling back to ``question``), the
    generated answer from ``response.choices[0].message.content`` (with a few
    small fallbacks), and the reference answer from the first present key among
    ``exact-answer``, ``exact_answer``, ``reference`` and ``answer``.

    Args:
        entry: One decoded line of the input JSONL file.

    Returns:
        The Rubric parsed and validated from the judge model's JSON reply.

    Raises:
        RuntimeError: If the reply cannot be parsed or validated as a Rubric.
        Errors raised by the chat call itself are propagated unchanged.
    """
    q = entry.get('prompt')
    if q is None:
        q = entry.get('question', '')

    gen = _extract_generated_answer(entry)
    exact = _first_present(entry, ('exact-answer', 'exact_answer', 'reference', 'answer'))

    # The schema is a dict; it must be serialized before being placed in the prompt.
    schema = Rubric.model_json_schema()
    schema_text = json.dumps(schema, ensure_ascii=False, indent=2)
    prompt = _build_judge_prompt(JUDGE_PROMPT, schema_text, q, gen, exact)

    resp = chat(
        messages=[{'role': 'user', 'content': prompt}],
        model=MODEL_NAME,
        format=schema,
    )

    content = _extract_response_content(resp)

    try:
        rubric = Rubric.model_validate_json(content)
    except Exception:
        # Second attempt: decode with the stdlib parser, then validate the object.
        try:
            parsed = json.loads(content) if isinstance(content, str) else content
            rubric = Rubric.model_validate(parsed)
        except Exception as e:
            raise RuntimeError(f'Failed to parse/validate rubric from model response: {e}') from e

    return rubric


In [79]:


# def simple_fallback_evaluate(entry: Dict[str, Any]) -> Rubric:
#     # Lightweight heuristic fallback when the LLM client is unavailable or errors occur
#     gen = (entry.get('generated_answer') or '').strip()
#     exact = entry.get('exact-answer')
#     correct = False
#     # Try numeric comparison when possible
#     try:
#         token = gen.split()[0] if gen.split() else ''
#         gval = float(token)
#         evalv = float(exact)
#         correct = abs(gval - evalv) < 1e-6
#     except Exception:
#         # fallback to simple string compare (case-insensitive, stripped)
#         if isinstance(exact, str) and gen.lower() == exact.strip().lower():
#             correct = True
#     stopped = gen.endswith('...') or 'cannot' in gen.lower() or "can't" in gen.lower()
#     easy = 8 if len(gen.split()) <= 30 else max(1, 10 - (len(gen.split()) // 50))
#     fast = 8 if len(gen.split()) <= 50 else 5
#     score = 1.0 if correct else 0.0
#     return Rubric(correct_answer=correct, did_it_solve_in_easy_and_fast_approach=(easy+fast)//2, did_it_stop_midway=stopped, easy_to_understand_explanation=easy, notes=None, score=score)




## Call Judge

In [80]:
# Main loop: read input JSONL and produce output JSONL with judgments
in_path = Path(INPUT_FILE)
out_path = Path(OUTPUT_FILE)

if not in_path.exists():
    raise FileNotFoundError(f'Input file not found: {in_path.resolve()}')


def _heuristic_rubric(entry, error):
    """Build a Rubric from simple text heuristics when the LLM judge failed.

    Args:
        entry: One decoded line of the input JSONL file.
        error: The exception that made the LLM judgment fail; recorded in
            the rubric's ``notes``.

    Returns:
        A Rubric whose ``score`` is 1.0 when the answer matched the reference
        and 0.0 otherwise.
    """
    # Best-effort extraction of the generated answer.
    gen = ''
    resp_obj = entry.get('response')
    if isinstance(resp_obj, dict):
        choices = resp_obj.get('choices')
        if isinstance(choices, list) and choices:
            first_choice = choices[0]
            if isinstance(first_choice, dict):
                msg = first_choice.get('message')
                if isinstance(msg, dict) and 'content' in msg:
                    gen = msg['content']
                elif 'text' in first_choice:
                    gen = first_choice.get('text')
                elif 'content' in first_choice:
                    gen = first_choice.get('content')
    if not gen:
        # 'answer' is deliberately NOT consulted here: it is one of the
        # reference-answer keys, and using it as the generated answer would
        # grade the reference against itself.
        gen = entry.get('response_text') or entry.get('generated_answer') or ''

    # Reference answer: first key that is present and non-empty. An explicit
    # check (instead of an `or` chain) keeps valid falsy answers such as 0.
    exact = ''
    for key in ('exact-answer', 'exact_answer', 'reference', 'answer'):
        candidate = entry.get(key)
        if candidate is not None and candidate != '':
            exact = candidate
            break

    is_text = isinstance(gen, str)

    correct = False
    # Without a reference answer nothing can be judged correct (previously an
    # empty answer compared equal to the empty reference).
    if is_text and exact != '':
        try:
            # Numeric comparison of the first whitespace-separated token.
            first_token = gen.split()[0]
            correct = abs(float(first_token) - float(exact)) < 1e-6
        except (IndexError, ValueError, TypeError):
            # Not numeric: case-insensitive exact string match.
            correct = isinstance(exact, str) and gen.strip().lower() == exact.strip().lower()

    stopped = is_text and (
        gen.strip().endswith('...') or 'cannot' in gen.lower() or "can't" in gen.lower()
    )
    word_count = len(str(gen).split())
    easy = 8 if is_text and word_count <= 30 else max(1, 10 - (word_count // 50))
    fast = 8 if is_text and word_count <= 50 else 5

    return Rubric(
        correct_answer=correct,
        did_it_solve_in_easy_and_fast_approach=(easy + fast) // 2,
        did_it_stop_midway=stopped,
        easy_to_understand_explanation=easy,
        notes=f'Fallback used due to error: {error}',
        score=1.0 if correct else 0.0,
    )


with in_path.open('r', encoding='utf-8') as fin, out_path.open('w', encoding='utf-8') as fout:
    for i, line in enumerate(fin, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        entry = json.loads(line)
        try:
            # Try structured LLM judgment first
            rubric = evaluate_entry_with_llm(entry)
        except Exception as e:
            # On errors (eg. no ollama client), fall back to simple heuristics
            print(f'Warning: evaluation failed for entry {i}: {e}; using fallback heuristic')
            rubric = _heuristic_rubric(entry, e)

        output = {
            'judgment': rubric.model_dump(),
        }
        fout.write(json.dumps(output, ensure_ascii=False) + '\n')

print(f'Finished evaluating. Output written to: {out_path.resolve()}')


Finished evaluating. Output written to: /home/nailsonseat/StudioProjects/CourseGPT-Pro-DSAI-Lab-Group-6/Milestone-5/math-agent/qwen3-32b.judgments.jsonl
