## Import Libraries

In [2]:
from ollama import chat
from pydantic import BaseModel
from typing import Optional, Any, Dict
import json
from pathlib import Path

ModuleNotFoundError: No module named 'ollama'

## Set Environment

In [None]:
# JSONL file with the responses to grade; point this at your own data.
INPUT_FILE = 'responses.jsonl'
# JSONL file that receives one judgment record per graded entry.
OUTPUT_FILE = 'judgments.jsonl'
# Name of the model handed to the ollama chat call; swap in another if needed.
MODEL_NAME = 'gemma3:4b'

## Set Rubric

In [None]:
# Structured verdict for one graded answer. Its JSON schema is passed to the
# chat call as the requested output format, and the model's reply is validated
# against it. Numeric ranges below come from the judge prompt's instructions;
# they are NOT enforced by this model (fields are plain int / float).
class Rubric(BaseModel):
    # True when the generated answer matches the exact answer.
    correct_answer: bool
    # Rating of how easy/fast the solution approach was; prompt asks for 1-10.
    did_it_solve_in_easy_and_fast_approach: int
    # True when the answer is incomplete or the model says it cannot continue.
    did_it_stop_midway: bool
    # Rating of how understandable the explanation is; prompt asks for 1-10.
    easy_to_understand_explanation: int
    # Optional short free-text remarks from the judge.
    notes: Optional[str] = None
    # Optional overall score; prompt asks for a value between 0 and 1.
    score: Optional[float] = None

## Define Judge Prompt

In [None]:
# Prompt template for the judge model. Fill it with
# ``JUDGE_PROMPT.format(q=..., gen=..., exact=...)``.
#
# The Rubric JSON schema is rendered into the template once, here. The
# earlier version embedded ``{Rubric.model_json_schema()}`` in a plain
# (non-f) string, which ``str.format`` parsed as a replacement field named
# ``Rubric`` and rejected with ``KeyError`` on every call. The schema's own
# braces are doubled so that ``.format`` emits them literally and only
# ``{q}``, ``{gen}`` and ``{exact}`` remain as placeholders.
_RUBRIC_SCHEMA_TEXT = (
    json.dumps(Rubric.model_json_schema(), indent=2)
    .replace('{', '{{')
    .replace('}', '}}')
)

JUDGE_PROMPT = (
    """You are a careful, strict math grader.

Given the QUESTION, the MODEL GENERATED ANSWER, and the EXACT ANSWER, produce a single JSON object matching this schema exactly (no extra keys):

"""
    + _RUBRIC_SCHEMA_TEXT
    + """

Return only valid JSON.

QUESTION: {q}

MODEL GENERATED ANSWER: {gen}

EXACT ANSWER: {exact}

Notes: decide `correct_answer` by whether the generated answer matches the exact answer (for numeric answers compare numerically when possible). Rate `did_it_solve_in_easy_and_fast_approach` from 1 (very poor) to 10 (excellent). Set `did_it_stop_midway` true if the answer is incomplete or the model says it cannot continue. Rate `easy_to_understand_explanation` from 1 to 10. Provide short `notes` if necessary and optional `score` between 0 and 1.
"""
)

## Judge Function

In [None]:
def evaluate_entry_with_llm(entry: Dict[str, Any]) -> Rubric:
    """Have the judge model grade one entry and return the parsed rubric.

    The ``question``, ``generated_answer`` and ``exact-answer`` values are
    read from *entry* (absent keys yield ``None``) and substituted into
    ``JUDGE_PROMPT``. The prompt is sent as a single user message to
    ``MODEL_NAME``, with the Rubric JSON schema as the requested format.

    Nothing is caught here: errors from prompt formatting, the chat call,
    or validation of the reply propagate to the caller.
    """
    prompt = JUDGE_PROMPT.format(
        q=entry.get('question'),
        gen=entry.get('generated_answer'),
        exact=entry.get('exact-answer'),
    )
    user_message = {'role': 'user', 'content': prompt}

    response = chat(
        messages=[user_message],
        model=MODEL_NAME,
        format=Rubric.model_json_schema(),
    )

    # The reply text must parse as JSON that satisfies the Rubric model.
    return Rubric.model_validate_json(response.message.content)

In [None]:


# def simple_fallback_evaluate(entry: Dict[str, Any]) -> Rubric:
#     # Lightweight heuristic fallback when the LLM client is unavailable or errors occur
#     gen = (entry.get('generated_answer') or '').strip()
#     exact = entry.get('exact-answer')
#     correct = False
#     # Try numeric comparison when possible
#     try:
#         token = gen.split()[0] if gen.split() else ''
#         gval = float(token)
#         evalv = float(exact)
#         correct = abs(gval - evalv) < 1e-6
#     except Exception:
#         # fallback to simple string compare (case-insensitive, stripped)
#         if isinstance(exact, str) and gen.lower() == exact.strip().lower():
#             correct = True
#     stopped = gen.endswith('...') or 'cannot' in gen.lower() or "can't" in gen.lower()
#     easy = 8 if len(gen.split()) <= 30 else max(1, 10 - (len(gen.split()) // 50))
#     fast = 8 if len(gen.split()) <= 50 else 5
#     score = 1.0 if correct else 0.0
#     return Rubric(correct_answer=correct, did_it_solve_in_easy_and_fast_approach=(easy+fast)//2, did_it_stop_midway=stopped, easy_to_understand_explanation=easy, notes=None, score=score)




## Call Judge

In [None]:
# Main loop: read the input JSONL line by line, judge each entry, and write
# one JSON object per entry to the output JSONL.
in_path = Path(INPUT_FILE)
out_path = Path(OUTPUT_FILE)

# Check the input before the output file is opened (and truncated) below.
if not in_path.exists():
    raise FileNotFoundError(f'Input file not found: {in_path.resolve()}')

with in_path.open('r', encoding='utf-8') as fin, out_path.open('w', encoding='utf-8') as fout:
    for i, line in enumerate(fin, start=1):
        line = line.strip()
        if not line:
            continue

        # A malformed line should not abort the run and leave a half-written
        # output file; report it and move on.
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            print(f'Warning: skipping line {i}: invalid JSON ({e})')
            continue
        if not isinstance(entry, dict):
            print(f'Warning: skipping line {i}: expected a JSON object, got {type(entry).__name__}')
            continue

        output = {
            'question': entry.get('question'),
            'generated_answer': entry.get('generated_answer'),
            'exact-answer': entry.get('exact-answer'),
        }
        try:
            rubric = evaluate_entry_with_llm(entry)
        except Exception as e:
            # No fallback evaluator is available, so record the failure
            # explicitly. Previously `rubric` was left unbound (NameError on
            # the first failure) or still held the previous entry's verdict,
            # which was then written out for the wrong entry.
            print(f'Warning: evaluation failed for entry {i}: {e}; writing null judgment')
            output['judgment'] = None
            output['error'] = f'{type(e).__name__}: {e}'
        else:
            output['judgment'] = rubric.model_dump()

        fout.write(json.dumps(output, ensure_ascii=False) + '\n')

print(f'Finished evaluating. Output written to: {out_path.resolve()}')