In [None]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch

ds = load_dataset("IUTVanguard/PhysicsEval", split="test")

def filter_category(row):
    return row["category"] in [
        "Classical Mechanics and Dynamics",
        "Fluid Mechanics and Continuum Dynamics"
    ]

filtered_ds = ds.filter(filter_category)

print(f"Filtered dataset size: {len(filtered_ds)}")



model_name = "ojus1/Qwen3-0.6B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

model.eval()

OUTPUT_FILE = "qwen_physics_outputs.jsonl"

with open(OUTPUT_FILE, "w") as f:
    for row in tqdm(filtered_ds):

        prompt = row["simplified_problem_statement"]

        messages = [
            {"role": "user", "content": prompt}
        ]

        inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
          return_dict=True,
          return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5000
            )

        gen_text = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        ).strip()

        record = {
            "problem_id": row["Problem_ID"],
            "category": row["category"],
            "question": row["simplified_problem_statement"],
            "model_output": gen_text,
            "reference_answer": row["final_answers_in_brief"],
        }

        f.write(json.dumps(record, ensure_ascii=False) + "\n")


In [None]:
%pip install jsonl2json

from jsonl2json import JsonlToJsonFormatter

jsonl = JsonlToJsonFormatter('qwen3-0.6B_eval.jsonl', 'qwen3-0.6B_eval.json')
jsonl.to_json()


Defaulting to user installation because normal site-packages is not writeable
Collecting jsonl2json
  Using cached jsonl2json-1.0.0-py3-none-any.whl (3.9 kB)
Installing collected packages: jsonl2json
Successfully installed jsonl2json-1.0.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: C:\Program Files\Python310\python.exe -m pip install --upgrade pip


In [7]:
!pip install huggingface_hub 
from huggingface_hub import notebook_login

notebook_login()


Collecting huggingface_hub
  Downloading huggingface_hub-1.2.3-py3-none-any.whl (520 kB)
     -------------------------------------- 521.0/521.0 kB 2.2 MB/s eta 0:00:00
Collecting packaging>=20.9
  Using cached packaging-25.0-py3-none-any.whl (66 kB)
Collecting pyyaml>=5.1
  Using cached pyyaml-6.0.3-cp310-cp310-win_amd64.whl (158 kB)
Collecting filelock
  Downloading filelock-3.20.2-py3-none-any.whl (16 kB)
Collecting hf-xet<2.0.0,>=1.2.0
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl (2.9 MB)
     ---------------------------------------- 2.9/2.9 MB 2.0 MB/s eta 0:00:00
Collecting typing-extensions>=3.7.4.3
  Using cached typing_extensions-4.15.0-py3-none-any.whl (44 kB)
Collecting fsspec>=2023.5.0
  Using cached fsspec-2025.12.0-py3-none-any.whl (201 kB)
Collecting shellingham
  Using cached shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)
Collecting tqdm>=4.42.1
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting typer-slim
  Downloading typer_slim-0.21.0-py3-none-any.


[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj=r"D:\physics-reasoning\smol-phy-reasoning\qwen3-0.6B_eval.jsonl",
    path_in_repo="qwen-0.6B_ans_physics_eval",
    repo_id="DrDrunkenstein22/qwen_eval_ans",
    repo_type="dataset",
)   

CommitInfo(commit_url='https://huggingface.co/datasets/DrDrunkenstein22/qwen_eval_ans/commit/3c357378f0eb83a6d5701071cd74d86be657ba93', commit_message='Upload qwen-0.6B_ans_physics_eval with huggingface_hub', commit_description='', oid='3c357378f0eb83a6d5701071cd74d86be657ba93', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DrDrunkenstein22/qwen_eval_ans', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DrDrunkenstein22/qwen_eval_ans'), pr_revision=None, pr_num=None)

In [None]:
# qwen revised eval answers set w/ elaborate solution steps for comparison 
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch

ds = load_dataset("IUTVanguard/PhysicsEval", split="test")

def filter_category(row):
    return row["category"] in [
        "Classical Mechanics and Dynamics",
        "Fluid Mechanics and Continuum Dynamics"
    ]

filtered_ds = ds.filter(filter_category)
print(f"Filtered dataset size: {len(filtered_ds)}")

model_name = "ojus1/Qwen3-0.6B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="balanced"   # on kaggle 2*t4 gpus faster than auto device mapping 
)
model.eval()

OUTPUT_FILE = "/kaggle/working/qwen_final_eval.json"

with open(OUTPUT_FILE, "w") as f:
    for row in tqdm(filtered_ds):
        prompt = row["simplified_problem_statement"]
        messages = [
            {"role": "user", "content": prompt}
        ]
        inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
          return_dict=True,
          return_tensors="pt",
        ).to(model.device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048, 
                do_sample=False, 
                temperature=1.0
            )
        
        gen_text = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        ).strip()
        
        record = {
            "problem_id": row["Problem_ID"],
            "category": row["category"],
            "question": row["simplified_problem_statement"],
            "model_output": gen_text,
            "elaborate_solution": row["elaborated_solution_steps"], 
            "reference_answer": row["final_answers_in_brief"],
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [12]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

load_dotenv()
Pineapple_api_key = os.getenv("ANANNAS_API_KEY")




def safe_json_loads(text):
    if not text or not text.strip():
        return None
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Try to extract first JSON object
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1:
            try:
                return json.loads(text[start:end+1])
            except json.JSONDecodeError:
                return None
        return None


def fetch_eval_prompt(problem_id, elaborate_solution, model_output, question):
    prompt = f"""You are an expert physics problem evaluator. Your task is to meticulously and STRICTLY evaluate an AI-generated solution based on its own merits and against the provided elaborated solution steps.

Evaluate the AI-generated solution based on the following categories and scoring guidelines. Provide your evaluation STRICTLY as a JSON object.

Evaluation Categories and Scoring Guidelines:

1. **mathematical_accuracy**: (Score 1-5)
   How correct are the AI's calculations, numerical answers, and units?
   * 5: All calculations, numerical results, and units are perfectly correct
   * 4: Minor calculation error, but underlying method is sound
   * 3: Several minor errors, or one significant calculation error
   * 2: Major calculation errors or fundamental misunderstandings
   * 1: Almost all calculations are incorrect or missing

2. **logical_consistency**: (Score 1-5)
   Does the AI solution follow a logical step-by-step progression?
   * 5: Perfect logical flow, impeccable reasoning
   * 4: Mostly logical, minor unclear step
   * 3: Some logical gaps or inconsistencies
   * 2: Significant logical flaws
   * 1: Illogical or incoherent

3. **completeness**: (Score 1-5)
   Does the AI-generated solution address all parts of the problem?
   * 5: All parts fully addressed
   * 4: Minor aspect overlooked
   * 3: Significant part ignored
   * 2: Only small portion addressed
   * 1: Largely unaddressed

4. **clarity_and_coherence**: (Score 1-5)
   Is the AI's explanation clear and easy to understand?
   * 5: Exceptionally clear and well-structured
   * 4: Clear with minor areas for improvement
   * 3: Generally understandable but verbose or unclear in parts
   * 2: Difficult to understand
   * 1: Incomprehensible

5. **formulas_principles**: (Score 1-5)
   Are correct physical formulas and principles applied correctly?
   * 5: All necessary formulas correctly identified and applied
   * 4: Mostly correct with minor errors
   * 3: Some incorrect formulas or significant misapplication
   * 2: Major errors in formula/principle selection
   * 1: Completely inappropriate formulas

6. **assumptions_made**: (Score 1-5)
   Are AI assumptions explicit, justified, and reasonable?
   * 5: All assumptions explicitly stated and well-justified
   * 4: Most assumptions stated and reasonable
   * 3: Some key assumptions missing or questionable
   * 2: Major unreasonable assumptions
   * 1: Inappropriate or absent assumptions

7. **overall_correctness**: (Score 0-10)
   How correct is the AI's approach and final answer overall?
   * 10: Perfect solution
   * 8-9: Excellent, very minor flaws
   * 6-7: Good, largely correct
   * 4-5: Partially correct
   * 2-3: Mostly incorrect
   * 0-1: Completely incorrect

Physics Question: {question}

Problem ID: {problem_id}

Elaborated Solution Steps: {elaborate_solution}

AI-Generated Solution to Evaluate: {model_output}

Provide ONLY a JSON object with the problem_id and scores for each category.

Example JSON format:
{{
  "problem_id": "{problem_id}",
  "mathematical_accuracy": <score_1_to_5>,
  "logical_consistency": <score_1_to_5>,
  "completeness": <score_1_to_5>,
  "clarity_and_coherence": <score_1_to_5>,
  "formulas_principles": <score_1_to_5>,
  "assumptions_made": <score_1_to_5>,
  "overall_correctness": <score_0_to_10>
}}
"""
    return prompt


def process_jsonl(in_path: Path, out_path: Path = None):
    """
    Process JSONL file and evaluate each entry using LLM judge
    """
    if out_path is None:
        out_path = in_path.parent / f"{in_path.stem}_evaluated.json"
    
    qwen_eval_data = []
    prob_ids_processed = set()
    
    # Load previously evaluated items if output file exists
    if os.path.exists(out_path):
        with open(out_path, 'r', encoding='utf-8') as f:
            judge_data = json.load(f)
            if isinstance(judge_data, list):
                qwen_eval_data.extend(i for i in judge_data if isinstance(i, dict))
                prob_ids_processed.update(i.get('problem_id') for i in qwen_eval_data if i.get('problem_id'))
        logger.info(f"Loaded {len(qwen_eval_data)} previously evaluated items from {out_path}")
    
    ids_left = []
    with open(in_path, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            problem_id = item.get('problem_id')
            if problem_id not in prob_ids_processed:
                if item.get('model_output') and item.get('elaborate_solution'):
                    ids_left.append(item)
    
    logger.info(f"Found {len(ids_left)} items to evaluate")
    
    if not ids_left:
        logger.info("No new items to evaluate")
        return qwen_eval_data
    
    client = OpenAI(
        base_url="https://api.anannas.ai/v1",
        api_key=Pineapple_api_key
    )
    
    for j, item in enumerate(ids_left):
        problem_id = item.get('problem_id')
        logger.info(f"Processing item {j + 1}/{len(ids_left)}: {problem_id}")
        
        prompt = fetch_eval_prompt(
            problem_id,
            item.get('elaborate_solution'),
            item.get('model_output'),
            item.get('question', '')
        )
        
        response = client.chat.completions.create(
            model="glm-4.5-flash", 
            messages=[
                {"role": "user", "content": prompt}
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "physics_evaluation",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "problem_id": {"type": "string"},
                            "mathematical_accuracy": {"type": "integer"},
                            "logical_consistency": {"type": "integer"},
                            "completeness": {"type": "integer"},
                            "clarity_and_coherence": {"type": "integer"},
                            "formulas_principles": {"type": "integer"},
                            "assumptions_made": {"type": "integer"},
                            "overall_correctness": {"type": "integer"}
                        },
                        "required": [
                            "problem_id", "mathematical_accuracy", "logical_consistency",
                            "completeness", "clarity_and_coherence", "formulas_principles",
                            "assumptions_made", "overall_correctness"
                        ],
                        "additionalProperties": False
                    }
                }
            },
            temperature=0.3,
            # max_tokens=1024,
            # top_p=0.5
        )
        
        content = response.choices[0].message.content
        logger.info(f"RAW MODEL OUTPUT:\n{repr(content)}")

        
        eval_result = safe_json_loads(content)
        if eval_result is None:
            logger.error(f"Invalid JSON for problem_id={problem_id}")
            continue

        qwen_eval_data.append(eval_result)
        
        # checkpointing evals 
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(qwen_eval_data, f, indent=2, ensure_ascii=False)
        
        logger.info(f"Evaluated {problem_id}: Overall correctness = {eval_result.get('overall_correctness')}/10")
    
    logger.info(f"Evaluation complete. Total evaluated: {len(qwen_eval_data)}")
    logger.info(f"Results saved to: {out_path}")
    
    return qwen_eval_data


if __name__ == "__main__":
    input_file = Path(r"D:\physics-reasoning\smol-phy-reasoning\qwen_final_phy_eval.json")
    output_file = Path(r"D:\physics-reasoning\smol-phy-reasoning\qwen_final_phy_eval_results.json")
    
    results = process_jsonl(input_file, output_file)
    print(f"\nEvaluation Summary:")
    print(f"Total items evaluated: {len(results)}")
    if results:
        avg_score = sum(r.get('overall_correctness', 0) for r in results) / len(results)
        print(f"Average overall correctness: {avg_score:.2f}/10")

2026-01-08 01:50:03,928 - INFO - Found 473 items to evaluate
2026-01-08 01:50:03,960 - INFO - Processing item 1/473: 010e7e8d-e9e0-4c65-a51b-9a906b13b160
2026-01-08 01:50:07,078 - INFO - HTTP Request: POST https://api.anannas.ai/v1/chat/completions "HTTP/1.1 503 Service Unavailable"
2026-01-08 01:50:07,096 - INFO - Retrying request to /chat/completions in 0.443823 seconds
2026-01-08 01:50:09,861 - INFO - HTTP Request: POST https://api.anannas.ai/v1/chat/completions "HTTP/1.1 503 Service Unavailable"
2026-01-08 01:50:09,864 - INFO - Retrying request to /chat/completions in 0.908772 seconds
2026-01-08 01:50:52,853 - INFO - HTTP Request: POST https://api.anannas.ai/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-08 01:50:52,863 - INFO - RAW MODEL OUTPUT:
'```json\n{\n  "problem_id": "010e7e8d-e9e0-4c65-a51b-9a906b13b160",\n  "mathematical_accuracy": 2,\n  "logical_consistency": 3,\n  "completeness": 3,\n  "clarity_and_coherence": 4,\n  "formulas_principles": 2,\n  "assumptions_made": 3,\n  

InternalServerError: Error code: 503 - {'error': {'message': 'all completion attempts failed: glm-4.5-flash (service temporarily unavailable due to high demand. Please retry in a moment)', 'type': 'api_error'}}