In [1]:
import os, pandas as pd
import json
import time
from openai import OpenAI
import together
import random
from dotenv import load_dotenv
import google.generativeai as genai
import anthropic
from concurrent.futures import ThreadPoolExecutor
from itertools import combinations


In [2]:
# === Load Keys ===
# Credentials are read from environment variables rather than hardcoded;
# load_dotenv() is called first, presumably to populate them from a local
# .env file. os.getenv returns None for any variable that is not set.
load_dotenv()
together.api_key = os.getenv("TOGETHER_API_KEY")
client = together.Together()  # Together client used by call_model_tog
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # used by call_model_openai
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))  # module-level config for call_model_gemini
claude_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # used by call_model_claude


# Model identifiers passed to the call_model_* helpers.
# MODEL_GENERATOR = "gemini-2.5-flash"
MODEL_mx = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # not referenced in the visible cells
MODEL_gpt = "gpt-4.1-2025-04-14"  # answers questions and also acts as the comparison judge
MODEL_o3 = "o3"
MODEL_cld = "claude-3-5-sonnet-20241022"  # not referenced in the visible cells
MODEL_cld4 = "claude-sonnet-4-20250514"
MODEL_gemini = "gemini-2.5-pro"
MODEL_gemini_flash = "gemini-2.5-flash"

In [3]:
# === Utility: Call LLM ===
# ---openai---
def call_model_openai(prompt: str, model: str) -> str:
    """Send a single-turn user prompt to an OpenAI chat model and return its reply.

    Args:
        prompt: Text sent as the only user message.
        model: OpenAI model identifier.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the API returns no text content.

    Raises:
        Whatever the OpenAI client raises for request failures; errors are not
        swallowed here.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    content = response.choices[0].message.content
    # `content` is nullable in the API response (e.g. refusals); calling
    # .strip() on None would raise an opaque AttributeError.
    return (content or "").strip()

#---gemini---

def call_model_gemini(prompt: str, model: str) -> str:
    """Ask a Gemini model for a completion and return the stripped text.

    Args:
        prompt: Text passed to ``generate_content``.
        model: Gemini model name used to build the ``GenerativeModel``.

    Returns:
        ``response.text`` with leading/trailing whitespace removed.
    """
    # Separate local name so the `model` argument keeps meaning "model id".
    gemini = genai.GenerativeModel(model_name=model)
    reply_text = gemini.generate_content(prompt).text
    return reply_text.strip()

# ---together---

def call_model_tog(prompt: str, model: str) -> str:
    """Send a single-turn user prompt to a Together-hosted chat model.

    Args:
        prompt: Text sent as the only user message.
        model: Together model identifier.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the response carries no text content.

    Raises:
        Whatever the Together client raises for request failures.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0
    )
    content = response.choices[0].message.content
    # Guard against a missing content field instead of failing on None.strip().
    return (content or "").strip()

# ---claude---

def call_model_claude(prompt: str, model: str) -> str:
    """Ask a Claude model for a completion, never raising to the caller.

    Args:
        prompt: Text sent as the only user message.
        model: Anthropic model identifier.

    Returns:
        The text of the first content block, stripped of surrounding
        whitespace. On any exception the error is printed and the sentinel
        string "[Error]" is returned instead.
    """
    request = {
        "model": model,
        "max_tokens": 1024,
        "temperature": 0.7,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        reply = claude_client.messages.create(**request)
        first_block = reply.content[0]
        return first_block.text.strip()
    except Exception as err:
        # Best-effort: report the failure and hand back a sentinel answer.
        print("[Claude Error]", err)
        return "[Error]"

In [4]:


# === Step 5: Get Final Answers from All Models ===
def get_final_answer(question_json: str) -> dict:
    """Query every answering model in parallel for a final boxed answer.

    Args:
        question_json: The question text, interpolated verbatim into the prompt.

    Returns:
        Dict mapping a model key ("gpt_o3", "gpt_4_1", "claude_sonnet_4",
        "gemini_2_5_pro", "gemini_flash") to that model's reply string.

    Raises:
        Any exception raised by one of the model calls propagates when its
        result is collected.
    """
    prompt = f"""
You are a highly precise and expert-level science problem solver.

Your task is to solve the following question and return **only the final boxed numeric result with proper SI units**, without showing any steps or reasoning.

---

### STRICT OUTPUT RULES:
- Do NOT show any working, steps, or explanations.
- Do NOT return anything other than the **final boxed answer**.
- Use correct scientific symbols and SI units (e.g., `mol`, `kg·m/s²`, `J/K`, `g/L`, `nm`, `m/s`).
- If symbolic constants (e.g., `π`, `ln(2)`, `e`, `R`) are involved, evaluate numerically to **at least 3 significant digits**.
- If the question has no answer or is malformed, return exactly: `"ERROR: Unsolvable or incomplete input."`

---

### SCIENCE QUESTION INPUT:
{question_json}

---

### FINAL OUTPUT:
**Strictly** Return just a single string with the *boxed numeric result*. *No markdown*, *no JSON*.
→ `"v = 2.67 m/s"`  
→ `"ΔH = 35.8 kJ/mol"`  
→ `"F = 9.81 N"`  
→ `"Rate = 0.23 mol/L·s"`  
→ `"ERROR: Unsolvable or incomplete input."`
"""

    # (result key, caller, model id), listed in submission order.
    jobs = (
        ("gpt_o3", call_model_openai, MODEL_o3),
        ("gpt_4_1", call_model_openai, MODEL_gpt),
        ("claude_sonnet_4", call_model_claude, MODEL_cld4),
        ("gemini_2_5_pro", call_model_gemini, MODEL_gemini),
        ("gemini_flash", call_model_gemini, MODEL_gemini_flash),
    )

    with ThreadPoolExecutor() as pool:
        pending = {
            key: pool.submit(caller, prompt, model_id)
            for key, caller, model_id in jobs
        }
        answers = {}
        for key, future in pending.items():
            # Blocks until this model's reply (or exception) is available.
            answers[key] = future.result()
        return answers
    

In [5]:

# === Step 6: Compare Answers ===
def _parse_json_object(text: str) -> dict:
    """Parse a JSON object from a model reply, tolerating code fences or extra prose."""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Models often wrap JSON in ```json fences or add commentary despite
        # instructions; fall back to the outermost {...} span.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the comparison model, got {type(parsed).__name__}"
        )
    return parsed


def compare_answers(ans1: str, ans2: str) -> dict:
    """Ask the judge model whether two final answers express the same result.

    Args:
        ans1: First model's final answer.
        ans2: Second model's final answer.

    Returns:
        The judge's verdict parsed from JSON; the prompt requests the keys
        "similarity_score", "decision" ("similar" or "different") and "comment".

    Raises:
        ValueError: If the reply contains no parseable JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    prompt = f"""
You are a high-accuracy **scientific answer comparison engine**. Your role is to compare two final answers generated by different models for **science-based questions** (Physics, Chemistry, or Biology).

Your task is to evaluate whether both answers express the **same scientific result**, accounting for:
- Unit compatibility
- Numerical closeness
- Symbolic or structural equivalence
- Scientific consistency

---

### Evaluation Rubric

Each of the following contributes to a **similarity score** between 0.0 and 1.0 (total 100%). The final decision is based on the score.

| Criterion                    | Weight | Description |
|-----------------------------|--------|-------------|
| 1. Unit or Label Match      | 0.25   | Full score if units are the same or dimensionally equivalent (e.g., kJ/mol ≈ kcal/mol, N·m vs J, g vs mg). Applies to biological labels too (e.g., 'allele frequency', 'enzyme activity'). |
| 2. Numerical Closeness      | 0.30   | Based on relative error. Full score if relative error < 1%, partial if between 1–2%, zero if >2%. |
| 3. Symbolic/Expression Match| 0.15   | Covers symbolic equivalence, constants (π ≈ 3.14), or known scientific expressions (e.g., `ln(2)` ≈ 0.693, `RT` vs `8.314T`). |
| 4. Rounding & Notation      | 0.10   | Accept differences in decimal points, scientific notation, or minor formatting. |
| 5. Structural/Scientific Equivalence | 0.20 | Applies to alternate representations that are scientifically identical (e.g., `ΔH = +25 kJ/mol` vs `Enthalpy change = 25 kJ/mol`). |

---

---

###  Final Decision Rule

- If total score **≥ 0.80**, return: `"decision": "similar"`
- If total score **< 0.80**, return: `"decision": "different"`

---

###  Example:

**Answer 1:** `1/√2 m/s`  
**Answer 2:** `0.707 m/s`
Evaluation:

- Units: m/s = m/s → 0.25
- Numeric: 1/√2 ≈ 0.707 → 0.30
- Symbolic/Decimal: equivalent → 0.15
- Rounding: acceptable → 0.10
- Expression: scalar match → 0.20

**Total score = 1.00 → "similar"**

**Answer 1:** `ΔG = -45.2 kJ/mol`  
**Answer 2:** `Gibbs free energy = -45.2 kJ/mol`  
→ Score = 1.00 → "similar"

**Answer 1:** `C₆H₁₂O₆ + 6O₂ → 6CO₂ + 6H₂O`  
**Answer 2:** `glucose reacts with oxygen to form carbon dioxide and water`  
→ Score = ~0.85 → "similar"

---

### Your Task

Compare the following two answers and return a **valid JSON object** in this format:

{{
  "similarity_score": <float between 0.0 and 1.0>,
  "decision": "similar" or "different",
  "comment": "<brief technical explanation>"
}}

Only return the JSON object. Do NOT include Markdown, LaTeX formatting, or extra commentary.
### Input:
Answer 1: {ans1}  
Answer 2: {ans2}
"""
    response = call_model_openai(prompt, MODEL_gpt)
    return _parse_json_object(response)


In [6]:
# === STEP: Compare all model answers ===
def compare_all_model_answers(answer_dict: dict) -> list:
    """Run the pairwise comparison over every unordered pair of model answers.

    Args:
        answer_dict: Mapping of model key to that model's answer.

    Returns:
        One verdict dict per pair, in ``itertools.combinations`` order, each
        extended with "model_1" and "model_2" naming the compared models.
    """
    verdicts = []
    for left, right in combinations(answer_dict.items(), 2):
        left_name, left_answer = left
        right_name, right_answer = right
        verdict = compare_answers(left_answer, right_answer)
        verdict.update(model_1=left_name, model_2=right_name)
        verdicts.append(verdict)
    return verdicts

In [7]:
# === Evaluate agreement and label difficulty ===
def evaluate_difficulty(comparison_results: list, answer_dict: dict) -> dict:
    """Summarise pairwise agreement between models and derive a difficulty label.

    Args:
        comparison_results: Pairwise verdict dicts, each with a "decision"
            string and a numeric "similarity_score".
        answer_dict: Mapping of model key to that model's answer.

    Returns:
        Dict with "agreement_ratio", "average_similarity",
        "disagreement_ratio", "difficulty_label" and "unique_answer_count".
        The label is "hard" when more than 80% of pairs disagree, "medium"
        when more than 50% disagree, otherwise "easy". With no comparisons
        (fewer than two answers) the ratios are None and the label is
        "unknown", because agreement is undefined.
    """
    unique_answers = len(set(answer_dict.values()))
    total = len(comparison_results)

    if total == 0:
        # No pairs to compare: avoid dividing by zero and report "unknown".
        return {
            "agreement_ratio": None,
            "average_similarity": None,
            "disagreement_ratio": None,
            "difficulty_label": "unknown",
            "unique_answer_count": unique_answers
        }

    similar = sum(1 for r in comparison_results if r["decision"] == "similar")
    avg_score = sum(r["similarity_score"] for r in comparison_results) / total
    agreement_ratio = similar / total
    disagreement_ratio = 1 - agreement_ratio

    if disagreement_ratio > 0.8:
        label = "hard"
    elif disagreement_ratio > 0.5:
        label = "medium"
    else:
        label = "easy"

    return {
        "agreement_ratio": round(agreement_ratio, 2),
        "average_similarity": round(avg_score, 3),
        "disagreement_ratio": round(disagreement_ratio, 2),
        "difficulty_label": label,
        "unique_answer_count": unique_answers
    }

In [8]:
# === MAIN PIPELINE ===
def process_question_row(question_text):
    """Run answer collection, pairwise comparison and difficulty scoring for one question.

    Args:
        question_text: The question passed to ``get_final_answer``.

    Returns:
        A flat dict of output columns: one "<model>_answer" per model, the
        agreement metrics, the difficulty label, the unique-answer count and
        "comparison_json" (the pairwise verdicts serialised as JSON). If any
        step raises, every column is "error" except "comparison_json", which
        holds the exception message.
    """
    # (output column, key in the answers dict)
    answer_columns = (
        ("gpt_o3_answer", "gpt_o3"),
        ("gpt_4_1_answer", "gpt_4_1"),
        ("claude_sonnet_4_answer", "claude_sonnet_4"),
        ("gemini_2_5_pro_answer", "gemini_2_5_pro"),
        ("gemini_flash_answer", "gemini_flash"),
    )
    metric_columns = (
        "disagreement_ratio",
        "agreement_ratio",
        "difficulty_label",
        "unique_answer_count",
    )

    try:
        answers = get_final_answer(question_text)
        pairwise = compare_all_model_answers(answers)
        difficulty = evaluate_difficulty(pairwise, answers)

        row = {column: answers.get(key) for column, key in answer_columns}
        for metric in metric_columns:
            row[metric] = difficulty[metric]
        row["comparison_json"] = json.dumps(pairwise)
        return row

    except Exception as err:
        # Keep the same column layout so failed rows still fit the output frame.
        failed = {column: "error" for column, _ in answer_columns}
        failed.update((metric, "error") for metric in metric_columns)
        failed["comparison_json"] = str(err)
        return failed


In [9]:
# === File Runner ===
def run_verification_pipeline(input_csv_path, output_csv_path):
    """Process every question in a CSV and write the augmented rows to a new CSV.

    Args:
        input_csv_path: Path of the UTF-8 input CSV; must have a "question" column.
        output_csv_path: Path the result CSV is written to (without the index).

    Each output row is the input row merged with the columns returned by
    ``process_question_row``; on a name clash the computed column wins.
    """
    questions_df = pd.read_csv(input_csv_path, encoding='utf-8')
    print(f"Processing {len(questions_df)} questions...")

    def _augment(idx, row):
        # Progress label uses idx+1, so it assumes the default integer index.
        print(f"Processing Q{idx+1}")
        row_data = process_question_row(row["question"])
        return {**row, **row_data}

    merged_rows = [_augment(idx, row) for idx, row in questions_df.iterrows()]

    pd.DataFrame(merged_rows).to_csv(output_csv_path, index=False)
    print(f" Saved results to {output_csv_path}")

In [10]:
# Run the whole pipeline: read questions from testing.csv and write the
# per-question answers and difficulty columns to testing_phds_output.csv.
run_verification_pipeline("testing.csv", "testing_phds_output.csv")


Processing 37 questions...
Processing Q1


Processing Q2
Processing Q3
Processing Q4
Processing Q5
Processing Q6
Processing Q7
Processing Q8
Processing Q9
Processing Q10
Processing Q11
Processing Q12
Processing Q13
Processing Q14
Processing Q15
Processing Q16
Processing Q17
Processing Q18
Processing Q19
Processing Q20
Processing Q21
Processing Q22
Processing Q23
Processing Q24
Processing Q25
Processing Q26
Processing Q27
Processing Q28
Processing Q29
Processing Q30
Processing Q31
Processing Q32
Processing Q33
Processing Q34
Processing Q35
Processing Q36
Processing Q37
 Saved results to testing_phds_output.csv
