In [24]:
import os
import re
import json
import time
from pathlib import Path
from typing import List, Dict, Any, Optional

import requests
import pandas as pd
from tqdm import tqdm  # console progress bar
import numpy as np

# Ollama connection settings; each can be overridden through the environment
# variable of the same name, otherwise the literal default below is used.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1")

USE_CONTEXT = False          # default for the `use_context` argument of the judge helpers; False keeps CONTEXT out of the prompts
ACCEPT_THRESHOLD = 0.0       # judge_equivalence_once turns an "equivalent" verdict into False when the score is below this value
REQUEST_TIMEOUT_S = 30       # timeout, in seconds, passed to requests.post in _post_chat
MAX_RETRIES = 2              # number of POST attempts _post_chat makes before re-raising the last error
HEARTBEAT_EVERY = 25         # evaluate_file prints a heartbeat line whenever the progress count is a multiple of this

def _clean(s: Optional[str]) -> str:
    return (s or "").strip()

def _to_list(x: Any) -> List[Any]:
    if x is None:
        return []
    if isinstance(x, list):
        return x
    return [x]


In [25]:

def load_data(path: str) -> List[Dict[str, Any]]:
    """
    Read a dataset file and return its records as a list.

    Files whose suffix is ".jsonl" (case-insensitive) are read line by line,
    parsing every non-blank line as JSON. Any other file is parsed as a single
    JSON document: a top-level list is returned as-is, anything else is wrapped
    in a one-element list.

    Raises FileNotFoundError (carrying `path`) when the file does not exist.
    """
    src = Path(path)
    if not src.exists():
        raise FileNotFoundError(path)

    if src.suffix.lower() != ".jsonl":
        parsed = json.loads(src.read_text(encoding="utf-8"))
        return parsed if isinstance(parsed, list) else [parsed]

    with src.open("r", encoding="utf-8") as fh:
        stripped_lines = (raw.strip() for raw in fh)
        return [json.loads(text) for text in stripped_lines if text]

def extract_targets(obj: Dict[str, Any]) -> List[str]:
    """
    Return the cleaned, non-empty entries of obj["target"].

    The field is passed through _to_list, so a missing key yields [] and a
    single scalar value is treated as a one-element list.
    """
    cleaned = (_clean(item) for item in _to_list(obj.get("target", [])))
    return [text for text in cleaned if text]

def extract_candidates(obj: Dict[str, Any]) -> List[str]:
    """
    Flatten obj["clarified_all_ans"] into a single list of cleaned strings.

    The field is treated as a list of groups; each group is itself coerced
    with _to_list, every item is cleaned, and items that clean to "" are
    dropped. Order is preserved.
    """
    flattened: List[str] = []
    for group in _to_list(obj.get("clarified_all_ans", [])):
        cleaned = (_clean(item) for item in _to_list(group))
        flattened.extend(text for text in cleaned if text)
    return flattened

def to_context(obj: Dict[str, Any]) -> str:
    """
    Return the cleaned "input" field of a record ("" when the key is absent).

    Kept for compatibility; the value only reaches the prompts when the
    callers run with use_context=True.
    """
    raw_input = obj.get("input", "")
    return _clean(raw_input)


In [26]:
# System prompt for the binary equivalence call (sent by _judge_equivalent_flag).
# It asks for a JSON object with "equivalent" and "rationale" keys, which are
# the keys _judge_equivalent_flag reads from the parsed reply.
JUDGE_SYSTEM_PROMPT = (
    "You judge whether CANDIDATE is semantically equivalent to TARGET.\n"
    "Focus on meaning, not wording. Ignore extra fluff.\n"
    'Return ONLY JSON: {"equivalent": true|false, "rationale": "..."}\n'
    "- TRUE only if a grader would accept CANDIDATE in place of TARGET with no change of meaning.\n"
    "- Minor rephrasing/synonyms → may be TRUE. Broader/narrower/related terms → FALSE.\n"
)

# System prompt for the similarity-score call (sent by _judge_similarity_score).
# It asks for a JSON object with "score" (0..1) and "rationale" keys, which are
# the keys _judge_similarity_score reads from the parsed reply.
SCORE_SYSTEM_PROMPT = (
    "You assign a semantic similarity score between TARGET and CANDIDATE.\n"
    "Focus purely on meaning (0.0 = completely different, 1.0 = identical in meaning).\n"
    'Return ONLY JSON: {"score": 0..1, "rationale": "..."}\n'
    "- 1.0 if they express the same meaning with no substantive difference.\n"
    "- ~0.7–0.9 if they are very similar but not identical.\n"
    "- ~0.3–0.6 if partially related.\n"
    "- 0.0–0.2 if mostly or completely unrelated.\n"
)

def build_judge_prompt(target: str, candidate: str, context: str = "", use_context: bool = False) -> str:
    """
    Compose the user message for the equivalence judge.

    The message always contains a TARGET and a CANDIDATE section followed by
    the instruction line. A leading CONTEXT section (and the "in this context"
    wording) is added only when use_context is True; otherwise `context` is
    not used at all.
    """
    pair_section = f"TARGET:\n{target}\n\nCANDIDATE:\n{candidate}\n\n"
    if not use_context:
        return pair_section + "Decide if CANDIDATE expresses the same meaning as TARGET."
    return (
        f"CONTEXT:\n{context}\n\n"
        + pair_section
        + "Decide if CANDIDATE expresses the same meaning as TARGET in this context."
    )


def build_score_prompt(target: str, candidate: str, context: str = "", use_context: bool = False) -> str:
    """
    Compose the user message for the similarity-score judge.

    The message always contains a TARGET and a CANDIDATE section followed by
    the scoring instruction. A leading CONTEXT section (and the "in this
    context" wording) is added only when use_context is True; otherwise
    `context` is not used at all.
    """
    pair_section = f"TARGET:\n{target}\n\nCANDIDATE:\n{candidate}\n\n"
    if not use_context:
        return pair_section + "Assign a semantic similarity score (0.0–1.0) based on their meanings."
    return (
        f"CONTEXT:\n{context}\n\n"
        + pair_section
        + "Assign a semantic similarity score (0.0–1.0) based on their meanings in this context."
    )


In [27]:

def _post_chat(model: str, system: str, user: str, host: str, temperature: float = 0.0) -> str:
    """
    Call Ollama's /api/chat (non-streaming) and return the assistant message
    content as a stripped string.

    Parameters:
      model:       model name sent in the request body.
      system:      content of the system message.
      user:        content of the user message.
      host:        base URL of the Ollama server; a trailing "/" is tolerated.
      temperature: sampling temperature sent under "options".

    The request is attempted up to MAX_RETRIES times in total (always at least
    once), sleeping one second between attempts. Any exception from the
    request, the HTTP status check, or response decoding triggers the next
    attempt; the exception from the final attempt is re-raised unchanged.
    A response without a message/content yields "".
    """
    url = host.rstrip("/") + "/api/chat"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        "options": {"temperature": temperature},
    }

    # Guarantee at least one attempt: with MAX_RETRIES <= 0 the loop would be
    # skipped entirely and the function would silently return None.
    attempts = max(1, MAX_RETRIES)
    for attempt in range(1, attempts + 1):
        try:
            resp = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT_S)
            resp.raise_for_status()
            content = (resp.json().get("message", {}) or {}).get("content", "")
            return _clean(content)
        except Exception:
            if attempt == attempts:
                raise
            time.sleep(1.0)

def _parse_json_from_content(content: str) -> Dict[str, Any]:
    """
    Extracts the first {...} block and parses it as JSON. Returns {} on failure.
    """
    m = re.search(r"\{.*\}", content, flags=re.DOTALL)
    s = m.group(0) if m else content
    try:
        return json.loads(s)
    except Exception:
        return {}


In [28]:
def _as_bool(value: Any) -> bool:
    """
    Interpret a loosely-typed JSON truth value.

    Real booleans pass through; strings are True only for "true"/"yes"/"y"/"1"
    (case-insensitive, surrounding whitespace ignored), so "false" is False
    rather than truthy; numbers follow normal truthiness; anything else is
    treated as False.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "y", "1"}
    if isinstance(value, (int, float)):
        return bool(value)
    return False


def _judge_equivalent_flag(
    model: str,
    host: str,
    target: str,
    candidate: str,
    context: str = "",
    use_context: bool = USE_CONTEXT,
    temperature: float = 0.0,
) -> Dict[str, Any]:
    """
    First call: ask the LLM only for the binary semantic equivalence decision.

    Builds the user message with build_judge_prompt, sends it together with
    JUDGE_SYSTEM_PROMPT via _post_chat, and reads "equivalent" and "rationale"
    from the parsed JSON reply.

    Returns: {"equivalent": bool, "rationale": str, "raw": str}
      - "equivalent" is False when the reply has no usable verdict; a string
        verdict such as "false" is interpreted by its text, not its truthiness.
      - "rationale" is "" when absent; non-string rationales are stringified.
      - "raw" is the unparsed reply text.
    """
    user = build_judge_prompt(target=target, candidate=candidate, context=context, use_context=use_context)
    content = _post_chat(model, JUDGE_SYSTEM_PROMPT, user, host, temperature=temperature)
    data = _parse_json_from_content(content)
    if not isinstance(data, dict):
        # The reply parsed to a bare JSON value (number, list, ...): no verdict.
        data = {}

    eq = _as_bool(data.get("equivalent", False))

    raw_rationale = data.get("rationale")
    rationale = "" if raw_rationale is None else _clean(str(raw_rationale))

    return {"equivalent": eq, "rationale": rationale, "raw": content}


def _judge_similarity_score(
    model: str,
    host: str,
    target: str,
    candidate: str,
    context: str = "",
    use_context: bool = USE_CONTEXT,
    temperature: float = 0.0,
) -> Dict[str, Any]:
    """
    Second call: ask the LLM only for the semantic similarity score.

    Builds the user message with build_score_prompt, sends it together with
    SCORE_SYSTEM_PROMPT via _post_chat, and reads "score" and "rationale" from
    the parsed JSON reply.

    Returns: {"score": float, "rationale": str, "raw": str}
      - "score" is always a finite value in [0.0, 1.0]: a missing, unparsable
        or NaN score becomes 0.0 and out-of-range values are clamped to the
        range the prompt asks for.
      - "rationale" is "" when absent; non-string rationales are stringified.
      - "raw" is the unparsed reply text.
    """
    user = build_score_prompt(target=target, candidate=candidate, context=context, use_context=use_context)
    content = _post_chat(model, SCORE_SYSTEM_PROMPT, user, host, temperature=temperature)
    data = _parse_json_from_content(content)
    if not isinstance(data, dict):
        # The reply parsed to a bare JSON value (number, list, ...): no fields.
        data = {}

    try:
        score = float(data.get("score", 0.0))
    except (TypeError, ValueError, OverflowError):
        score = 0.0
    if score != score:
        # NaN never compares equal to itself; it would poison sorting and the
        # threshold comparison downstream.
        score = 0.0
    score = min(1.0, max(0.0, score))

    raw_rationale = data.get("rationale")
    rationale = "" if raw_rationale is None else _clean(str(raw_rationale))

    return {"score": score, "rationale": rationale, "raw": content}


def judge_equivalence_once(
    model: str,
    host: str,
    target: str,
    candidate: str,
    context: str = "",
    use_context: bool = USE_CONTEXT,
    temperature: float = 0.0,
) -> Dict[str, Any]:
    """
    Judge one (target, candidate) pair with two separate LLM calls: first the
    boolean equivalence decision, then the 0..1 similarity score.

    Returns dict:
      {
        "equivalent": bool,   # the flag call's verdict, forced to False when
                              # the score is below ACCEPT_THRESHOLD
        "score": float,       # from the score call
        "rationale": str,     # non-empty rationales of both calls, " | "-joined
        "raw": str            # JSON string holding both raw replies
      }
    """
    shared_args = dict(
        model=model,
        host=host,
        target=target,
        candidate=candidate,
        context=context,
        use_context=use_context,
        temperature=temperature,
    )

    # The flag call is issued before the score call.
    flag_result = _judge_equivalent_flag(**shared_args)
    score_result = _judge_similarity_score(**shared_args)

    similarity = float(score_result.get("score", 0.0))
    # Strictness gate: a positive verdict survives only if the separately
    # obtained score is not below the threshold.
    accepted = bool(flag_result.get("equivalent", False)) and not (similarity < ACCEPT_THRESHOLD)

    labelled_rationales = (
        ("Equivalence rationale: ", flag_result.get("rationale")),
        ("Score rationale: ", score_result.get("rationale")),
    )
    rationale_text = " | ".join(label + text for label, text in labelled_rationales if text)

    # Serialize both raw replies into one string so "raw" stays a str.
    raw_payload = {
        "equivalence_call_raw": flag_result.get("raw", ""),
        "score_call_raw": score_result.get("raw", ""),
    }

    return {
        "equivalent": accepted,
        "score": similarity,
        "rationale": rationale_text,
        "raw": json.dumps(raw_payload, ensure_ascii=False),
    }


In [29]:

def best_equivalence_against_targets(
    targets: List[str],
    candidate: str,
    model: str = OLLAMA_MODEL,
    host: str = OLLAMA_HOST,
    context: str = "",              # ignored when USE_CONTEXT=False
    use_context: bool = USE_CONTEXT,
) -> Dict[str, Any]:
    """
    Judge `candidate` against every entry of `targets` with
    judge_equivalence_once and report the top-ranked result.

    Ranking is by score (descending), with equivalent=True preferred among
    equal scores; earlier targets win remaining ties. Because score comes
    first, a target judged equivalent can lose to a higher-scored target that
    was judged not equivalent.

    Returns a dict with keys "best_equivalent", "best_score", "best_rationale",
    "best_target" and "judge_raw". With no targets, a non-equivalent result
    with score 0.0 and empty strings is returned.
    """
    def _rank(verdict: Dict[str, Any]):
        return (verdict.get("score", 0.0), bool(verdict.get("equivalent", False)))

    verdicts = []
    for reference in targets:
        verdict = judge_equivalence_once(
            model, host, reference, candidate, context=context, use_context=use_context
        )
        verdict["target"] = reference
        verdicts.append(verdict)

    if verdicts:
        # sorted() is stable, also with reverse=True, so ties keep input order.
        top = sorted(verdicts, key=_rank, reverse=True)[0]
    else:
        top = {"equivalent": False, "score": 0.0, "rationale": "", "target": "", "raw": ""}

    return {
        "best_equivalent": bool(top.get("equivalent", False)),
        "best_score": float(top.get("score", 0.0)),
        "best_rationale": _clean(top.get("rationale", "")),
        "best_target": _clean(top.get("target", "")),
        "judge_raw": top.get("raw", ""),
    }


In [30]:
def evaluate_file(
    path: str,
    out_csv: Optional[str] = None,
    model: str = OLLAMA_MODEL,
    host: str = OLLAMA_HOST,
    use_context: bool = USE_CONTEXT,   # follows the notebook-wide flag; False ignores context
) -> pd.DataFrame:
    """
    Judge every candidate answer in a dataset file against its targets.

    Parameters:
      path:        dataset file understood by load_data.
      out_csv:     optional CSV path; parent directories are created and the
                   result is written there (UTF-8, no index column).
      model, host: forwarded to the judge calls.
      use_context: whether each record's "input" text is put into the prompts.

    Returns a DataFrame with one row per judged candidate and the columns
    id, candidate, best_equivalent, best_score, best_rationale, best_target,
    context. A record with no targets or no candidates contributes a single
    placeholder row (empty candidate, best_equivalent=False, score 0.0). The
    columns are present even when there are no rows.

    Exceptions from load_data or from the judge calls propagate to the caller.
    """
    columns = [
        "id",
        "candidate",
        "best_equivalent",
        "best_score",
        "best_rationale",
        "best_target",
        "context",
    ]

    objs = load_data(path)

    # Extract targets/candidates once per record and size the progress bar by
    # the number of updates the loop below will actually make: one per judged
    # candidate, or one for a record that is skipped.
    prepared = []
    total_steps = 0
    for i, obj in enumerate(objs):
        targets = extract_targets(obj)
        candidates = extract_candidates(obj)
        total_steps += len(candidates) if (targets and candidates) else 1
        prepared.append((i, obj, targets, candidates))
    if total_steps == 0:
        total_steps = 1

    out_rows: List[Dict[str, Any]] = []
    with tqdm(total=total_steps, desc="Judging answers", unit="cand") as pbar:
        for i, obj, targets, candidates in prepared:
            uid = obj.get("id", "ex-{0}".format(i))
            context = to_context(obj)             # ignored unless use_context=True

            if not targets or not candidates:
                out_rows.append({
                    "id": uid,
                    "candidate": "",
                    "best_equivalent": False,
                    "best_score": 0.0,
                    "best_rationale": "Missing targets or candidates",
                    "best_target": "",
                    "context": context,
                })
                pbar.update(1)
                continue

            for cand in candidates:
                # HEARTBEAT_EVERY <= 0 disables the heartbeat instead of
                # raising ZeroDivisionError on the modulo.
                if HEARTBEAT_EVERY > 0 and pbar.n % HEARTBEAT_EVERY == 0:
                    print("[heartbeat] processed {0} candidates…".format(pbar.n), flush=True)

                best = best_equivalence_against_targets(
                    targets=targets,
                    candidate=cand,
                    model=model,
                    host=host,
                    context=context,
                    use_context=use_context,
                )
                out_rows.append({
                    "id": uid,
                    "candidate": cand,
                    "best_equivalent": best["best_equivalent"],
                    "best_score": best["best_score"],
                    "best_rationale": best["best_rationale"],
                    "best_target": best["best_target"],
                    "context": context,
                })
                pbar.update(1)

    # Passing the columns explicitly keeps the schema stable for empty results.
    df = pd.DataFrame(out_rows, columns=columns)
    if out_csv:
        Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(out_csv, index=False, encoding="utf-8")
    return df


In [31]:
# Run the judge over each dataset and write one CSV per dataset next to the notebook.
# The input paths are relative, so the *_forward.json files must be present in the
# kernel's working directory; load_data raises FileNotFoundError for a missing file,
# which stops the cell before the remaining datasets are evaluated.
squad_v2 = evaluate_file("squad_v2_forward.json", out_csv="squad_v2.csv")
trivia_qa = evaluate_file("trivia_qa_forward.json", out_csv="trivia_qa.csv")
truthful_qa = evaluate_file("truthful_qa_forward.json", out_csv="truthful_qa.csv")

FileNotFoundError: squad_v2_forward.json