In [None]:
import json
import torch
from tqdm import tqdm
from collections import Counter

from transformers import AutoTokenizer, AutoModelForCausalLM

# ---------------- CONFIG ----------------

# Hugging Face model id of the judge LLM, and the cap on tokens generated
# for each verdict (passed to model.generate as max_new_tokens).
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
MAX_NEW_TOKENS = 120

# Input JSONL files with the explanations to be judged. Records from the two
# files are paired by their "id" field; the results label them "text" and "audio".
TEXT_PATH  = "cache/joke_explanations_qwen.jsonl"
AUDIO_PATH = "cache/joke_explanations_qwen_audio.jsonl"

# Output JSONL file: one scored record per joke id present in both inputs.
OUT_PATH = "cache/joke_judge_independent_scoring.jsonl"

# ---------------- SCORING PROMPT ----------------

# Prompt template filled in with str.format(joke=..., exp=...).
# The doubled braces ({{ ... }}) around the JSON example are format escapes:
# they render as single literal braces in the prompt sent to the judge.
JUDGE_PROMPT = """You are a strict evaluator of joke explanations.

Task:
Evaluate how well the explanation explains WHY the joke is humorous.

Evaluation criteria:
- Correct identification of the humor mechanism (e.g., wordplay, ambiguity, violated expectation)
- Clarity and correctness
- No hallucinated or irrelevant mechanisms

Score the explanation on a scale from 1 to 5:
1 = very poor
2 = poor
3 = acceptable
4 = good
5 = excellent

Return ONLY valid JSON in exactly this format:
{{"Score": 1 | 2 | 3 | 4 | 5,
 "Reason": "<short justification>"}}

Joke:
{joke}

Explanation:
{exp}
"""

# --------------- MODEL ----------------

# Name of the preferred torch device: the GPU when torch can see one, else the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Tokenizer matching the judge model (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Judge model loaded with float16 weights; device_map="auto" delegates weight
# placement to the loader. eval() switches the module to inference mode and
# returns the same module, so calling it as a separate statement is equivalent.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
)
model.eval()

# --------------- HELPERS ----------------

def load_map(path):
    """Load a JSONL file into a dict keyed by each record's ``"id"``.

    Args:
        path: Path to a UTF-8 text file with one JSON object per line.

    Returns:
        A dict mapping ``obj["id"]`` to the decoded object. If an id occurs
        more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"id"`` field.
    """
    records = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads would raise on them, so skip instead of crashing.
            if not line.strip():
                continue
            obj = json.loads(line)
            records[obj["id"]] = obj
    return records

def safe_json_extract(text):
    """Return the first JSON object embedded in *text*, or ``None``.

    Each ``{`` in the text is tried in turn as the start of a JSON object and
    handed to the real JSON parser, so braces that appear inside string values
    (e.g. a "Reason" that mentions ``}``) do not truncate the object, and a
    non-JSON ``{...}`` fragment earlier in the text does not hide a valid
    object that follows it. Any text after the object is ignored.

    Args:
        text: Raw model output that may contain a JSON object surrounded by
            other text.

    Returns:
        The decoded object (a ``dict``), or ``None`` if no position in the
        text starts a valid JSON object.
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # tolerates trailing text after it.
            obj, _end = decoder.raw_decode(text, start)
        except ValueError:
            # Not a valid object here; move on to the next opening brace.
            start = text.find("{", start + 1)
        else:
            return obj
    return None

def generate_score(prompt: str):
    """Ask the judge model to score one explanation and parse its JSON reply.

    Args:
        prompt: The fully formatted judging prompt (sent as the user message).

    Returns:
        The JSON object decoded from the model's reply. When no JSON object
        can be extracted, returns ``{"Score": 0, "Reason": "Parse failure"}``.
    """
    messages = [
        {"role": "system", "content": "You are a judge that outputs ONLY valid JSON."},
        {"role": "user", "content": prompt},
    ]

    # return_dict=True is requested explicitly: the result is unpacked with
    # ** and indexed by "input_ids" below, which requires a mapping rather
    # than a bare tensor of token ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    ).to(model.device)  # follow the model's actual placement (device_map="auto")

    with torch.no_grad():
        # Greedy decoding. `temperature` is not passed: it has no effect when
        # do_sample=False and only triggers an "invalid generation flags" warning.
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens; decode only what
    # was generated after them.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(
        out[0][prompt_len:],
        skip_special_tokens=True,
    )

    parsed = safe_json_extract(decoded)
    if parsed is None:
        return {"Score": 0, "Reason": "Parse failure"}

    return parsed

# --------------- RUN -----------------

def _coerce_score(raw):
    """Convert a judge's "Score" value to an int in 1..5, or 0 if unusable."""
    try:
        score = int(raw)
    except (TypeError, ValueError, OverflowError):
        # Model output is untrusted: e.g. "4/5", None or a list must not
        # abort the whole run.
        return 0
    # The prompt defines a 1-5 scale; anything else is treated like a
    # parse failure (0).
    return score if 1 <= score <= 5 else 0


def run_judge():
    """Score text and audio explanations independently and compare them.

    Loads the two explanation files, keeps the ids present in both, asks the
    judge for a score for each explanation, and writes one JSON line per id
    to ``OUT_PATH`` with both scores, the raw judge replies, and which side
    scored higher ("text", "audio" or "tie"). Finally prints how often each
    outcome occurred.

    A reply that cannot be parsed, or whose score is not an integer in 1..5,
    counts as score 0; two such replies for the same id therefore produce a
    "tie".
    """
    print("=== Loading explanations ===")

    text_items = load_map(TEXT_PATH)
    audio_items = load_map(AUDIO_PATH)

    # Only ids with both a text and an audio explanation can be compared.
    ids = sorted(set(text_items) & set(audio_items))
    votes = Counter()

    print(f"Scoring {len(ids)} joke pairs")

    with open(OUT_PATH, "w", encoding="utf-8") as f:
        for i in tqdm(ids, desc="Judging"):
            t = text_items[i]
            a = audio_items[i]

            # Each explanation is judged in its own prompt, so the judge
            # never sees the two side by side.
            judge_text = generate_score(
                JUDGE_PROMPT.format(
                    joke=t["Joke"],
                    exp=t["Explanation"],
                )
            )

            judge_audio = generate_score(
                JUDGE_PROMPT.format(
                    joke=a["Joke"],
                    exp=a["Explanation"],
                )
            )

            s_text = _coerce_score(judge_text.get("Score", 0))
            s_audio = _coerce_score(judge_audio.get("Score", 0))

            if s_text > s_audio:
                final = "text"
            elif s_audio > s_text:
                final = "audio"
            else:
                final = "tie"

            votes[final] += 1

            out = {
                "id": i,
                "text_score": s_text,
                "audio_score": s_audio,
                "final_decision": final,
                "text_judge": judge_text,
                "audio_judge": judge_audio,
            }

            f.write(json.dumps(out, ensure_ascii=False) + "\n")

    # ------------- STATS -------------

    total = sum(votes.values())
    print("\n=== RESULTS ===")
    for k, v in votes.items():
        pct = (v / total * 100) if total else 0.0
        print(f"{k}: {v} ({pct:.1f}%)")

    print("\nWrote:", OUT_PATH)

# ------------- MAIN --------------

# Start the judging run only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_judge()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

=== Loading explanations ===
Scoring 377 joke pairs


Judging:   0%|          | 0/377 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Judging: 100%|██████████| 377/377 [52:01<00:00,  8.28s/it]


=== RESULTS ===
tie: 275 (72.9%)
audio: 39 (10.3%)
text: 63 (16.7%)

Wrote: cache/joke_judge_independent_scoring.jsonl



