In [None]:
import contextlib
import io
import json
import os

import numpy as np
import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm  # rebinds `tqdm` from the module to the callable progress-bar class

In [None]:
import warnings
# Install a process-wide filter that ignores every FutureWarning raised after
# this cell runs; other warning categories are still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

Extract reference rubrics

In [None]:
from datasets import Dataset

# ✅ Load filtered dataset from saved file
# The file is JSON Lines: one JSON object per line. It is decoded as UTF-8
# explicitly instead of relying on the platform default encoding, and blank
# lines (e.g. an extra trailing newline) are skipped so they do not raise a
# JSONDecodeError.
with open("outputs/filtered/rubrics_8_15.jsonl", encoding="utf-8") as f:
    filtered_data = [json.loads(line) for line in f if line.strip()]

# ✅ Convert to HuggingFace Dataset object
filtered_dataset = Dataset.from_list(filtered_data)

In [None]:
# ✅ Load the embedding model
# `embedder` is a module-level global: evaluate_rubrics() calls
# embedder.encode() on rubric criterion texts, so this cell has to run before
# any evaluation cell.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# ✅ Save reference rubrics
def extract_ref_rubrics(dataset, output_dir="outputs/ref_rubrics", max_items=None):
    """
    Write the reference rubrics of each example to its own JSON file.

    Example number ``i`` (its position in ``dataset``) is written to
    ``{output_dir}/ref_rubrics_{i}.json``. Examples whose "rubrics" field is
    missing or empty are skipped, so the file numbering can have gaps.

    Args:
    - dataset: iterable of dict-like examples (e.g. the filtered HuggingFace
      dataset) from which to extract reference rubrics
    - output_dir: directory the files are written to; created if missing
    - max_items: stop after this many files have been written (optional;
      None or 0 means no limit)
    """
    os.makedirs(output_dir, exist_ok=True)

    saved = 0
    # Wrap the dataset itself rather than the enumerate() object: enumerate
    # has no length, so tqdm could not show a total or an ETA.
    for i, example in enumerate(tqdm(dataset, desc="Saving reference rubrics")):
        if max_items and saved >= max_items:
            break

        rubrics = example.get("rubrics", None)
        if rubrics:
            out_path = os.path.join(output_dir, f"ref_rubrics_{i}.json")
            # ensure_ascii=False emits non-ASCII characters verbatim, so the
            # file encoding is pinned to UTF-8; the platform default encoding
            # may be unable to represent them.
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(rubrics, f, indent=2, ensure_ascii=False)
            saved += 1

    print(f"✅ Saved {saved} reference rubrics to {output_dir}")

In [None]:
# Write one ref_rubrics_{i}.json per example that has rubrics, using the
# default output directory (outputs/ref_rubrics) and no item limit.
extract_ref_rubrics(filtered_dataset)

Evaluation 

In [None]:
# ✅ All possible axes
# Axis names as they appear after the "axis:" prefix in reference rubric tags.
# evaluate_rubrics() and batch_evaluate_rubrics() keep one statistics bucket
# per entry and report the axes in this order.
ALL_AXES = [
    "completeness",
    "accuracy",
    "context_awareness",
    "communication_quality",
    "instruction_following"
]

In [None]:
def _first_axis_tag(rubric):
    """Return the axis named by the rubric's first "axis:<name>" tag, or None if it has none."""
    return next(
        (tag.split(":")[-1] for tag in rubric.get("tags", []) if tag.startswith("axis:")),
        None,
    )


def _point_agreement_weight(ref_point, gen_point):
    """
    Weight expressing how well a generated point value agrees with the reference.

    - 0.5 when both values are on the same side of zero (0 counts as non-negative)
    - plus 0.5 when they differ by at most 3, or plus 0.3 when they differ by at most 5

    NOTE(review): the two parts are additive, so a sign mismatch with a small
    difference (e.g. +1 vs -1) still yields 0.5 — confirm this is intended.
    """
    same_sign = (ref_point >= 0 and gen_point >= 0) or (ref_point < 0 and gen_point < 0)
    weight = 0.5 if same_sign else 0.0

    point_diff = abs(ref_point - gen_point)
    if point_diff <= 3:
        weight += 0.5  # Precise consistency
    elif point_diff <= 5:
        weight += 0.3  # Roughly close
    # A difference above 5 adds nothing
    return weight


def evaluate_rubrics(conversation_id, threshold=0.5, allow_axis_mismatch=True):
    """
    Compare the generated rubrics of one conversation against its reference rubrics.

    Each reference rubric is paired with the candidate generated rubric whose
    criterion text is most similar (cosine similarity of `embedder`
    embeddings). The pair's score is that similarity multiplied by a
    point-agreement weight. A report is printed, the pairs are written to
    outputs/eval/matched_pair/matched_pairs_{conversation_id}.json and the
    summary to outputs/eval/eval/eval_{conversation_id}.json.

    Reference axes that are not listed in ALL_AXES are still paired and appear
    in the pair list, but are left out of the per-axis statistics.

    Args:
    - conversation_id: index used to locate the conversation text, the
      generated rubrics and the reference rubrics under outputs/
    - threshold: similarity at or above which a pair is flagged
      "similar_match"; it only sets that flag and does not affect the score
      or the "matched" count
    - allow_axis_mismatch: if True every generated rubric is a candidate; if
      False only generated rubrics whose "axis" equals the reference's axis
      are, and a reference with no candidate is left out of the results

    Returns:
    dict with keys "conversation_id", "matched", "total_reference",
    "avg_score", "avg_point_diff" and "axis_stats".
    """
    conversation_path = f"outputs/prompts/conversation_{conversation_id}.txt"
    generated_path = f"outputs/rubrics/rubrics_{conversation_id}.json"
    reference_path = f"outputs/ref_rubrics/ref_rubrics_{conversation_id}.json"
    os.makedirs("outputs/eval", exist_ok=True)
    os.makedirs("outputs/eval/matched_pair", exist_ok=True)
    os.makedirs("outputs/eval/eval", exist_ok=True)

    with open(conversation_path) as f:
        conversation = f.read().strip()

    # JSON files are decoded as UTF-8 explicitly instead of with the platform default.
    with open(reference_path, encoding="utf-8") as f:
        references = json.load(f)

    with open(generated_path, encoding="utf-8") as f:
        generated = json.load(f)

    # Every axis tag of every reference counts here, whereas matching below
    # uses only the first axis tag of each reference.
    ref_axes = set()
    for r in references:
        for tag in r.get("tags", []):
            if tag.startswith("axis:"):
                ref_axes.add(tag.split(":")[-1])
    print(f"\n📊 Evaluation for Conversation #{conversation_id}")
    print(f"🔍 Conversation Content:\n{conversation[:300]}...")
    print(f"Similarity Threshold: {threshold}")
    print(f"Generated Rubrics: {len(generated)}")
    print(f"Reference Rubrics: {len(references)}")
    print(f"Axes in Reference: {sorted(ref_axes)}")

    match_results = []
    axis_stats = {axis: {"matched": 0, "score_sum": 0.0, "conversation_count": 0} for axis in ALL_AXES}

    # The embeddings of the generated criteria depend only on the candidate
    # pool, not on the reference being matched, so each pool is embedded once
    # and reused. Key () is the unfiltered pool; (axis,) is the pool for one axis.
    pool_cache = {}

    for ref in references:
        ref_criterion = ref["criterion"]
        ref_point = ref.get("points", 0)
        ref_axis = _first_axis_tag(ref)

        pool_key = () if allow_axis_mismatch else (ref_axis,)
        if pool_key not in pool_cache:
            pool = generated if allow_axis_mismatch else [g for g in generated if g.get("axis") == ref_axis]
            pool_embs = embedder.encode([g["criterion"] for g in pool]) if pool else None
            pool_cache[pool_key] = (pool, pool_embs)
        candidates, gen_embs = pool_cache[pool_key]
        if not candidates:
            continue

        ref_emb = embedder.encode([ref_criterion])
        sims = cosine_similarity(ref_emb, gen_embs)[0]
        best_idx = int(np.argmax(sims))
        best_sim = float(sims[best_idx])
        best_gen = candidates[best_idx]
        # NOTE(review): references are read with key "points", generated
        # rubrics with key "point" — confirm the generated schema really uses
        # the singular, otherwise every generated value silently becomes 0.
        gen_point = best_gen.get("point", 0)
        point_diff = abs(ref_point - gen_point)

        match_score = best_sim * _point_agreement_weight(ref_point, gen_point)

        is_similar = best_sim >= threshold
        is_axis_match = (best_gen.get("axis") == ref_axis)

        # Only axes listed in ALL_AXES have a statistics bucket; an unknown
        # (or missing) axis must not raise a KeyError.
        if ref_axis in axis_stats:
            stats = axis_stats[ref_axis]
            stats["score_sum"] += match_score
            stats["matched"] += 1
            stats["avg_score"] = stats["score_sum"] / stats["matched"]

        match_results.append({
            "ref_criterion": ref_criterion,
            "ref_axis": ref_axis,
            "ref_point": ref_point,
            "gen_criterion": best_gen["criterion"],
            "gen_axis": best_gen.get("axis"),
            "gen_point": gen_point,
            "similarity": round(best_sim, 4),
            "point_diff": point_diff,
            "score": round(match_score, 3),
            "similar_match": is_similar,
            "axis_match": is_axis_match
        })

    for axis in ref_axes:
        if axis in axis_stats:
            axis_stats[axis]["conversation_count"] += 1

    # "matched" counts every reference that had at least one candidate,
    # regardless of whether its best similarity reached the threshold.
    matched_count = len(match_results)
    total_ref = len(references)
    avg_score = np.mean([r["score"] for r in match_results]) if match_results else 0.0
    avg_point_diff = np.mean([r["point_diff"] for r in match_results]) if match_results else 0.0

    print(f"\n✅ Matched: {matched_count} / {total_ref}")
    print(f"Average Matching Score: {avg_score:.3f}")
    print(f"Average Point Difference: {avg_point_diff:.2f}")

    print("\n📊 Axis-level Breakdown:")
    for axis in ALL_AXES:
        stat = axis_stats[axis]
        if stat["conversation_count"] == 0:
            print(f"- {axis:25s} | ❌ Missing in this conversation")
        else:
            avg = stat["score_sum"] / stat["matched"] if stat["matched"] > 0 else 0.0
            print(f"- {axis:25s} | Matched: {stat['matched']:2d} | Avg Score: {avg:.3f}")

    # Save
    with open(f"outputs/eval/matched_pair/matched_pairs_{conversation_id}.json", "w") as f:
        json.dump(match_results, f, indent=2)

    summary = {
        "conversation_id": conversation_id,
        "matched": matched_count,
        "total_reference": total_ref,
        "avg_score": avg_score,
        "avg_point_diff": avg_point_diff,
        "axis_stats": axis_stats
    }
    with open(f"outputs/eval/eval/eval_{conversation_id}.json", "w") as f:
        json.dump(summary, f, indent=2)

    return summary

In [None]:
# example
# Evaluate a single conversation (id 0) with the default threshold and axis
# handling. The conversation text, generated rubrics and reference rubrics for
# that id must already exist under outputs/. As the last expression of the
# cell, the returned summary dict is also displayed.
evaluate_rubrics(conversation_id=0)

In [None]:
def batch_evaluate_rubrics(total_count, verbose=False):
    """
    Run evaluate_rubrics() for conversation ids 0 .. total_count - 1 and aggregate the results.

    A conversation is skipped (and counted as such) when its generated or
    reference rubrics file is missing, when the generated rubrics are empty or
    cannot be parsed, or when evaluate_rubrics() raises. The aggregate is
    printed and written to outputs/eval/summary/batch_eval_summary.json.

    Args:
    - total_count: number of conversation ids to try, starting at 0
    - verbose: if True, the per-conversation report printed by
      evaluate_rubrics() is shown; if False (default) it is suppressed and
      only failures and the final summary are printed

    Returns:
    list of the per-conversation summary dicts, or None when no conversation
    could be evaluated.
    """
    os.makedirs("outputs/eval/summary", exist_ok=True)

    all_scores = []
    skipped = 0

    axis_global = {
        axis: {"score_sum": 0.0, "matched": 0, "count": 0} for axis in ALL_AXES
    }

    for i in tqdm(range(total_count), desc="Evaluating rubrics"):
        gen_path = f"outputs/rubrics/rubrics_{i}.json"
        ref_path = f"outputs/ref_rubrics/ref_rubrics_{i}.json"

        # 💡 1. Skip if generated rubrics or reference rubrics don't exist
        if not (os.path.exists(gen_path) and os.path.exists(ref_path)):
            skipped += 1
            continue

        try:
            # 💡 2. Skip if generated rubrics is empty. Parsing happens inside
            # the try block so that one malformed file is reported and skipped
            # instead of aborting the whole batch.
            with open(gen_path, encoding="utf-8") as f:
                gen_rubrics = json.load(f)
            if not gen_rubrics:
                skipped += 1
                continue

            if verbose:
                summary = evaluate_rubrics(i)
            else:
                # Swallow the per-conversation report; tqdm and the messages
                # printed by this function are outside the redirect.
                with contextlib.redirect_stdout(io.StringIO()):
                    summary = evaluate_rubrics(i)
            all_scores.append(summary)

            axis_stats = summary.get("axis_stats", {})
            for axis in ALL_AXES:
                axis_info = axis_stats.get(axis, {})
                axis_global[axis]["score_sum"] += axis_info.get("score_sum", 0.0)
                axis_global[axis]["matched"] += axis_info.get("matched", 0)
                axis_global[axis]["count"] += axis_info.get("conversation_count", 0)

        except Exception as e:
            # Deliberately broad: one bad conversation must not stop the batch.
            print(f"❌ Evaluation failed for conversation {i}: {e}")
            skipped += 1

    # ✅ Compute overall metrics
    total_evals = len(all_scores)
    if total_evals == 0:
        print("⚠️ No successful evaluations.")
        return

    # Per-axis mean score over all matched reference rubrics; None for an axis
    # that never had a match.
    axis_global_avg = {
        axis: round(axis_global[axis]["score_sum"] / axis_global[axis]["matched"], 4)
        if axis_global[axis]["matched"] > 0 else None
        for axis in ALL_AXES
    }

    # These are means of the per-conversation means, so every conversation
    # weighs the same regardless of how many rubrics it has.
    avg_score = np.mean([s["avg_score"] for s in all_scores])
    avg_point_diff = np.mean([s["avg_point_diff"] for s in all_scores])
    total_matched = sum(s["matched"] for s in all_scores)
    total_ref = sum(s["total_reference"] for s in all_scores)
    # Guard against every evaluated conversation having an empty reference list.
    match_ratio = total_matched / total_ref if total_ref else 0.0

    print("\n🎯 Evaluation completed.")
    print(f"✅ Successfully evaluated: {total_evals}")
    print(f"⛔ Skipped: {skipped}")
    print(f"📊 Overall avg match score: {avg_score:.3f}")
    print(f"📉 Overall avg point difference: {avg_point_diff:.2f}")
    print(f"🔢 Total matched / total reference: {total_matched} / {total_ref} ({match_ratio:.2%})")

    # save summary
    with open("outputs/eval/summary/batch_eval_summary.json", "w") as f:
        json.dump({
            "evaluated": total_evals,
            "skipped": skipped,
            "overall_avg_score": avg_score,
            "overall_avg_point_diff": avg_point_diff,
            "global_axis_scores": axis_global_avg,
            "per_axis_stats": axis_global,
            "total_matched": total_matched,
            "total_reference": total_ref,
        }, f, indent=2)

    return all_scores


In [None]:
# Evaluate conversation ids 0..2734. The result is bound to a name rather than
# left as the cell's last expression, so the notebook does not render the
# whole list of per-conversation summaries as cell output.
batch_summaries = batch_evaluate_rubrics(total_count=2735)
