In [None]:
!pip install -q anthropic

import json
import os
import re
import time

import anthropic
from tqdm.auto import tqdm

# Your API key: read from the ANTHROPIC_API_KEY environment variable so the
# secret is never typed into (and saved with) the notebook.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
if not ANTHROPIC_API_KEY:
    # Fail fast with a clear message instead of sending requests without credentials.
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it before running this notebook."
    )

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# Models to profile: short display name -> model identifier sent to the API
MODELS = {
    "claude-haiku-4-5": "claude-haiku-4-5-20251001",
    "claude-opus-4-5": "claude-opus-4-5-20251101"
}

print("✅ Setup complete")

✅ Setup complete


In [3]:
# Load the evaluation data from clustering step.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open("evaluation_data.json", "r", encoding="utf-8") as f:
    evaluation_data = json.load(f)

print("Loaded evaluation data:")
print(f"  Clusters: {len(evaluation_data['clusters'])}")
print(f"  Total samples: {evaluation_data['metadata']['total_samples']}")

# One line per cluster with the sample count recorded in the file
for cid, cdata in evaluation_data['clusters'].items():
    print(f"  Cluster {cid}: {cdata['n_samples']} questions")

Loaded evaluation data:
  Clusters: 7
  Total samples: 1351
  Cluster 0: 27 questions
  Cluster 1: 34 questions
  Cluster 2: 20 questions
  Cluster 3: 29 questions
  Cluster 4: 25 questions
  Cluster 5: 98 questions
  Cluster 6: 34 questions


In [4]:
# A standalone answer letter: A-D not touching another word character or an
# apostrophe, so letters inside words ("ANSWER", "CANADA") or contractions
# ("I'D") are not mistaken for an answer.
_ANSWER_LETTER_RE = re.compile(r"(?<![\w'’])([A-D])(?![\w'’])")


def _parse_answer_letter(answer_text):
    """Return the 0-based index of the first standalone A-D letter, or -1."""
    found = _ANSWER_LETTER_RE.search(answer_text)
    if found is None:
        return -1
    return ord(found.group(1)) - ord("A")


def ask_claude(model_id, question, choices):
    """Ask Claude a multiple choice question and return the answer index.

    Args:
        model_id: Model identifier passed to ``client.messages.create``.
        question: Question text inserted into the prompt.
        choices: Answer options; they are labelled A, B, C, ... in order.

    Returns:
        0-3 for a reply containing a standalone letter A-D (the first such
        letter wins), or -1 when no such letter is found or when the request
        raises an exception (the exception is printed, not re-raised).
    """

    choices_text = "\n".join([f"{chr(65+i)}. {c}" for i, c in enumerate(choices)])

    prompt = f"""Answer this multiple choice question. Reply with ONLY the letter (A, B, C, or D).

Question: {question}

{choices_text}

Answer:"""

    try:
        response = client.messages.create(
            model=model_id,
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )

        answer_text = response.content[0].text.strip().upper()

        # A plain substring test would read "ANSWER: B" or "D. CANADA" as "A",
        # so only a letter standing on its own is accepted.
        return _parse_answer_letter(answer_text)

    except Exception as e:
        # Best effort: report the failure and let the caller treat it as invalid.
        print(f"Error: {e}")
        return -1


def evaluate_model_on_cluster(model_name, model_id, cluster_data, delay=0.5):
    """Evaluate a model on all questions in a cluster.

    Args:
        model_name: Label shown on the progress bar.
        model_id: Model identifier forwarded to ``ask_claude``.
        cluster_data: Mapping with a ``"questions"`` list; every question is a
            mapping with ``"id"``, ``"question"``, ``"choices"`` and ``"answer"``.
        delay: Seconds to sleep after each question (rate limiting).

    Returns:
        A dict with:
            ``accuracy``: ``correct / total`` (0 when ``total`` is 0).
            ``correct``: number of valid predictions equal to the answer.
            ``total``: number of valid predictions (``ask_claude`` did not
                return -1).
            ``invalid``: number of questions whose prediction was -1; these
                are excluded from ``total`` and therefore from ``accuracy``.
            ``results``: one record per question, valid or not.
    """

    correct = 0
    total = 0
    invalid = 0
    results = []

    for q in tqdm(cluster_data["questions"], desc=f"{model_name}", leave=False):
        prediction = ask_claude(model_id, q["question"], q["choices"])
        is_valid = prediction != -1
        # An invalid prediction is never correct, whatever the stored answer is.
        is_correct = is_valid and prediction == q["answer"]

        if is_valid:  # Only count valid responses
            if is_correct:
                correct += 1
            total += 1
        else:
            invalid += 1

        results.append({
            "question_id": q["id"],
            "predicted": prediction,
            "actual": q["answer"],
            "correct": is_correct
        })

        time.sleep(delay)  # Rate limiting

    accuracy = correct / total if total > 0 else 0

    if invalid:
        # Dropped questions shrink the denominator, so make them visible.
        print(f"  Warning: {invalid} of {len(results)} responses were invalid and excluded from accuracy")

    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "invalid": invalid,
        "results": results
    }

In [5]:
# Accuracy profiles, keyed by the short model name
profiles = {}

for model_name, model_id in MODELS.items():
    print(f"\n{'='*50}")
    print(f"Profiling: {model_name} ({model_id})")
    print(f"{'='*50}")

    # Registered up front and filled in place as each cluster finishes
    cluster_scores = {}
    profiles[model_name] = {"model_id": model_id, "clusters": cluster_scores}

    for cluster_id, cluster_data in evaluation_data["clusters"].items():
        print(f"\nCluster {cluster_id} ({cluster_data['n_samples']} questions):")

        # delay=0.3: adjust based on rate limits
        result = evaluate_model_on_cluster(model_name, model_id, cluster_data, delay=0.3)

        # Keep only the aggregate numbers; the per-question records are not stored
        cluster_scores[cluster_id] = {
            field: result[field] for field in ("accuracy", "correct", "total")
        }

        print(f"  Accuracy: {result['accuracy']:.2%} ({result['correct']}/{result['total']})")

print("\n✅ Profiling complete!")


Profiling: claude-haiku-4-5 (claude-haiku-4-5-20251001)

Cluster 0 (27 questions):


claude-haiku-4-5:   0%|          | 0/27 [00:00<?, ?it/s]

  Accuracy: 85.19% (23/27)

Cluster 1 (34 questions):


claude-haiku-4-5:   0%|          | 0/34 [00:00<?, ?it/s]

  Accuracy: 79.41% (27/34)

Cluster 2 (20 questions):


claude-haiku-4-5:   0%|          | 0/20 [00:00<?, ?it/s]

  Accuracy: 100.00% (20/20)

Cluster 3 (29 questions):


claude-haiku-4-5:   0%|          | 0/29 [00:00<?, ?it/s]

  Accuracy: 96.55% (28/29)

Cluster 4 (25 questions):


claude-haiku-4-5:   0%|          | 0/25 [00:00<?, ?it/s]

  Accuracy: 96.00% (24/25)

Cluster 5 (98 questions):


claude-haiku-4-5:   0%|          | 0/98 [00:00<?, ?it/s]

  Accuracy: 72.45% (71/98)

Cluster 6 (34 questions):


claude-haiku-4-5:   0%|          | 0/34 [00:00<?, ?it/s]

  Accuracy: 91.18% (31/34)

Profiling: claude-opus-4-5 (claude-opus-4-5-20251101)

Cluster 0 (27 questions):


claude-opus-4-5:   0%|          | 0/27 [00:00<?, ?it/s]

  Accuracy: 92.59% (25/27)

Cluster 1 (34 questions):


claude-opus-4-5:   0%|          | 0/34 [00:00<?, ?it/s]

  Accuracy: 85.29% (29/34)

Cluster 2 (20 questions):


claude-opus-4-5:   0%|          | 0/20 [00:00<?, ?it/s]

  Accuracy: 100.00% (20/20)

Cluster 3 (29 questions):


claude-opus-4-5:   0%|          | 0/29 [00:00<?, ?it/s]

  Accuracy: 96.55% (28/29)

Cluster 4 (25 questions):


claude-opus-4-5:   0%|          | 0/25 [00:00<?, ?it/s]

  Accuracy: 100.00% (25/25)

Cluster 5 (98 questions):


claude-opus-4-5:   0%|          | 0/98 [00:00<?, ?it/s]

  Accuracy: 67.35% (66/98)

Cluster 6 (34 questions):


claude-opus-4-5:   0%|          | 0/34 [00:00<?, ?it/s]

  Accuracy: 97.06% (33/34)

✅ Profiling complete!


In [6]:
# Write the profiles to disk as indented JSON
with open("llm_profiles.json", "w") as out_file:
    json.dump(profiles, out_file, indent=2)

print("✅ Saved: llm_profiles.json")

# Recap: per-cluster accuracy followed by one pooled figure per model
print("\n" + "="*60)
print("MODEL PROFILES SUMMARY")
print("="*60)

for model_name, profile in profiles.items():
    print(f"\n{model_name.upper()}:")
    cluster_stats = profile["clusters"]

    for cid, cdata in cluster_stats.items():
        print(f"  Cluster {cid}: {cdata['accuracy']:.2%} ({cdata['correct']}/{cdata['total']})")

    # Pooled over all clusters, so larger clusters weigh more
    total_correct = sum(stats["correct"] for stats in cluster_stats.values())
    total_questions = sum(stats["total"] for stats in cluster_stats.values())

    if total_questions > 0:
        overall = total_correct / total_questions
    else:
        overall = 0
    print(f"  OVERALL: {overall:.2%}")

✅ Saved: llm_profiles.json

MODEL PROFILES SUMMARY

CLAUDE-HAIKU-4-5:
  Cluster 0: 85.19% (23/27)
  Cluster 1: 79.41% (27/34)
  Cluster 2: 100.00% (20/20)
  Cluster 3: 96.55% (28/29)
  Cluster 4: 96.00% (24/25)
  Cluster 5: 72.45% (71/98)
  Cluster 6: 91.18% (31/34)
  OVERALL: 83.90%

CLAUDE-OPUS-4-5:
  Cluster 0: 92.59% (25/27)
  Cluster 1: 85.29% (29/34)
  Cluster 2: 100.00% (20/20)
  Cluster 3: 96.55% (28/29)
  Cluster 4: 100.00% (25/25)
  Cluster 5: 67.35% (66/98)
  Cluster 6: 97.06% (33/34)
  OVERALL: 84.64%
