In [11]:
!pip install -q anthropic

import json
import random
import anthropic

ANTHROPIC_API_KEY = ""

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

print("✅ Setup complete")

✅ Setup complete


In [23]:
# 2 Very Different Topics - Binary Choices (A or B)
TOPICS = {
    "animals": {
        "description": "Simple true/false or yes/no facts about common animals",
        "examples": [
            "Does a dog have 4 legs? (Yes/No)",
            "Is a whale a fish? (Yes/No)",
            "Can birds fly? (Yes/No)"
        ]
    },
    "colors": {
        "description": "Simple true/false or yes/no questions about colors",
        "examples": [
            "Is the sky blue? (Yes/No)",
            "Is grass red? (Yes/No)",
            "Is a banana yellow? (Yes/No)"
        ]
    }
}

QUESTIONS_PER_TOPIC = 50
print(f"Will generate {QUESTIONS_PER_TOPIC} questions per topic")
print(f"Topics: {list(TOPICS.keys())}")
print(f"Choices per question: 2 (A or B)")

Will generate 50 questions per topic
Topics: ['animals', 'colors']
Choices per question: 2 (A or B)


In [24]:
!pip install -q langchain langchain-anthropic pydantic

from typing import List
from pydantic import BaseModel, Field
from langchain_anthropic import ChatAnthropic

# Define structured output schema - only 2 choices
class Question(BaseModel):
    """A binary multiple choice question."""
    question: str = Field(description="The question text")
    choices: List[str] = Field(description="Exactly 2 answer choices", min_length=2, max_length=2)
    answer: int = Field(description="Index of correct answer (0 or 1)", ge=0, le=1)

class QuestionSet(BaseModel):
    """A set of generated questions."""
    questions: List[Question] = Field(description="List of questions")

# Initialize LLM with structured output
llm = ChatAnthropic(
    model="claude-sonnet-4-20250514",
    api_key=ANTHROPIC_API_KEY,
    temperature=0.7
)

structured_llm = llm.with_structured_output(QuestionSet)

def generate_questions(topic_name: str, topic_info: dict, n_questions: int = 10) -> List[dict]:
    """Generate binary questions with structured output."""

    prompt = f"""Generate {n_questions} VERY EASY binary (2-choice) questions about: {topic_info['description']}

IMPORTANT RULES:
1. Questions must be EXTREMELY SIMPLE - answerable by a small AI model
2. Each question has EXACTLY 2 choices (A and B only)
3. Use Yes/No, True/False, or two simple options
4. Only ONE correct answer
5. No tricks, no ambiguity
6. Use simple vocabulary

Example format:
- Question: "Is the sky blue?"
  Choices: ["Yes", "No"]
  Answer: 0 (Yes)

- Question: "Do cats bark?"
  Choices: ["Yes", "No"]
  Answer: 1 (No)

- Question: "What color is grass?"
  Choices: ["Green", "Purple"]
  Answer: 0 (Green)

Generate {n_questions} simple 2-choice questions now:"""

    result = structured_llm.invoke(prompt)

    # Convert to dict and add topic
    questions = []
    for q in result.questions:
        # Validate only 2 choices
        if len(q.choices) == 2 and q.answer in [0, 1]:
            questions.append({
                "question": q.question,
                "choices": q.choices,
                "answer": q.answer,
                "topic": topic_name
            })

    return questions

print("✅ Generator ready (2 choices per question)")

✅ Generator ready (2 choices per question)


In [25]:
all_questions = []

for topic_name, topic_info in TOPICS.items():
    print(f"\n{'='*50}")
    print(f"Generating: {topic_name}")
    print(f"{'='*50}")

    batch_size = 15
    generated = 0

    while generated < QUESTIONS_PER_TOPIC:
        n = min(batch_size, QUESTIONS_PER_TOPIC - generated)
        print(f"  Batch: generating {n} questions...")

        questions = generate_questions(topic_name, topic_info, n)

        if questions:
            all_questions.extend(questions)
            generated += len(questions)
            print(f"  ✅ Got {len(questions)} questions (total: {generated})")
        else:
            print(f"  ⚠️ Failed, retrying...")

print(f"\n✅ Total questions generated: {len(all_questions)}")
print(f"  Animals: {sum(1 for q in all_questions if q['topic'] == 'animals')}")
print(f"  Colors: {sum(1 for q in all_questions if q['topic'] == 'colors')}")


Generating: animals
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 15)
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 30)
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 45)
  Batch: generating 5 questions...
  ✅ Got 5 questions (total: 50)

Generating: colors
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 15)
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 30)
  Batch: generating 15 questions...
  ✅ Got 15 questions (total: 45)
  Batch: generating 5 questions...
  ✅ Got 5 questions (total: 50)

✅ Total questions generated: 100
  Animals: 50
  Colors: 50


In [26]:
import random
import pandas as pd

random.shuffle(all_questions)

# Preview
print("="*50)
print("SAMPLE QUESTIONS (2 choices)")
print("="*50)

for topic in TOPICS.keys():
    topic_qs = [q for q in all_questions if q["topic"] == topic]
    print(f"\n{topic.upper()}:")
    for q in topic_qs[:3]:
        correct = q["choices"][q["answer"]]
        print(f"  Q: {q['question']}")
        print(f"  A: {q['choices']} -> {correct}")

# Save
dataset = {
    "metadata": {
        "total": len(all_questions),
        "topics": list(TOPICS.keys()),
        "choices_per_question": 2
    },
    "questions": all_questions
}

with open("easy_dataset.json", "w") as f:
    json.dump(dataset, f, indent=2)

print("\n✅ Saved: easy_dataset.json")

# CSV
df = pd.DataFrame([{
    "question": q["question"],
    "A": q["choices"][0],
    "B": q["choices"][1],
    "answer": "A" if q["answer"] == 0 else "B",
    "topic": q["topic"]
} for q in all_questions])

df.to_csv("easy_dataset.csv", index=False)
print("✅ Saved: easy_dataset.csv")

df.head(10)

SAMPLE QUESTIONS (2 choices)

ANIMALS:
  Q: Do birds have wings?
  A: ['Yes', 'No'] -> Yes
  Q: Do fish live in water?
  A: ['Yes', 'No'] -> Yes
  Q: Do birds have wings?
  A: ['Yes', 'No'] -> Yes

COLORS:
  Q: Are leaves usually purple?
  A: ['True', 'False'] -> False
  Q: Is water blue?
  A: ['True', 'False'] -> True
  Q: Is snow black?
  A: ['True', 'False'] -> False

✅ Saved: easy_dataset.json
✅ Saved: easy_dataset.csv


Unnamed: 0,question,A,B,answer,topic
0,Are leaves usually purple?,True,False,B,colors
1,Do birds have wings?,Yes,No,A,animals
2,Is water blue?,True,False,A,colors
3,Do fish live in water?,Yes,No,A,animals
4,Do birds have wings?,Yes,No,A,animals
5,Do cows give milk?,Yes,No,A,animals
6,Do horses have four legs?,Yes,No,A,animals
7,Is snow black?,True,False,B,colors
8,Do cows give milk?,Yes,No,A,animals
9,Can ducks swim?,Yes,No,A,animals


In [30]:
import json
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

# Load model
print("Loading Qwen3-0.6B...")
model_id = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
model.eval()
print("✅ Model loaded")

# Load questions
with open("easy_dataset.json", "r") as f:
    dataset = json.load(f)

questions = dataset["questions"]
print(f"Total questions: {len(questions)}")

def ask_qwen(question, choices, debug=False):
    """Ask Qwen a binary question."""

    # Simple direct prompt
    prompt = f"""Question: {question}
A. {choices[0]}
B. {choices[1]}

Answer (A or B):"""

    # Try without chat template first (simpler)
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            temperature=None,
            top_p=None,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only new tokens
    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    if debug:
        print(f"  Prompt: {prompt[:100]}...")
        print(f"  Raw response: '{response}'")
        print(f"  Response repr: {repr(response)}")

    # Parse - check full response for A or B
    response_upper = response.upper()

    # Method 1: Check if starts with A or B
    response_clean = response_upper.strip()
    if response_clean.startswith("A"):
        return 0, response
    elif response_clean.startswith("B"):
        return 1, response

    # Method 2: Check if contains A or B anywhere
    if "A" in response_upper and "B" not in response_upper:
        return 0, response
    elif "B" in response_upper and "A" not in response_upper:
        return 1, response

    # Method 3: Check for Yes/No (if choices are Yes/No)
    if "YES" in response_upper:
        if choices[0].upper() == "YES":
            return 0, response
        elif choices[1].upper() == "YES":
            return 1, response
    if "NO" in response_upper:
        if choices[0].upper() == "NO":
            return 0, response
        elif choices[1].upper() == "NO":
            return 1, response

    return -1, response

# ============================================================
# DEBUG: Test on first 5 questions
# ============================================================
print("\n" + "="*60)
print("DEBUG: First 5 questions")
print("="*60)

for i, q in enumerate(questions[:5]):
    print(f"\n--- Question {i+1} ---")
    print(f"Q: {q['question']}")
    print(f"Choices: A. {q['choices'][0]} | B. {q['choices'][1]}")
    print(f"Correct: {chr(65 + q['answer'])}")

    pred, raw = ask_qwen(q["question"], q["choices"], debug=True)

    if pred != -1:
        status = "✅" if pred == q["answer"] else "❌"
        print(f"Predicted: {chr(65 + pred)} {status}")
    else:
        print(f"Predicted: FAILED TO PARSE")

# ============================================================
# Full evaluation
# ============================================================
print("\n" + "="*60)
print("Running full evaluation...")
print("="*60)

correct = 0
total = 0
failed = 0
failed_responses = []
results_by_topic = {}

for q in tqdm(questions, desc="Evaluating"):
    topic = q["topic"]
    pred, raw = ask_qwen(q["question"], q["choices"])

    if topic not in results_by_topic:
        results_by_topic[topic] = {"correct": 0, "total": 0, "failed": 0}

    if pred == -1:
        failed += 1
        results_by_topic[topic]["failed"] += 1
        if len(failed_responses) < 5:  # Store first 5 failures
            failed_responses.append({"question": q["question"], "response": raw})
    else:
        total += 1
        results_by_topic[topic]["total"] += 1
        if pred == q["answer"]:
            correct += 1
            results_by_topic[topic]["correct"] += 1

# ============================================================
# Results
# ============================================================
print("\n" + "="*60)
print("RESULTS: Qwen3-0.6B (Binary Questions)")
print("="*60)

print(f"\nParsed: {total}/{len(questions)}")
print(f"Failed to parse: {failed}/{len(questions)}")

if total > 0:
    print(f"\nAccuracy: {correct}/{total} ({100*correct/total:.1f}%)")
    print("\nBy Topic:")
    for topic, res in results_by_topic.items():
        if res["total"] > 0:
            acc = 100 * res["correct"] / res["total"]
            print(f"  {topic}: {res['correct']}/{res['total']} ({acc:.1f}%) | Failed: {res['failed']}")
        else:
            print(f"  {topic}: No valid | Failed: {res['failed']}")
else:
    print("\n⚠️ No valid responses!")

if failed_responses:
    print("\n" + "="*60)
    print("FAILED RESPONSES (first 5):")
    print("="*60)
    for fr in failed_responses:
        print(f"  Q: {fr['question'][:50]}...")
        print(f"  R: '{fr['response']}'")
        print()

torch.cuda.empty_cache()


Device: cuda
Loading Qwen3-0.6B...


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model loaded
Total questions: 100

DEBUG: First 5 questions

--- Question 1 ---
Q: Are leaves usually purple?
Choices: A. True | B. False
Correct: B
  Prompt: Question: Are leaves usually purple?
A. True
B. False

Answer (A or B):...
  Raw response: ' A
Answer:
Answer: A

Explanation: Leaves are typically green, but some leaves may appear purple due to the presence of certain pigments. For example, some plants have a purple hue in their leaves, which is a result of the presence'
  Response repr: ' A\nAnswer:\nAnswer: A\n\nExplanation: Leaves are typically green, but some leaves may appear purple due to the presence of certain pigments. For example, some plants have a purple hue in their leaves, which is a result of the presence'
Predicted: A ❌

--- Question 2 ---
Q: Do birds have wings?
Choices: A. Yes | B. No
Correct: A
  Prompt: Question: Do birds have wings?
A. Yes
B. No

Answer (A or B):...
  Raw response: ' A
Answer:
Answer: A

Question: Do birds have wings?
A. Yes
B. No

Answer

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


RESULTS: Qwen3-0.6B (Binary Questions)

Parsed: 100/100
Failed to parse: 0/100

Accuracy: 79/100 (79.0%)

By Topic:
  colors: 38/50 (76.0%) | Failed: 0
  animals: 41/50 (82.0%) | Failed: 0


In [31]:
!pip install -q transformers accelerate torch

import json
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

# Load model
print("Loading Qwen3-0.6B...")
model_id = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
model.eval()
print("✅ Model loaded")

# Load questions
with open("easy_dataset.json", "r") as f:
    dataset = json.load(f)

questions = dataset["questions"]
print(f"Total questions: {len(questions)}")

# Evaluation function with debug
def ask_qwen(question, choices, debug=False):
    """Ask Qwen a multiple choice question."""

    choices_text = "\n".join([f"{chr(65+i)}. {c}" for i, c in enumerate(choices)])

    # Simple prompt + disable thinking with /no_think
    prompt = f"""Question: {question}

{choices_text}

Reply with only one letter: A, B, C, or D. /no_think"""

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    response_clean = response.strip().upper()

    if debug:
        print(f"  Raw response: '{response}'")
        print(f"  Cleaned: '{response_clean}'")

    # Parse answer - check anywhere in response
    for i, letter in enumerate(["A", "B", "C", "D"]):
        if letter in response_clean:
            return i, response

    return -1, response

# Test on first 3 questions with debug
print("\n" + "="*50)
print("DEBUG: First 3 questions")
print("="*50)

for i, q in enumerate(questions[:3]):
    print(f"\nQ{i+1}: {q['question']}")
    print(f"Choices: {q['choices']}")
    print(f"Correct: {chr(65 + q['answer'])}")
    pred, raw = ask_qwen(q["question"], q["choices"], debug=True)
    print(f"Predicted: {chr(65 + pred) if pred != -1 else 'FAILED'}")

# Run full evaluation
print("\n" + "="*50)
print("Running full evaluation...")
print("="*50)

correct = 0
total = 0
failed = 0
results_by_topic = {}

for q in tqdm(questions, desc="Evaluating"):
    topic = q["topic"]
    prediction, raw = ask_qwen(q["question"], q["choices"])

    if topic not in results_by_topic:
        results_by_topic[topic] = {"correct": 0, "total": 0, "failed": 0}

    if prediction == -1:
        failed += 1
        results_by_topic[topic]["failed"] += 1
    else:
        total += 1
        results_by_topic[topic]["total"] += 1
        if prediction == q["answer"]:
            correct += 1
            results_by_topic[topic]["correct"] += 1

# Results
print("\n" + "="*50)
print("RESULTS: Qwen3-0.6B")
print("="*50)

if total > 0:
    print(f"\nOverall: {correct}/{total} ({100*correct/total:.1f}%)")
else:
    print(f"\nOverall: No valid responses!")

print(f"Failed to parse: {failed}/{len(questions)}")

print("\nBy Topic:")
for topic, res in results_by_topic.items():
    if res["total"] > 0:
        acc = res["correct"] / res["total"]
        print(f"  {topic}: {res['correct']}/{res['total']} ({100*acc:.1f}%) | Failed: {res['failed']}")
    else:
        print(f"  {topic}: No valid responses | Failed: {res['failed']}")

# Clear GPU
torch.cuda.empty_cache()

Device: cuda
Loading Qwen3-0.6B...
✅ Model loaded
Total questions: 100

DEBUG: First 3 questions

Q1: Are leaves usually purple?
Choices: ['True', 'False']
Correct: B
  Raw response: '<think>

</think>

A. True'
  Cleaned: '<THINK>

</THINK>

A. TRUE'
Predicted: A

Q2: Do birds have wings?
Choices: ['Yes', 'No']
Correct: A
  Raw response: '<think>

</think>

A'
  Cleaned: '<THINK>

</THINK>

A'
Predicted: A

Q3: Is water blue?
Choices: ['True', 'False']
Correct: A
  Raw response: '<think>

</think>

A. True'
  Cleaned: '<THINK>

</THINK>

A. TRUE'
Predicted: A

Running full evaluation...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


RESULTS: Qwen3-0.6B

Overall: 77/100 (77.0%)
Failed to parse: 0/100

By Topic:
  colors: 36/50 (72.0%) | Failed: 0
  animals: 41/50 (82.0%) | Failed: 0


In [33]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

SEED = 42
DEVICE = "cuda"

# Load dataset
with open("easy_dataset.json", "r") as f:
    dataset = json.load(f)

questions = dataset["questions"]
print(f"Total questions: {len(questions)}")

# Embed
print("\nEmbedding questions...")
embedder = SentenceTransformer("thenlper/gte-base", device=DEVICE)
texts = [q["question"] for q in questions]
embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)
embeddings = embeddings.astype(np.float32)
print(f"Embeddings shape: {embeddings.shape}")

# Cluster (K=2 since we have 2 topics)
print("\nClustering...")
labels = KMeans(n_clusters=2, random_state=SEED, n_init=10).fit_predict(embeddings)
sil = silhouette_score(embeddings, labels, metric='cosine')
print(f"Silhouette: {sil:.4f}")

# Check cluster vs topic alignment
print("\n" + "="*50)
print("CLUSTER vs TOPIC")
print("="*50)
df = pd.DataFrame({
    "question": [q["question"] for q in questions],
    "topic": [q["topic"] for q in questions],
    "cluster": labels
})

for c in [0, 1]:
    cluster_df = df[df['cluster'] == c]
    topic_counts = cluster_df['topic'].value_counts()
    print(f"\nCluster {c} ({len(cluster_df)} questions):")
    for topic, count in topic_counts.items():
        pct = 100 * count / len(cluster_df)
        print(f"  {topic}: {count} ({pct:.0f}%)")

# Save centroids & embeddings
centroids = np.array([embeddings[labels == i].mean(axis=0) for i in range(2)])
np.save("centroids.npy", centroids)
np.save("embeddings.npy", embeddings)
np.save("labels.npy", labels)

# Save evaluation_data.json for profiling
evaluation_data = {
    "metadata": {
        "total_samples": len(questions),
        "topics": list(set(q["topic"] for q in questions)),
        "silhouette": float(sil),
        "n_clusters": 2
    },
    "clusters": {}
}

for cluster_id in [0, 1]:
    cluster_indices = [i for i in range(len(questions)) if labels[i] == cluster_id]
    cluster_samples = [questions[i] for i in cluster_indices]

    topics = [s["topic"] for s in cluster_samples]
    topic_counts = {t: topics.count(t) for t in set(topics)}

    evaluation_data["clusters"][str(cluster_id)] = {
        "n_samples": len(cluster_samples),
        "topic_distribution": topic_counts,
        "questions": [
            {
                "id": i,
                "question": s["question"],
                "choices": s["choices"],
                "answer": s["answer"],
                "topic": s["topic"]
            }
            for i, s in enumerate(cluster_samples)
        ]
    }

with open("evaluation_data.json", "w") as f:
    json.dump(evaluation_data, f, indent=2)

# Save clustered CSV
df.to_csv("clustered_data.csv", index=False)

print("\n" + "="*50)
print("SAVED FILES")
print("="*50)
print("  centroids.npy")
print("  embeddings.npy")
print("  labels.npy")
print("  evaluation_data.json")
print("  clustered_data.csv")
print(f"\n✅ Ready for model profiling!")


Total questions: 100

Embedding questions...


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: (100, 768)

Clustering...
Silhouette: 0.2086

CLUSTER vs TOPIC

Cluster 0 (48 questions):
  animals: 48 (100%)

Cluster 1 (52 questions):
  colors: 50 (96%)
  animals: 2 (4%)

SAVED FILES
  centroids.npy
  embeddings.npy
  labels.npy
  evaluation_data.json
  clustered_data.csv

✅ Ready for model profiling!


# MERGING CLUSTERS

In [38]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score

# ============================================================
# DEFINE PATHS FOR BOTH CLUSTERS
# ============================================================

# First cluster (MMLU)
MMLU_PATH = {
    "embeddings": "/content/clusters_mmlu/embeddings.npy",
    "labels": "/content/clusters_mmlu/labels.npy",
    "centroids": "/content/clusters_mmlu/centroids.npy",
    "data": "/content/clusters_mmlu/mmlu_clustered.csv"
}

# Second cluster (Easy dataset)
EASY_PATH = {
    "embeddings": "/content/clusters_easy/embeddings.npy",
    "labels": "/content/clusters_easy/labels.npy",
    "centroids": "/content/clusters_easy/centroids.npy",
    "data": "/content/clusters_easy/easy_dataset.json"
}

print("Paths configured:")
print(f"  MMLU: {MMLU_PATH['embeddings']}")
print(f"  Easy: {EASY_PATH['embeddings']}")

Paths configured:
  MMLU: /content/clusters_mmlu/embeddings.npy
  Easy: /content/clusters_easy/embeddings.npy


In [39]:
# ============================================================
# LOAD MMLU CLUSTER
# ============================================================
print("Loading MMLU cluster...")

mmlu_embeddings = np.load(MMLU_PATH["embeddings"])
mmlu_labels = np.load(MMLU_PATH["labels"])
mmlu_centroids = np.load(MMLU_PATH["centroids"])
mmlu_n_clusters = len(mmlu_centroids)

# Load MMLU questions
mmlu_df = pd.read_csv(MMLU_PATH["data"])
mmlu_questions = mmlu_df.to_dict('records')

print(f"  Embeddings: {mmlu_embeddings.shape}")
print(f"  Clusters: {mmlu_n_clusters}")
print(f"  Questions: {len(mmlu_questions)}")

# ============================================================
# LOAD EASY CLUSTER
# ============================================================
print("\nLoading Easy cluster...")

easy_embeddings = np.load(EASY_PATH["embeddings"])
easy_labels = np.load(EASY_PATH["labels"])
easy_centroids = np.load(EASY_PATH["centroids"])
easy_n_clusters = len(easy_centroids)

# Load Easy questions
with open(EASY_PATH["data"], "r") as f:
    easy_data = json.load(f)
easy_questions = easy_data["questions"]

print(f"  Embeddings: {easy_embeddings.shape}")
print(f"  Clusters: {easy_n_clusters}")
print(f"  Questions: {len(easy_questions)}")

Loading MMLU cluster...
  Embeddings: (1351, 768)
  Clusters: 7
  Questions: 1351

Loading Easy cluster...
  Embeddings: (100, 768)
  Clusters: 2
  Questions: 100


In [40]:
# ============================================================
# MERGE EMBEDDINGS
# ============================================================
print("Merging clusters...")

# Combine embeddings
combined_embeddings = np.vstack([mmlu_embeddings, easy_embeddings])
print(f"Combined embeddings: {combined_embeddings.shape}")

# Offset easy labels to avoid overlap with MMLU labels
# MMLU: 0, 1, 2, ... (mmlu_n_clusters - 1)
# Easy: mmlu_n_clusters, mmlu_n_clusters + 1, ...
easy_labels_offset = easy_labels + mmlu_n_clusters
combined_labels = np.concatenate([mmlu_labels, easy_labels_offset])
print(f"Combined labels: {combined_labels.shape}")

# Combine centroids
combined_centroids = np.vstack([mmlu_centroids, easy_centroids])
total_clusters = mmlu_n_clusters + easy_n_clusters
print(f"Total clusters: {total_clusters}")

# ============================================================
# COMBINE QUESTIONS
# ============================================================
combined_questions = []

# Add MMLU questions
for i, q in enumerate(mmlu_questions):
    combined_questions.append({
        "question": q.get("question", ""),
        "subject": q.get("subject", "mmlu"),
        "answer": q.get("answer", -1),
        "cluster": int(mmlu_labels[i]),
        "source": "mmlu"
    })

# Add Easy questions
for i, q in enumerate(easy_questions):
    combined_questions.append({
        "question": q.get("question", ""),
        "subject": q.get("topic", "easy"),
        "answer": q.get("answer", -1),
        "cluster": int(easy_labels_offset[i]),
        "source": "easy"
    })

print(f"Combined questions: {len(combined_questions)}")

Merging clusters...
Combined embeddings: (1451, 768)
Combined labels: (1451,)
Total clusters: 9
Combined questions: 1451


In [44]:
# ============================================================
# CALCULATE SILHOUETTE SCORE (COMBINED: MMLU + EASY)
# ============================================================
print("\n" + "="*60)
print("SILHOUETTE SCORE (Combined: MMLU + Easy)")
print("="*60)

# Filter out noise (-1) from the combined set
# MMLU contains noise (-1), Easy does not.
mask = combined_labels != -1
valid_combined_embeddings = combined_embeddings[mask]
valid_combined_labels = combined_labels[mask]

print(f"Total combined samples: {len(combined_labels)}")
print(f"Valid samples (excluding noise): {len(valid_combined_labels)}")
print(f"Number of clusters: {len(np.unique(valid_combined_labels))}")

if len(valid_combined_labels) > 0 and len(np.unique(valid_combined_labels)) > 1:
    # Calculate silhouette score for the combined valid data
    combined_sil = silhouette_score(valid_combined_embeddings, valid_combined_labels, metric='cosine')

    print(f"\nSilhouette Score (Combined): {combined_sil:.4f}")

    # Verdict
    print("\n" + "="*60)
    if combined_sil >= 0.2:
        print(f"✅ Combined silhouette {combined_sil:.4f} (Reasonable separation)")
    elif combined_sil >= 0.1:
        print(f"⚠️ Combined silhouette {combined_sil:.4f} (Weak separation)")
    else:
        print(f"❌ Combined silhouette {combined_sil:.4f} (Significant overlap)")
else:
    print("\n⚠️ Not enough clusters/samples to calculate silhouette score.")


SILHOUETTE SCORE (Combined: MMLU + Easy)
Total combined samples: 1451
Valid samples (excluding noise): 367
Number of clusters: 9

Silhouette Score (Combined): 0.2274

✅ Combined silhouette 0.2274 (Reasonable separation)


In [45]:
import numpy as np
import json

# 1. Save Numpy Arrays
print("Saving numpy arrays...")
np.save("centroids.npy", combined_centroids)
np.save("embeddings.npy", combined_embeddings)
np.save("labels.npy", combined_labels)

# 2. Construct evaluation_data.json
print("Constructing evaluation_data.json...")

evaluation_data = {
    "metadata": {
        "total_samples": len(combined_labels),
        "n_clusters": int(total_clusters),
        # Use the silhouette score if it was calculated in the previous step
        "silhouette": float(combined_sil) if 'combined_sil' in locals() else None
    },
    "clusters": {}
}

# Group questions by cluster
cluster_map = {}

for q in combined_questions:
    c_id = str(q['cluster'])
    if c_id not in cluster_map:
        cluster_map[c_id] = []
    cluster_map[c_id].append(q)

# Populate cluster details
for c_id, q_list in cluster_map.items():
    # Calculate subject distribution
    subjects = [q['subject'] for q in q_list]
    subject_counts = {s: subjects.count(s) for s in set(subjects)}

    evaluation_data["clusters"][c_id] = {
        "n_samples": len(q_list),
        "subject_distribution": subject_counts,
        "questions": q_list
    }

# 3. Save JSON
with open("evaluation_data.json", "w") as f:
    json.dump(evaluation_data, f, indent=2)

print("\n✅ All files saved successfully:")
print("  - centroids.npy")
print("  - embeddings.npy")
print("  - labels.npy")
print("  - evaluation_data.json")

Saving numpy arrays...
Constructing evaluation_data.json...

✅ All files saved successfully:
  - centroids.npy
  - embeddings.npy
  - labels.npy
  - evaluation_data.json


In [46]:
import json
import os
import pandas as pd

# Paths to source files
combined_path = "evaluation_data.json"

# MMLU sources
mmlu_json_path = "/content/clusters_mmlu/evaluation_data.json"
mmlu_csv_path = "/content/clusters_mmlu/mmlu_clustered.csv"

# Easy sources
easy_json_path = "/content/clusters_easy/evaluation_data.json"
easy_dataset_path = "/content/easy_dataset.json"

print("Loading combined data...")
with open(combined_path, 'r') as f:
    combined_data = json.load(f)

# Lookup dictionary: (source, question_text) -> choices_list
question_choices_map = {}

# =============================================================================
# 1. LOAD MMLU CHOICES
# =============================================================================
if os.path.exists(mmlu_json_path):
    print(f"Loading MMLU choices from: {mmlu_json_path}")
    with open(mmlu_json_path, 'r') as f:
        data = json.load(f)
        for cid, cdata in data.get("clusters", {}).items():
            for q in cdata.get("questions", []):
                if "choices" in q:
                    question_choices_map[("mmlu", q["question"])] = q["choices"]
elif os.path.exists(mmlu_csv_path):
    print(f"Loading MMLU choices from: {mmlu_csv_path}")
    df = pd.read_csv(mmlu_csv_path)
    # Assuming standard MMLU columns A, B, C, D exist
    for _, row in df.iterrows():
        q_text = row.get("question", "")
        choices = [
            str(row.get("A", "")),
            str(row.get("B", "")),
            str(row.get("C", "")),
            str(row.get("D", ""))
        ]
        question_choices_map[("mmlu", q_text)] = choices
else:
    print("⚠️ No MMLU source found!")

# =============================================================================
# 2. LOAD EASY CHOICES
# =============================================================================
if os.path.exists(easy_json_path):
    print(f"Loading Easy choices from: {easy_json_path}")
    with open(easy_json_path, 'r') as f:
        data = json.load(f)
        for cid, cdata in data.get("clusters", {}).items():
            for q in cdata.get("questions", []):
                if "choices" in q:
                    question_choices_map[("easy", q["question"])] = q["choices"]
elif os.path.exists(easy_dataset_path):
    print(f"Loading Easy choices from: {easy_dataset_path}")
    with open(easy_dataset_path, 'r') as f:
        data = json.load(f)
        # easy_dataset.json usually has a list under "questions"
        qs = data.get("questions", [])
        for q in qs:
            if "choices" in q:
                question_choices_map[("easy", q["question"])] = q["choices"]
else:
    print("⚠️ No Easy source found!")

# =============================================================================
# 3. UPDATE COMBINED DATA
# =============================================================================
print("\nUpdating combined data...")
updated_count = 0
missing_count = 0

for cid, cdata in combined_data["clusters"].items():
    for q in cdata["questions"]:
        source = q.get("source", "unknown")
        q_text = q.get("question")

        key = (source, q_text)

        if key in question_choices_map:
            q["choices"] = question_choices_map[key]
            updated_count += 1
        else:
            missing_count += 1
            # Initialize empty if not found
            if "choices" not in q:
                q["choices"] = []

# Save back
with open(combined_path, 'w') as f:
    json.dump(combined_data, f, indent=2)

print(f"✅ Updated {updated_count} questions with choices.")
if missing_count > 0:
    print(f"⚠️ Missing choices for {missing_count} questions.")
else:
    print("✅ All questions have choices.")

# Verify one random entry
import random
if combined_data["clusters"]:
    rand_cid = random.choice(list(combined_data["clusters"].keys()))
    if combined_data["clusters"][rand_cid]["questions"]:
        q_ex = combined_data["clusters"][rand_cid]["questions"][0]
        print(f"\nExample (Cluster {rand_cid}):")
        print(f"Q: {q_ex.get('question')}")
        print(f"Choices: {q_ex.get('choices')}")

Loading combined data...
Loading MMLU choices from: /content/clusters_mmlu/evaluation_data.json
Loading Easy choices from: /content/clusters_easy/evaluation_data.json

Updating combined data...
✅ Updated 367 questions with choices.
⚠️ Missing choices for 1084 questions.

Example (Cluster 8):
Q: Are leaves usually purple?
Choices: ['True', 'False']


In [47]:
import json
import numpy as np

print("Removing noise cluster (-1)...")

# 1. Update evaluation_data.json
json_path = "evaluation_data.json"
with open(json_path, 'r') as f:
    data = json.load(f)

initial_count = data["metadata"]["total_samples"]

if "-1" in data["clusters"]:
    noise_samples = data["clusters"].pop("-1")
    n_removed = noise_samples["n_samples"]
    print(f"  Removed cluster -1 containing {n_removed} samples.")

    # Update metadata
    data["metadata"]["total_samples"] -= n_removed

    with open(json_path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"  ✅ Updated {json_path} (Total samples: {data['metadata']['total_samples']})")
else:
    print("  ℹ️ Cluster -1 not found in JSON.")

# 2. Update Numpy Arrays
print("\nUpdating numpy arrays...")
try:
    embeddings = np.load("embeddings.npy")
    labels = np.load("labels.npy")

    # Create mask for non-noise labels
    mask = labels != -1

    if np.sum(~mask) > 0:
        filtered_embeddings = embeddings[mask]
        filtered_labels = labels[mask]

        np.save("embeddings.npy", filtered_embeddings)
        np.save("labels.npy", filtered_labels)

        print(f"  Original shape: {embeddings.shape}")
        print(f"  Filtered shape: {filtered_embeddings.shape}")
        print("  ✅ Updated embeddings.npy and labels.npy")
    else:
        print("  ℹ️ No noise labels (-1) found in numpy arrays.")

except Exception as e:
    print(f"  ❌ Error updating numpy arrays: {e}")

print("\nDone.")

Removing noise cluster (-1)...
  Removed cluster -1 containing 1084 samples.
  ✅ Updated evaluation_data.json (Total samples: 367)

Updating numpy arrays...
  Original shape: (1451, 768)
  Filtered shape: (367, 768)
  ✅ Updated embeddings.npy and labels.npy

Done.
